In [1]:
"""Load sw3-356-instruct"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model name
model_name = "AI-Sweden-Models/gpt-sw3-356m-instruct"

# Determine the device to use (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)


In [2]:
model.save_pretrained("./pretrained_sw3_356")



In [39]:
print(type("synthetic_data_fixed.json"))

<class 'str'>


In [43]:
import json
import re

# Read your data file
with open("synthetic_data_fixed.json", "r", encoding="utf-8") as f:
    lines = f.readlines()

cleaned_data = []

# Regex to remove unwanted text parts
pattern = r'text"?\["?|text"?\]?'

# Convert to JSON structure
for line in lines:
    line = line.strip()
    if line:
        # Remove unwanted markers and symbols
        clean_line = re.sub(pattern, '', line)
        # Only add non-empty lines
        if clean_line:
            cleaned_data.append({"text": clean_line})

# Save to JSON
with open("converted_data.json", "w", encoding="utf-8") as json_file:
    json.dump(cleaned_data, json_file, ensure_ascii=False, indent=2)


print("Conversion completed: converted_data.json")


Conversion completed: converted_data.json


In [49]:
"""Load synthetic_data_fixed.json and tokenize"""

import json
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the cleaned JSON file
data_path = "converted_data.json"
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)  # Load as a list of dictionaries

# Convert to Pandas DataFrame
train_df = pd.DataFrame(data)

# Ensure 'text' column exists
if "text" not in train_df.columns:
    raise ValueError("The JSON file does not contain a 'text' column.")

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(train_df)
print("Dataset loaded successfully:", dataset.column_names)

# Load the tokenizer
local_model_path = r"C:\models--AI-Sweden-Models--gpt-sw3-356m-instruct\snapshots\fce932486e4fa09d377ff8a499f0a2b6145efbf7"
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# Define the tokenization function
def tokenize_function(examples):
    tokens = tokenizer(
        examples['text'], 
        padding="max_length",
        truncation=True,
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()  # Add labels
    return tokens

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Print example tokenized output
print(tokenized_dataset[0])



Dataset loaded successfully: ['text']


Map:   0%|          | 0/527 [00:00<?, ? examples/s]

{'text': '[', 'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [31]:
print(cleaned_data[:10])

[{'text': '['}, {'text': '"– Josefina Sundin sitter ofta i utsatta miljöer, berättar Greta Jönsson. Så om det fungerar för oss vintertid så finns det inga hinder för att utöka systemet till andra kommunala verksamheter. Gunnar Lindberg har både bokstavligt och bildlikt en nyckelroll på Dalavatten:",'}, {'text': '"– All kommunikation kring projektet sker direkt i Openings Studio, säger Britt-Marie Persson. Det finns inte längre något behov av att skicka mejl om till exempel förändringar i projektet. Allt hanteras och spåras i mjukvaran.",'}, {'text': '"Ulf Hansson in i Karlsson HB Karlsson HBs värld av låssystem och lär dig om fördelar, funktioner och finesser.",'}, {'text': '"– Johanna Henriksson hade vi ett nyckelskåp från en annan leverantör. Med- arbetarna knappade även då in en kod när de skulle hämta nycklar men måste kvittera hämtning och återlämning på",'}, {'text': '"Remote, där det räcker med en nyckel, berättar Viktoria Olausson, drift- och säkerhetsansva

In [12]:
"""load data"""
local_model_path = r"C:\models--AI-Sweden-Models--gpt-sw3-356m-instruct\snapshots\fce932486e4fa09d377ff8a499f0a2b6145efbf7"

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model from the local directory
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

In [13]:
"""for memory efficiency"""
!pip install peft bitsandbytes




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [46]:
"""Set Up Training Arguments"""
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt-sw3-finetuned",  # Directory for model checkpoints
    evaluation_strategy="no",       # Evaluate at the end of each epoch
    save_strategy="epoch",             # Save model checkpoints every epoch
    per_device_train_batch_size=1,     # Adjust based on your GPU memory
    per_device_eval_batch_size=1,
    num_train_epochs=3,                # Number of epochs (adjustable)
    logging_dir="./logs",              # Log directory
    logging_steps=100,
    save_total_limit=2,                 # Keeps last 2 checkpoints only
    fp16=True,                          # Enable if using GPU with FP16 support
)




In [50]:
"""Initialize Trainer"""
from transformers import Trainer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(local_model_path)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Now with labels
    tokenizer=tokenizer,  
)

  trainer = Trainer(


In [51]:
"""Start Fine-Tuning"""
trainer.train()

Step,Training Loss
100,2.5925
200,0.4986
300,0.3532
400,0.3454
500,0.3204
600,0.2191
700,0.1733
800,0.1623
900,0.1429
1000,0.1545




TrainOutput(global_step=1581, training_loss=0.34095062114708335, metrics={'train_runtime': 17840.0583, 'train_samples_per_second': 0.089, 'train_steps_per_second': 0.089, 'total_flos': 1468275798048768.0, 'train_loss': 0.34095062114708335, 'epoch': 3.0})

In [52]:
#Save the Fine-Tuned Model
model.save_pretrained("./finetuned_sw3_356")
tokenizer.save_pretrained("./finetuned_sw3_356")

('./finetuned_sw3_356\\tokenizer_config.json',
 './finetuned_sw3_356\\special_tokens_map.json',
 './finetuned_sw3_356\\spiece.model',
 './finetuned_sw3_356\\added_tokens.json')

In [None]:
"""TASK VECTOR OPERATION"""
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer
import nbimporter
from task_vectors import TaskVector
from safetensors.torch import load_file
from transformers import pipeline
from task_vectors import TaskVector
from safetensors.torch import load_file

# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2") # <----- change
model = GPT2LMHeadModel.from_pretrained("gpt2")  # <----- change

# Fine-tuned model (assume you have a fine-tuned model)
finetuned_model_path = "./finetuned_gpt2"   # <----- change
finetuned_model = GPT2LMHeadModel.from_pretrained(finetuned_model_path)

def normalize_weights(param_diff):
    return param_diff / torch.norm(param_diff)
    
# Define TaskVector class
class TaskVector:
    def __init__(self, pretrained_checkpoint, finetuned_checkpoint):
        self.pretrained_model = GPT2LMHeadModel.from_pretrained(pretrained_checkpoint)
        self.finetuned_model = GPT2LMHeadModel.from_pretrained(finetuned_checkpoint)

    def __neg__(self):
        """Negate the task vector."""
        negated_vector = TaskVector.__new__(TaskVector)
        negated_vector.pretrained_model = self.pretrained_model
        negated_vector.finetuned_model = self.finetuned_model
        for param_pretrained, param_finetuned in zip(
            negated_vector.pretrained_model.parameters(), negated_vector.finetuned_model.parameters()
        ):
            param_finetuned.data = param_pretrained.data - param_finetuned.data  # <----- play with
        return negated_vector

    def apply_to(self, base_model):
        """Applies the task vector to a base model's weights."""
        for param_base, param_pretrained, param_finetuned in zip(
            base_model.parameters(), self.pretrained_model.parameters(), self.finetuned_model.parameters()
        ):
            param_base.data += scaling_coef * normalize_weights(param_finetuned.data - param_pretrained.data)  # <----- play with
        return base_model

# Initialize TaskVector
task_vector = TaskVector("gpt2", finetuned_model_path)  # <----- change

# Negate the Task Vector to adjust toward negative sentiment
neg_task_vector = -task_vector  # <----- play with

# Sentence to transfer
input_sentence = "This is a simple sentence and not that long."  # <----- change
"""
prompt should look like this:
[Instruction]: Provide a clear and concise task or question.
[Context]: Include any necessary background information or examples.
[Format]: Specify the desired response format (bullet points, paragraphs, JSON, etc.).
[Constraints]: (Optional) Set any rules, word limits, or tone preferences.
"""

# Generate output using the fine-tuned model
def generate_with_model(model, sentence):  # <----- play with
    inputs = tokenizer.encode(sentence, return_tensors="pt")
    output = model.generate(
    inputs,
    max_length=70,          # Limit output length
    temperature=0.7,         # Adjust temperature for randomness
    top_k=85,                # Limit sampling to top k candidates
    top_p=0.9,               # Nucleus sampling for diversity
    repetition_penalty=3.0,  # Apply repetition penalty
    num_return_sequences=1   # Only generate one sequence
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Generate output after applying task vector
base_model_copy = GPT2LMHeadModel.from_pretrained("gpt2")  # <----- change
scaling_coef = 0.8  # Adjust the scaling factor as needed
task_adjusted_model = neg_task_vector.apply_to(base_model_copy)

# Generate sentences
finetuned_output = generate_with_model(finetuned_model, input_sentence)
adjusted_output = generate_with_model(task_adjusted_model, input_sentence)

# Print input and outputs
print("Input Sentence:", input_sentence)
print("Output (Fine-Tuned Model):", finetuned_output)
print("Output (After Task Vector Adjustment):", adjusted_output)