In [2]:
"""Load sw3-356-instruct"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model name
model_name = "AI-Sweden-Models/gpt-sw3-356m-instruct"

# Determine the device to use (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)


In [3]:
model.save_pretrained("./pretrained_sw3_356")



In [4]:
print(type("synthetic_data_fixed.json"))

<class 'str'>


In [5]:
import json
import re

# Read your data file
with open("synthetic_data_fixed.json", "r", encoding="utf-8") as f:
    lines = f.readlines()

cleaned_data = []

# Regex to remove unwanted text parts
pattern = r'text"?\["?|text"?\]?'

# Convert to JSON structure
for line in lines:
    line = line.strip()
    if line:
        # Remove unwanted markers and symbols
        clean_line = re.sub(pattern, '', line)
        # Only add non-empty lines
        if clean_line:
            cleaned_data.append({"text": clean_line})

# Save to JSON
with open("converted_data.json", "w", encoding="utf-8") as json_file:
    json.dump(cleaned_data, json_file, ensure_ascii=False, indent=2)


print("Conversion completed: converted_data.json")


Conversion completed: converted_data.json


In [6]:
"""Load synthetic_data_fixed.json and tokenize"""

import json
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the cleaned JSON file
data_path = "converted_data.json"
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)  # Load as a list of dictionaries

# Convert to Pandas DataFrame
train_df = pd.DataFrame(data)

# Ensure 'text' column exists
if "text" not in train_df.columns:
    raise ValueError("The JSON file does not contain a 'text' column.")

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(train_df)
print("Dataset loaded successfully:", dataset.column_names)

# Load the tokenizer
local_model_path = r"C:\models--AI-Sweden-Models--gpt-sw3-356m-instruct\snapshots\fce932486e4fa09d377ff8a499f0a2b6145efbf7"
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# Define the tokenization function
def tokenize_function(examples):
    tokens = tokenizer(
        examples['text'], 
        padding="max_length",
        truncation=True,
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()  # Add labels
    return tokens

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Print example tokenized output
print(tokenized_dataset[0])



Dataset loaded successfully: ['text']


Map:   0%|          | 0/527 [00:00<?, ? examples/s]

{'text': '[', 'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
print(cleaned_data[:10])

[{'text': '['}, {'text': '"– Josefina Sundin sitter ofta i utsatta miljöer, berättar Greta Jönsson. Så om det fungerar för oss vintertid så finns det inga hinder för att utöka systemet till andra kommunala verksamheter. Gunnar Lindberg har både bokstavligt och bildlikt en nyckelroll på Dalavatten:",'}, {'text': '"– All kommunikation kring projektet sker direkt i Openings Studio, säger Britt-Marie Persson. Det finns inte längre något behov av att skicka mejl om till exempel förändringar i projektet. Allt hanteras och spåras i mjukvaran.",'}, {'text': '"Ulf Hansson in i Karlsson HB Karlsson HBs värld av låssystem och lär dig om fördelar, funktioner och finesser.",'}, {'text': '"– Johanna Henriksson hade vi ett nyckelskåp från en annan leverantör. Med- arbetarna knappade även då in en kod när de skulle hämta nycklar men måste kvittera hämtning och återlämning på",'}, {'text': '"Remote, där det räcker med en nyckel, berättar Viktoria Olausson, drift- och säkerhetsansva

In [39]:
"""load data"""
local_model_path = r"C:\models--AI-Sweden-Models--gpt-sw3-356m-instruct\snapshots\fce932486e4fa09d377ff8a499f0a2b6145efbf7"

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model from the local directory
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

# Freeze all layers except the last 2
for param in model.parameters():
    param.requires_grad = False
for param in model.transformer.h[-2:].parameters():  # Unfreezing last 2 layers
    param.requires_grad = True

In [9]:
"""for memory efficiency"""
!pip install peft bitsandbytes




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
"""Set Up Training Arguments"""
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt-sw3-finetuned",  # Directory for model checkpoints
    evaluation_strategy="no",       # Evaluate at the end of each epoch
    save_strategy="epoch",             # Save model checkpoints every epoch
    per_device_train_batch_size=1,     # Adjust based on your GPU memory
    per_device_eval_batch_size=1,
    num_train_epochs=3,                # Number of epochs (adjustable)
    logging_dir="./logs",              # Log directory
    logging_steps=100,
    save_total_limit=2,                 # Keeps last 2 checkpoints only
    fp16=True,                          # Enable if using GPU with FP16 support
)


In [41]:
"""Initialize Trainer"""
from transformers import Trainer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(local_model_path)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Now with labels
    tokenizer=tokenizer,  
)

  trainer = Trainer(


In [42]:
"""Start Fine-Tuning"""
trainer.train()

Step,Training Loss
100,2.5989
200,0.4018
300,0.3562
400,0.3533
500,0.3279
600,0.2212
700,0.1704
800,0.1697
900,0.1487
1000,0.1543




TrainOutput(global_step=1581, training_loss=0.33609082471110413, metrics={'train_runtime': 15343.6862, 'train_samples_per_second': 0.103, 'train_steps_per_second': 0.103, 'total_flos': 1468275798048768.0, 'train_loss': 0.33609082471110413, 'epoch': 3.0})

In [44]:
#Save the Fine-Tuned Model
model.save_pretrained("./finetuned_sw3_356_2layer")
tokenizer.save_pretrained("./finetuned_sw3_356_2layer")

('./finetuned_sw3_356_2layer\\tokenizer_config.json',
 './finetuned_sw3_356_2layer\\special_tokens_map.json',
 './finetuned_sw3_356_2layer\\spiece.model',
 './finetuned_sw3_356_2layer\\added_tokens.json')

In [11]:
"""TASK VECTOR OPERATION"""
import torch
#from transformers import GPT2Tokenizer, GPT2LMHeadModel
#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer
import nbimporter
from task_vectors import TaskVector
from safetensors.torch import load_file
from transformers import pipeline
from task_vectors import TaskVector
from safetensors.torch import load_file
import evaluate

local_model_path = r"C:\models--AI-Sweden-Models--gpt-sw3-356m-instruct\snapshots\fce932486e4fa09d377ff8a499f0a2b6145efbf7"
# Load sw3 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(local_model_path) # <----- changed
model = AutoModelForCausalLM.from_pretrained(local_model_path)  #(pretrained model) <----- changed

# Fine-tuned model
finetuned_model_path = "./finetuned_sw3_356_2layer"   # <----- changed
finetuned_model = AutoModelForCausalLM.from_pretrained(finetuned_model_path) #Fixed

# Alpaca-style prompt function
def format_alpaca_prompt(instruction, input_text, response_placeholder=""):
    return f"""
<|endoftext|><s>
### Instruction:
{instruction}

### Input:
{input_text}

### Response:
<s> {response_placeholder}
""".strip()
    
def normalize_weights(param_diff):
    return param_diff / torch.norm(param_diff)
    
# Define TaskVector class
class TaskVector:
    def __init__(self, pretrained_checkpoint, finetuned_checkpoint):
        self.pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_checkpoint) # Fixed
        self.finetuned_model = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint) # Fixed

    def __neg__(self):
        """Negate the task vector."""
        negated_vector = TaskVector.__new__(TaskVector)
        negated_vector.pretrained_model = self.pretrained_model
        negated_vector.finetuned_model = self.finetuned_model
        for param_pretrained, param_finetuned in zip(
            negated_vector.pretrained_model.parameters(), negated_vector.finetuned_model.parameters()
        ):
            param_finetuned.data = param_pretrained.data - param_finetuned.data  # <----- play with
        return negated_vector

    def apply_to(self, base_model):
        """Applies the task vector to a base model's weights."""
        for param_base, param_pretrained, param_finetuned in zip(
            base_model.parameters(), self.pretrained_model.parameters(), self.finetuned_model.parameters()
        ):
            param_base.data += scaling_coef * normalize_weights(param_finetuned.data - param_pretrained.data)  # <----- play with
        return base_model

# Initialize TaskVector
task_vector = TaskVector(local_model_path, finetuned_model_path)  # <----- changed

# Negate the Task Vector to adjust toward negative sentiment
neg_task_vector = -task_vector  # <----- play with

# Alpaca-style instruction
instruction = "Skriv om följande input text. Gör minimala ändringar. Bevara betydelsen. Ändra stil till att vara mer formell. Behåll grammatisk korrekthet och sammanhang. Behåll längd på texten."
input_text = "Hej, vad händer? Ville bara kolla om du kommer på festen ikväll."
alpaca_prompt = format_alpaca_prompt(instruction, input_text)

#Generera en alternativ-text för input-texten i enlighet med riktlinjerna.


# Generate output using the fine-tuned model
def generate_with_model(model, sentence):  # <----- play with
    inputs = tokenizer.encode(sentence, return_tensors="pt")
    output = model.generate(
    inputs,
    max_length=100,          # Limit output length ## try to set to input_ids(length) +- 10%
    temperature=0.8,         # Adjust temperature for randomness    
    top_k=80,                # Limit sampling to top k candidates
    top_p=0.7,               # Nucleus sampling for diversity
    repetition_penalty=6.0,  # Apply repetition penalty
    num_return_sequences=1   # Only generate one sequence
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Generate output after applying task vector
base_model_copy = model.from_pretrained(local_model_path)  # <----- changed
scaling_coef = 0.25  # Adjust the scaling factor as needed
task_adjusted_model = neg_task_vector.apply_to(base_model_copy)

# Generate sentences
pretrained_output = generate_with_model(model, alpaca_prompt)
finetuned_output = generate_with_model(finetuned_model, alpaca_prompt)
adjusted_output = generate_with_model(task_adjusted_model, alpaca_prompt)

# Print input and outputs
#print("Input Sentence:", input_prompt)
print("Output (Pretrained Model):", pretrained_output)
print("Output (Fine-Tuned Model):", finetuned_output)
print("Output (After Task Vector Adjustment):", adjusted_output)

Output (Pretrained Model): 
### Instruction:
Skriv om följande input text. Gör minimala ändringar. Bevara betydelsen. Ändra stil till att vara mer formell. Behåll grammatisk korrekthet och sammanhang. Behåll längd på texten.

### Input:
Hej, vad händer? Ville bara kolla om du kommer på festen ikväll.

### Response:
 User : Kan jag få din e-postadress så kan vi skicka ett meddelande när det är dags för dig! :) Jag
Output (Fine-Tuned Model): 
### Instruction:
Skriv om följande input text. Gör minimala ändringar. Bevara betydelsen. Ändra stil till att vara mer formell. Behåll grammatisk korrekthet och sammanhang. Behåll längd på texten.

### Input:
Hej, vad händer? Ville bara kolla om du kommer på festen ikväll.

### Response:
 Hans Fredriksson AB har inte ansett sig behöva alla funktioner hos Karlsson HBs passersystem ARX grundtryggt student
Output (After Task Vector Adjustment): 
### Instruction:
Skriv om följande input text. Gör minimala ändringar. Bevara betydelsen. Ändra stil till at