<a href="https://colab.research.google.com/github/Saakshitha/Secpen_model/blob/main/finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Step 1: Install dependencies
!pip install -q --upgrade --no-cache-dir transformers datasets accelerate bitsandbytes sentencepiece huggingface_hub

# ✅ Step 2: Imports
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from huggingface_hub import login
from google.colab import userdata
import torch

# ✅ Step 3: Login to Hugging Face Hub
# The access token is fetched from the Colab secret named "token_hf" instead of being
# written into the notebook.
hf_token = userdata.get("token_hf")
login(hf_token)

# ✅ Step 4: Load your dataset
from google.colab import files

# Opens the Colab upload widget; the returned mapping is keyed by uploaded filename.
uploaded = files.upload()
if not uploaded:
    # Without this guard a cancelled upload surfaces as a bare StopIteration below.
    raise ValueError("No file was uploaded; upload a CSV with 'Text' and 'Sentiment' columns.")

# Only the first uploaded file is read.
df = pd.read_csv(next(iter(uploaded)))

# Fail early, with a clear message, if the columns used by the prompt formatter are absent.
missing_columns = {"Text", "Sentiment"} - set(df.columns)
if missing_columns:
    raise ValueError(f"Uploaded CSV is missing required column(s): {sorted(missing_columns)}")

# ✅ Step 5: Format dataset for instruction tuning
def format_instruction(row):
    """Build one instruction-tuning example from a dataset row.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing a ``Text``
            entry and a string ``Sentiment`` entry.

    Returns:
        The instruction wrapped in ``<s>[INST] ... [/INST]``, followed by the
        sentiment label with surrounding whitespace removed and a closing ``</s>``.
    """
    text = row['Text']
    label = row['Sentiment'].strip()
    return f"<s>[INST] Detect the emotion in the following text. Output only the emotion name.\n\nText: \"{text}\"\n\nOutput: [/INST] {label}</s>"

# Render every row into its training prompt and store the result in a "formatted" column.
formatted_prompts = df.apply(format_instruction, axis=1)
df["formatted"] = formatted_prompts

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m229.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.8/512.8 kB[0m [31m374.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m350.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m326.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m341.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m341.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m233.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

Saving df_balanced.csv to df_balanced.csv


In [None]:
# ✅ Step 6: Convert to Hugging Face dataset and tokenize
# Checkpoint whose tokenizer is used for the training prompts.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # 🔧 Define before using

# Only the rendered prompt column is carried over into the Hugging Face dataset.
prompt_frame = df[["formatted"]]
dataset = Dataset.from_pandas(prompt_frame)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# 🔧 Fix padding token: fall back to EOS when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    """Tokenize the ``formatted`` prompts of a batch.

    Each sequence is truncated and padded to exactly 512 tokens.

    Args:
        example: Batch (or single example) exposing a ``formatted`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those prompts.
    """
    # NOTE(review): padding every example to 512 tokens is costly for short texts;
    # dynamic padding in the collator may be preferable — confirm before changing.
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(example["formatted"], **encode_options)

# Tokenize in batches, then hold out 10% for evaluation (seeded so the split is repeatable).
tokenized_dataset = dataset.map(tokenize, batched=True)
dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_data, eval_data = dataset["train"], dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/732 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantization with double quantization and float16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Reload the tokenizer and unconditionally use EOS as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Weight placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized model for k-bit training before attaching the adapters.
model = prepare_model_for_kbit_training(model)

# LoRA settings: rank 16, alpha 32, applied to the q_proj and v_proj modules only.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
lora_config = LoraConfig(**lora_settings)

model = get_peft_model(model, lora_config)
# Report trainable vs. total parameter counts.
model.print_trainable_parameters()


trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940


In [None]:
def format_instruction(row):
    """Render a dataset row as a single ``[INST]`` training example.

    Reads ``row['Text']`` and ``row['Sentiment']`` (the latter with surrounding
    whitespace stripped) and returns the full prompt-plus-answer string.
    """
    template = "<s>[INST] Detect the emotion in the following text. Output only the emotion name.\n\nText: \"{text}\"\n\nOutput: [/INST] {label}</s>"
    return template.format(text=row['Text'], label=row['Sentiment'].strip())

# Rebuild the prompt column with the formatter defined just above.
formatted_prompts = df.apply(format_instruction, axis=1)
df["formatted"] = formatted_prompts

from datasets import Dataset
# Fresh Hugging Face dataset holding only the prompt column.
prompt_frame = df[["formatted"]]
dataset = Dataset.from_pandas(prompt_frame)

def tokenize(example):
    """Tokenize ``example["formatted"]`` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    """
    prompts = example["formatted"]
    return tokenizer(
        prompts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )
def format_and_tokenize(example):
    """Tokenize the already formatted prompts in ``example["formatted"]``.

    Despite the name, no formatting happens here: the ``formatted`` entry is passed
    to the module-level tokenizer, truncated and padded to 512 tokens.
    """
    fixed_length_options = dict(truncation=True, padding="max_length", max_length=512)
    return tokenizer(example["formatted"], **fixed_length_options)


# Tokenize once, then split. The splits are taken from the dataset built in this cell;
# previously the split computed here was discarded and `train_data` / `eval_data` left
# over from an earlier cell were re-tokenized instead, so this cell relied on hidden
# kernel state. The fixed seed keeps the 90/10 split reproducible across runs.
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_data = dataset["train"]
eval_data = dataset["test"]


Map:   0%|          | 0/732 [00:00<?, ? examples/s]

Map:   0%|          | 0/658 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

In [None]:
# Have both splits return PyTorch tensors when indexed.
for _split in (train_data, eval_data):
    _split.set_format("torch")


In [None]:
!pip install -q trl>=0.7.4

In [None]:
import trl

# Show which TRL release is actually importable in this kernel.
trl_version = trl.__version__
print(trl_version)


0.18.1


In [None]:
from trl import SFTTrainer, SFTConfig

# ✅ Training configuration (compatible with latest SFTConfig)
sft_config = SFTConfig(
    output_dir="./mistral-lora-emotion",  # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # 1 x 4 => four examples per optimizer step per device
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=10,
    learning_rate=2e-5,
    # `max_seq_length` is deprecated in recent TRL releases (including the 0.18.x line
    # this notebook was run with) in favour of `max_length`; use the current name so the
    # cell keeps working once the old alias is removed.
    max_length=512,
    fp16=True,
    report_to=[],  # no external experiment trackers
)


In [None]:
# Assemble the trainer from the LoRA-wrapped model, the SFT configuration and both splits.
trainer_kwargs = dict(
    model=model,
    args=sft_config,
    train_dataset=train_data,
    eval_dataset=eval_data,
)
trainer = SFTTrainer(**trainer_kwargs)

# Start training; left as the last expression so the returned summary is displayed.
trainer.train()

Truncating train dataset:   0%|          | 0/658 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/74 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.1085,0.11129
2,0.0988,0.102831
3,0.0914,0.101239


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=495, training_loss=0.43347612529089957, metrics={'train_runtime': 4941.3137, 'train_samples_per_second': 0.399, 'train_steps_per_second': 0.1, 'total_flos': 4.316128453066752e+16, 'train_loss': 0.43347612529089957})

In [None]:
from google.colab import drive
# Mount Google Drive; the save cell that follows writes under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:

# Destination folder on the mounted Drive for the fine-tuned artifacts.
model_path = "/content/drive/MyDrive/mistral-lora-emotion"

# Save the model first, then the tokenizer into the same folder; the tokenizer call
# stays last so the list of written files is displayed as the cell output.
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)


('/content/drive/MyDrive/mistral-lora-emotion/tokenizer_config.json',
 '/content/drive/MyDrive/mistral-lora-emotion/special_tokens_map.json',
 '/content/drive/MyDrive/mistral-lora-emotion/chat_template.jinja',
 '/content/drive/MyDrive/mistral-lora-emotion/tokenizer.model',
 '/content/drive/MyDrive/mistral-lora-emotion/added_tokens.json',
 '/content/drive/MyDrive/mistral-lora-emotion/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

import os

# Paths: base checkpoint id, saved LoRA adapter folder, and the offload scratch folder.
offload_dir = "/content/offload"
lora_model_path = "/content/drive/MyDrive/mistral-lora-emotion"
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Create the offload dir up front (no error if it already exists).
os.makedirs(name=offload_dir, exist_ok=True)

# Load base model
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Placement/offload arguments shared by the base-model load and the adapter load.
placement_kwargs = {"device_map": "auto", "offload_folder": offload_dir}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    **placement_kwargs,
)

# Load LoRA adapter on top of the float16 base model
model = PeftModel.from_pretrained(base_model, lora_model_path, **placement_kwargs)

# Wrap in pipeline
pipeline_kwargs = dict(model=model, tokenizer=tokenizer, device_map="auto")
pipe = pipeline("text-generation", **pipeline_kwargs)

# Function for emotion detection
def detect_emotion(user_text):
    """Return the emotion label generated for ``user_text``.

    The text is inserted into the instruction prompt and passed to the module-level
    ``pipe`` with sampling disabled and at most 10 new tokens. The portion after the
    last ``[/INST]`` marker, cut at the first ``</s>`` if one is present, is returned
    with surrounding whitespace stripped.

    Args:
        user_text: Sentence to classify.

    Returns:
        The extracted answer string.
    """
    prompt = f'<s>[INST] Detect the emotion in the following text. Output only the emotion name.\n\nText: "{user_text}"\n\nOutput: [/INST]'
    generations = pipe(prompt, max_new_tokens=10, do_sample=False)
    full_text = generations[0]['generated_text']
    after_instruction = full_text.split("[/INST]")[-1].strip()
    answer, _, _ = after_instruction.partition("</s>")
    return answer.strip()

# Run: read one sentence interactively, classify it and print the result.
user_input = input("Enter a sentence to detect the emotion: ")
emotion = detect_emotion(user_input)
print("\nDetected Emotion: {}".format(emotion))


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]