<a href="https://colab.research.google.com/github/Saakshitha/Secpen_model/blob/main/finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ✅ Install required packages
!pip install -q -U transformers accelerate peft bitsandbytes


In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from huggingface_hub import login
from google.colab import userdata
import torch
import pandas as pd
from datasets import Dataset

In [3]:
login(userdata.get("token_hf"))
# ✅ Set paths
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
lora_model_dir = "/content/lora_mistral"
offload_dir = "/content/offload_dir"
os.makedirs(offload_dir, exist_ok=True)

# ✅ Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(next(iter(uploaded)))

Saving df_balanced.csv to df_balanced (4).csv


In [6]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
import torch
import os

# ✅ Set base model path
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# ✅ Recommended 4-bit quant config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
import os

required_files = [
    os.path.join(lora_model_dir, "adapter_config.json"),
    os.path.join(lora_model_dir, "adapter_model.bin")
]

if all(os.path.exists(f) for f in required_files):
    print("🔁 Loading fine-tuned LoRA model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    model = PeftModel.from_pretrained(base_model, lora_model_dir)
else:
    print("🆕 Training LoRA on Mistral...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    # ✅ LoRA configuration
    peft_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, peft_config)

    # ✅ Format dataset
    def format_instruction(row):
        return f"<s>[INST] Detect the emotion in the following text. Output only the emotion name.\n\nText: \"{row['Text']}\"\n\nOutput: [/INST] {row['Sentiment'].strip()}</s>"

    df["formatted"] = df.apply(format_instruction, axis=1)
    dataset = Dataset.from_pandas(df[["formatted"]])

    def tokenize(example):
        tokenized = tokenizer(
            example["formatted"],
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )
        tokenized["labels"] = tokenized["input_ids"].clone()
        return tokenized


    dataset = dataset.map(tokenize)

    # ✅ Training setup
    training_args = TrainingArguments(
        output_dir=lora_model_dir,
        per_device_train_batch_size=1,
        num_train_epochs=2,
        logging_steps=10,
        save_strategy="epoch",
        fp16=True,
        optim="paged_adamw_32bit",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer
    )

    trainer.train()

    # ✅ Save model
    model.save_pretrained(lora_model_dir)
    tokenizer.save_pretrained(lora_model_dir)
    print("✅ LoRA fine-tuned model saved.")

🆕 Training LoRA on Mistral...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/732 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: too many values to unpack (expected 4)

In [None]:

# # ✅ Load or Train Model
# if os.path.exists(lora_model_dir):
#     print("🔁 Loading fine-tuned LoRA model...")
#     model = AutoModelForCausalLM.from_pretrained(base_model_id, load_in_4bit=True, device_map="auto", trust_remote_code=True)
#     model = PeftModel.from_pretrained(model, lora_model_dir)
# else:
#     print("🆕 Training LoRA on Mistral...")

#     # ✅ Load base model in 4-bit mode
#     model = AutoModelForCausalLM.from_pretrained(
#         base_model_id,
#         load_in_4bit=True,
#         device_map="auto",
#         trust_remote_code=True
#     )

#     # ✅ Apply LoRA
#     peft_config = LoraConfig(
#         r=8,
#         lora_alpha=32,
#         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#         lora_dropout=0.1,
#         bias="none",
#         task_type="CAUSAL_LM"
#     )
#     model = get_peft_model(model, peft_config)


#     def format_instruction(row):
#         return f"<s>[INST] Detect the emotion in the following text. Output only the emotion name.\n\nText: \"{row['Text']}\"\n\nOutput: [/INST] {row['Sentiment'].strip()}</s>"

#     df["formatted"] = df.apply(format_instruction, axis=1)

#     dataset=Dataset.from_pandas(df[["formatted"]])

#     def tokenize(example):
#         # return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#         return tokenizer(example["formatted"], truncation=True, padding="max_length", max_length=512)

#     dataset = dataset.map(tokenize)

#     # ✅ Training Arguments
#     training_args = TrainingArguments(
#         output_dir=lora_model_dir,
#         per_device_train_batch_size=1,
#         num_train_epochs=2,
#         logging_steps=10,
#         save_strategy="epoch",
#         fp16=True,
#         optim="paged_adamw_32bit",
#         report_to="none"
#     )

#     # ✅ Trainer
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=dataset,
#         tokenizer=tokenizer
#     )

#     # ✅ Train and Save
#     trainer.train()
#     model.save_pretrained(lora_model_dir)
#     tokenizer.save_pretrained(lora_model_dir)
#     print("✅ LoRA fine-tuned model saved.")

In [None]:
from transformers import pipeline

# Load tokenizer and PEFT-wrapped model
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, load_in_4bit=True, device_map="auto", trust_remote_code=True)
model = PeftModel.from_pretrained(base_model, lora_model_dir)
tokenizer = AutoTokenizer.from_pretrained(lora_model_dir, trust_remote_code=True)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

prompt = "### Instruction: What is AI?\n### Response:"
output = pipe(prompt, max_new_tokens=100, do_sample=True, top_k=50, temperature=0.7)

print(output[0]["generated_text"])
