# Phase 3 Sentiment Fine-Tuning

This Colab-friendly notebook fine-tunes **Mistral-7B-Instruct-v0.3** on labeled Yahoo Finance headlines using LoRA adapters.

> **Prereqs**
> 1. Upload the project folder to Google Drive (or clone via git).
> 2. Generate labeled data via `data_collection.py` → `label_news.py` → `prepare_dataset.py`.
> 3. Ensure you have a GPU runtime (T4/A100 preferred).



In [None]:
# Optional: mount Google Drive when running in Colab.
# Only the import is guarded: if `google.colab` is importable but mounting
# itself fails, that error should surface instead of being reported as
# "Colab not detected".
try:
    from google.colab import drive  # type: ignore
except ModuleNotFoundError:
    print("Google Colab not detected; skipping drive mount.")
else:
    drive.mount("/content/drive")



In [None]:
import os
import sys

DEFAULT_PROJECT_ROOT = "/content/drive/MyDrive/Personalized-Investment-Recommendation-System"


def resolve_project_root(environ=None):
    """Return the directory to use as the project root.

    The ``PROJECT_ROOT`` environment variable is preferred, then the default
    Google Drive location. If the selected path does not exist on disk, the
    current working directory is returned instead, so a stale or mistyped
    ``PROJECT_ROOT`` no longer propagates a nonexistent path to later cells.

    Args:
        environ: Mapping to read ``PROJECT_ROOT`` from; defaults to ``os.environ``.

    Returns:
        The chosen project root path as a string.
    """
    environ = os.environ if environ is None else environ
    candidate = environ.get("PROJECT_ROOT", DEFAULT_PROJECT_ROOT)
    if os.path.exists(candidate):
        return candidate
    return os.getcwd()


PROJECT_ROOT = resolve_project_root()
print(f"Using project root: {PROJECT_ROOT}")

# Make the `sentiment_pipeline` package importable from later cells.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)



In [None]:
import subprocess

# Install the dependencies listed in the pipeline's requirements file into the
# interpreter that is running this notebook (sys.executable), if the file exists.
REQ_PATH = os.path.join(PROJECT_ROOT, "sentiment_pipeline", "requirements.txt")
if not os.path.exists(REQ_PATH):
    print("requirements.txt not found; ensure deps are installed locally.")
else:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-r", REQ_PATH],
    )



In [None]:
from dotenv import load_dotenv
from datasets import load_from_disk

from sentiment_pipeline.utils.config_loader import load_config

# Load variables from a .env file before anything reads the environment.
load_dotenv()

# Pipeline settings come from sentiment_pipeline/config.yaml under the project root.
config = load_config(
    os.path.join(PROJECT_ROOT, "sentiment_pipeline", "config.yaml")
)

# The dataset directory in the config is joined onto the project root and normalized.
dataset_root = os.path.normpath(
    os.path.join(PROJECT_ROOT, config["paths"]["hf_dataset_dir"])
)

# Each split is stored in its own sub-directory of the dataset root.
train_ds, val_ds = (
    load_from_disk(os.path.join(dataset_root, split_name))
    for split_name in ("train", "validation")
)

# Quick sanity check: show the first two training rows.
print(train_ds[:2])



In [None]:
import wandb

# Start a Weights & Biases run only when a project name is configured.
wandb_cfg = config["wandb"]
if not wandb_cfg.get("project"):
    print("wandb project not configured; skipping tracking.")
else:
    wandb.login()
    wandb.init(
        project=wandb_cfg["project"],
        entity=wandb_cfg.get("entity"),
    )



In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

import inspect

training_cfg = config["training"]
model_name = training_cfg["model_name"]


def pick_supported_kwarg(target, preferred, legacy):
    """Return ``preferred`` if ``target`` accepts it as a keyword, else ``legacy``.

    Some keywords were renamed across transformers/TRL releases. Checking the
    signature keeps this cell working with whichever version
    requirements.txt installs.
    """
    return preferred if preferred in inspect.signature(target).parameters else legacy


# Load the base model in 4-bit NF4 with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
tokenizer.padding_side = "right"
# Reuse EOS for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# LoRA adapter hyper-parameters all come from the `training` config section.
lora_config = LoraConfig(
    r=training_cfg["lora_r"],
    lora_alpha=training_cfg["lora_alpha"],
    lora_dropout=training_cfg["lora_dropout"],
    target_modules=training_cfg["target_modules"],
    bias="none",
    task_type="CAUSAL_LM",
)

sft_config_kwargs = {
    "output_dir": os.path.join(PROJECT_ROOT, "sentiment_pipeline", "models", "sentiment_model"),
    "per_device_train_batch_size": training_cfg["batch_size"],
    "per_device_eval_batch_size": training_cfg["batch_size"],
    "gradient_accumulation_steps": training_cfg["gradient_accumulation_steps"],
    "learning_rate": training_cfg["learning_rate"],
    "num_train_epochs": training_cfg["num_epochs"],
    "warmup_steps": training_cfg["warmup_steps"],
    "fp16": training_cfg["fp16"],
    "logging_steps": 10,
    "save_strategy": "epoch",
    # The text column is an SFTConfig option; passing it to SFTTrainer is
    # deprecated and rejected by newer TRL releases.
    "dataset_text_field": "text",
    # `evaluation_strategy` was renamed to `eval_strategy` in newer transformers.
    pick_supported_kwarg(SFTConfig, "eval_strategy", "evaluation_strategy"): "epoch",
}

# Without a configured wandb project, turn off the trainer's reporting
# integrations so training does not try to start (and log in to) a wandb run.
if not config["wandb"].get("project"):
    sft_config_kwargs["report_to"] = "none"

training_args = SFTConfig(**sft_config_kwargs)

# Newer TRL releases take the tokenizer as `processing_class`; older ones as `tokenizer`.
tokenizer_kwarg = pick_supported_kwarg(SFTTrainer, "processing_class", "tokenizer")

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    peft_config=lora_config,
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()

# Save the trained model (via trainer.model) and the tokenizer side by side
# in the output directory.
trainer.model.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)



In [None]:
# Smoke-test the fine-tuned model on one hand-written headline.
sample_prompt = "Headline: Apple announces record-breaking services revenue\nSentiment:"

# Switch to eval mode so dropout (including LoRA dropout) is disabled during
# generation; otherwise the completion can vary from run to run.
trainer.model.eval()

# Tokenize and move the tensors to the device the model lives on.
prompt_inputs = tokenizer(sample_prompt, return_tensors="pt").to(trainer.model.device)
output = trainer.model.generate(
    **prompt_inputs,
    max_new_tokens=config["inference"]["max_new_tokens"],
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

