# Sentiment Fine-Tuning



In [1]:
# Optional: Mount Google Drive when running in Colab
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")
except ModuleNotFoundError:
    print("Google Colab not detected; skipping drive mount.")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import sys

# Try multiple possible paths for the project
possible_paths = [
    os.environ.get("PROJECT_ROOT"),
    "/content/drive/MyDrive/Personalized-Investment-Recommendation-System",
    "/content/Personalized-Investment-Recommendation-System",
    os.getcwd(),
    os.path.dirname(os.getcwd()),
]

PROJECT_ROOT = None
for path in possible_paths:
    if path and os.path.exists(path):
        # Check if sentiment_pipeline folder exists
        if os.path.exists(os.path.join(path, "sentiment_pipeline")):
            PROJECT_ROOT = path
            break

if PROJECT_ROOT is None:
    # Try current directory and parent
    if os.path.exists("sentiment_pipeline"):
        PROJECT_ROOT = os.getcwd()
    elif os.path.exists("../sentiment_pipeline"):
        PROJECT_ROOT = os.path.dirname(os.getcwd())
    else:
        raise FileNotFoundError(
            "Could not find project root! Please ensure:\n"
            "1. Project is uploaded to Google Drive\n"
            "2. Path is: /content/drive/MyDrive/Personalized-Investment-Recommendation-System\n"
            "3. Or set PROJECT_ROOT environment variable"
        )

print(f"‚úÖ Using project root: {PROJECT_ROOT}")

# Add to Python path
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Verify sentiment_pipeline can be imported
try:
    import sentiment_pipeline
    print(f"‚úÖ sentiment_pipeline module found at: {sentiment_pipeline.__file__}")
except ImportError as e:
    print(f"‚ùå Cannot import sentiment_pipeline: {e}")
    print(f"Current sys.path: {sys.path[:3]}...")
    raise



‚úÖ Using project root: /content/drive/MyDrive/Personalized-Investment-Recommendation-System
‚úÖ sentiment_pipeline module found at: /content/drive/MyDrive/Personalized-Investment-Recommendation-System/sentiment_pipeline/__init__.py


In [None]:
# # Optional: Generate datasets if they don't exist
# # Uncomment and run this cell if you need to collect and label news data first

# import subprocess
# import sys

# print("Step 1: Collecting Yahoo Finance news...")
# subprocess.check_call([sys.executable, "-m", "sentiment_pipeline.scripts.data_collection"])

# print("\nStep 2: Labeling news with FinBERT...")
# subprocess.check_call([sys.executable, "-m", "sentiment_pipeline.scripts.label_news"])

# print("\nStep 3: Preparing datasets for training...")
# subprocess.check_call([sys.executable, "-m", "sentiment_pipeline.scripts.prepare_dataset"])

# print("\n‚úÖ All datasets ready!")


In [5]:
import subprocess
import importlib

# Aggressive fix for numpy binary incompatibility
print("üîß Fixing numpy binary incompatibility...")

# Step 1: Uninstall all numpy-dependent packages
packages_to_reinstall = ["numpy", "pandas", "scikit-learn", "datasets", "transformers", "accelerate"]
print("Uninstalling packages...")
for pkg in packages_to_reinstall:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "uninstall", pkg, "-y", "--quiet"],
                            stderr=subprocess.DEVNULL)
    except:
        pass

# Step 2: Install numpy first (specific version)
print("Installing numpy 1.26.4...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4", "--no-cache-dir", "--quiet"])

# Step 3: Verify numpy import works
try:
    import numpy as np
    print(f"‚úÖ NumPy {np.__version__} installed successfully")
except Exception as e:
    print(f"‚ùå NumPy import failed: {e}")
    raise

# Step 4: Reinstall packages that depend on numpy
print("Reinstalling numpy-dependent packages...")
subprocess.check_call([sys.executable, "-m", "pip", "install",
                      "pandas==2.2.2",
                      "scikit-learn==1.5.2",
                      "datasets==3.0.1",
                      "--no-cache-dir", "--quiet"])

# Step 5: Install transformers and related packages
print("Installing transformers ecosystem...")
subprocess.check_call([sys.executable, "-m", "pip", "install",
                      "transformers==4.45.2",
                      "accelerate==0.34.2",
                      "peft==0.13.1",
                      "trl==0.9.6",
                      "--no-cache-dir", "--quiet"])

# Step 6: Fix bitsandbytes
print("Installing bitsandbytes...")
subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "bitsandbytes", "-y", "--quiet"],
                     stderr=subprocess.DEVNULL)
subprocess.check_call([sys.executable, "-m", "pip", "install", "bitsandbytes==0.43.2", "--no-cache-dir", "--quiet"])

# Step 7: Install remaining requirements
REQ_PATH = os.path.join(PROJECT_ROOT, "sentiment_pipeline", "requirements.txt")
if os.path.exists(REQ_PATH):
    print("Installing remaining requirements...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", REQ_PATH, "--no-cache-dir", "--quiet"])
else:
    print("requirements.txt not found; installing core packages...")
    subprocess.check_call([sys.executable, "-m", "pip", "install",
                          "torch==2.4.1",
                          "tqdm==4.66.5",
                          "matplotlib==3.9.2",
                          "python-dotenv==1.0.1",
                          "pyyaml==6.0.2",
                          "yfinance==0.2.44",
                          "--no-cache-dir", "--quiet"])

print("\n‚úÖ Dependencies installed!")
print("‚ö†Ô∏è  IMPORTANT: Restart runtime now (Runtime ‚Üí Restart runtime)")
print("   Then run the remaining cells.")



üîß Fixing numpy binary incompatibility...
Uninstalling packages...
Installing numpy 1.26.4...
‚úÖ NumPy 2.0.2 installed successfully
Reinstalling numpy-dependent packages...
Installing transformers ecosystem...
Installing bitsandbytes...
Installing remaining requirements...

‚úÖ Dependencies installed!
‚ö†Ô∏è  IMPORTANT: Restart runtime now (Runtime ‚Üí Restart runtime)
   Then run the remaining cells.


In [3]:
from dotenv import load_dotenv
from datasets import load_from_disk

# Import after PROJECT_ROOT is set
from sentiment_pipeline.utils.config_loader import load_config

load_dotenv()

# Load config
config_path = os.path.join(PROJECT_ROOT, "sentiment_pipeline", "config.yaml")
if not os.path.exists(config_path):
    raise FileNotFoundError(f"Config file not found: {config_path}")

config = load_config(config_path)
print(f"‚úÖ Config loaded from: {config_path}")

# Load datasets
dataset_root = os.path.normpath(os.path.join(PROJECT_ROOT, config["paths"]["hf_dataset_dir"]))
train_path = os.path.join(dataset_root, "train")
val_path = os.path.join(dataset_root, "validation")

if not os.path.exists(train_path):
    raise FileNotFoundError(
        f"Training dataset not found: {train_path}\n"
        "Please run prepare_dataset.py first to generate datasets."
    )

print(f"Loading datasets from: {dataset_root}")
train_ds = load_from_disk(train_path)
val_ds = load_from_disk(val_path)

print(f"‚úÖ Train samples: {len(train_ds)}, Validation samples: {len(val_ds)}")
print("\nSample training data:")
print(train_ds[:2])



‚úÖ Config loaded from: /content/drive/MyDrive/Personalized-Investment-Recommendation-System/sentiment_pipeline/config.yaml
Loading datasets from: /content/drive/MyDrive/Personalized-Investment-Recommendation-System/sentiment_pipeline/data/hf_datasets
‚úÖ Train samples: 32, Validation samples: 8

Sample training data:
{'text': ['Headline: Top Midday Stories: Amazon to Sell Used Ford Vehicles on Website; Novo Nordisk Cuts Wegovy, Ozempic Prices\nSentiment: neutral', 'Headline: Top Midday Stories: Amazon to Sell Used Ford Vehicles on Website; Novo Nordisk Cuts Wegovy, Ozempic Prices\nSentiment: neutral']}


In [4]:
import wandb

if config["wandb"].get("project"):
    wandb.login()
    wandb.init(project=config["wandb"]["project"], entity=config["wandb"].get("entity"))
else:
    print("wandb project not configured; skipping tracking.")



[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mswara090903[0m ([33mswara090903-northeastern-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer
from datasets import Dataset
import warnings
warnings.filterwarnings("ignore", message=".*triton.*")

# Check GPU availability
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    raise RuntimeError(
        "‚ùå GPU not available! Please enable GPU in Colab:\n"
        "Runtime ‚Üí Change runtime type ‚Üí Hardware accelerator ‚Üí GPU (T4 or A100)\n"
        "Then restart runtime and run all cells again."
    )

training_cfg = config["training"]
model_name = training_cfg["model_name"]

# Configure 4-bit quantization (triton warnings can be ignored)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,  # Better compression
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
print("‚úÖ Model loaded successfully!")

# Verify dataset structure (double-check it's still a Dataset)
print("Verifying dataset structure...")
print(f"Train dataset type: {type(train_ds)}")

# Re-verify it's a Dataset (in case variable got overwritten)
if not isinstance(train_ds, Dataset):
    raise TypeError(
        f"‚ùå train_ds is not a Dataset! Type: {type(train_ds)}\n"
        f"Please re-run cell 4 to load the datasets."
    )

print(f"‚úÖ Train dataset columns: {train_ds.column_names}")
print(f"‚úÖ Train dataset features: {train_ds.features}")
print(f"\nSample train data:")
print(train_ds[0])

# Ensure dataset has 'text' column
if "text" not in train_ds.column_names:
    raise ValueError(
        f"Dataset missing 'text' column! Available columns: {train_ds.column_names}\n"
        "Please regenerate datasets using prepare_dataset.py"
    )

lora_config = LoraConfig(
    r=training_cfg["lora_r"],
    lora_alpha=training_cfg["lora_alpha"],
    lora_dropout=training_cfg["lora_dropout"],
    target_modules=training_cfg["target_modules"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Check trl version
import trl
print(f"TRL version: {trl.__version__}")

training_args = SFTConfig(
    output_dir=os.path.join(PROJECT_ROOT, "sentiment_pipeline", "models", "sentiment_model"),
    per_device_train_batch_size=training_cfg["batch_size"],
    per_device_eval_batch_size=training_cfg["batch_size"],
    gradient_accumulation_steps=training_cfg["gradient_accumulation_steps"],
    learning_rate=training_cfg["learning_rate"],
    num_train_epochs=training_cfg["num_epochs"],
    warmup_steps=training_cfg["warmup_steps"],
    fp16=training_cfg["fp16"],
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    max_seq_length=512,
)

# Workaround for trl 0.9.6 bug - ensure dataset is properly formatted
# The issue is that dataset_text_field parameter gets confused with dataset
# Let's use a formatting function instead which is more reliable

def formatting_prompts_func(examples):
    """Format examples for training."""
    output_texts = []
    for text in examples["text"]:
        output_texts.append(text)
    return output_texts

# Try with formatting function (more reliable)
try:
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        peft_config=lora_config,
        formatting_func=formatting_prompts_func,
    )
    print("‚úÖ Trainer initialized with formatting_func")
except Exception as e:
    print(f"Formatting func failed: {e}")
    print("Trying with dataset_text_field...")
    # Fallback to dataset_text_field
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        peft_config=lora_config,
        dataset_text_field="text",
    )
    print("‚úÖ Trainer initialized with dataset_text_field")

trainer.train()
trainer.model.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)



CUDA available: True
GPU: Tesla T4
CUDA version: 12.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Loading model with 4-bit quantization...


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

‚úÖ Model loaded successfully!
Verifying dataset structure...
Train dataset type: <class 'datasets.arrow_dataset.Dataset'>
‚úÖ Train dataset columns: ['text']
‚úÖ Train dataset features: {'text': Value(dtype='string', id=None)}

Sample train data:
{'text': 'Headline: Top Midday Stories: Amazon to Sell Used Ford Vehicles on Website; Novo Nordisk Cuts Wegovy, Ozempic Prices\nSentiment: neutral'}
TRL version: 0.9.6


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


‚úÖ Trainer initialized with formatting_func




Epoch,Training Loss,Validation Loss
1,No log,3.37569
2,No log,3.368268
3,No log,3.307793


('/content/drive/MyDrive/Personalized-Investment-Recommendation-System/sentiment_pipeline/models/sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/Personalized-Investment-Recommendation-System/sentiment_pipeline/models/sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/Personalized-Investment-Recommendation-System/sentiment_pipeline/models/sentiment_model/tokenizer.model',
 '/content/drive/MyDrive/Personalized-Investment-Recommendation-System/sentiment_pipeline/models/sentiment_model/added_tokens.json')

In [6]:
sample_prompt = "Headline: Apple announces record-breaking services revenue\nSentiment:"
output = trainer.model.generate(
    **tokenizer(sample_prompt, return_tensors="pt").to(trainer.model.device),
    max_new_tokens=config["inference"]["max_new_tokens"],
)
print(tokenizer.decode(output[0], skip_special_tokens=True))



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Headline: Apple announces record-breaking services revenue
Sentiment: Positive

Apple Inc. (AAPL) announced its Q1 2023 earnings, reporting a record-breaking $22
