In [1]:
import re
import sys

import emoji
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# Make the project's src/ directory importable before loading the local modules.
sys.path.append('../../src')
from evaluation_transformer import evaluate_transformer
from initial_balanced_dataset import create_balanced_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build the dataset files first; per the note on this cell the helper skips
# the work when they already exist.
create_balanced_dataset()

# Read the balanced tweets and the separate test set from disk.
df = pd.read_csv("../../dataset/initial_balanced_tweets.csv")
df_test = pd.read_csv("../../dataset/test_set.csv")

Both train/val and test datasets already exist. Skipping creation.


In [None]:
# Load and Preprocess Data
# Remove rows whose 'tweet' value is missing from both frames, then report the
# row count of the frame that was just cleaned (the second message previously
# repeated the count of `df` instead of `df_test`).
df = df.dropna(subset=['tweet'])
print(f"Loaded {len(df):,} rows.")
df_test = df_test.dropna(subset=['tweet'])
print(f"Loaded {len(df_test):,} rows.")

def preprocess_for_bert(text):
    """Apply minimal normalisation to a single tweet.

    Steps, in order: replace ``@handle`` mentions with ``@USER``, replace
    http(s)/www links with ``HTTPURL``, pass the text through
    ``emoji.demojize``, then collapse every whitespace run to one space and
    strip the ends.

    Args:
        text: The raw tweet. Any value that is not a ``str`` is treated as
            empty.

    Returns:
        The normalised string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    with_masked_mentions = re.sub(r'@\w+', '@USER', text)
    with_masked_links = re.sub(r'http[s]?://\S+|www\.\S+', 'HTTPURL', with_masked_mentions)
    with_text_emoji = emoji.demojize(with_masked_links)
    return re.sub(r'\s+', ' ', with_text_emoji).strip()


# Add the normalised text column to both frames.
# The test frame is copied into `test_df`, the name the later cells read;
# assigning a column to `test_df` before it existed raised a NameError.
df['bert_text'] = df['tweet'].apply(preprocess_for_bert)
test_df = df_test.copy()
test_df['bert_text'] = test_df['tweet'].apply(preprocess_for_bert)
print("Minimal preprocessing for FinBERT complete.")

In [None]:
# Split Data and Create Hugging Face Datasets
# Hold out 20% of `df` for validation, stratified on the 'sentiment' column
# and seeded so the split is repeatable.
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['sentiment']
)

print(f"Training set size: {len(train_df):,} samples")
print(f"Val set size: {len(val_df):,} samples")
# This line reports the test frame; it was previously mislabelled as "Val".
print(f"Test set size: {len(test_df):,} samples")

# Each Dataset carries the preprocessed text under "text" and the sentiment
# value under "label".
train_dataset = Dataset.from_dict({"text": train_df['bert_text'].tolist(), "label": train_df['sentiment'].tolist()})
val_dataset = Dataset.from_dict({"text": val_df['bert_text'].tolist(), "label": val_df['sentiment'].tolist()})
test_dataset = Dataset.from_dict({"text": test_df['bert_text'].tolist(), "label": test_df['sentiment'].tolist()})

# Label columns kept separately; they are passed to evaluate_transformer later.
y_train = train_dataset['label']
y_test  = test_dataset['label']

In [None]:
# FinBERT QLoRA Fine-tuning

# Verify GPU Availability
if torch.cuda.is_available():
    print(f"\nGPU is available! Using: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("\nNo GPU available. This fine-tuning process will be extremely slow on CPU.")
    print("Please go to Runtime > Change runtime type and select 'GPU' as the hardware accelerator.")
    device = torch.device("cpu")

# Load FinBERT Model and Tokenizer
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit quantization settings for QLoRA. Passing `load_in_4bit=True` straight
# to `from_pretrained` is deprecated; an explicit BitsAndBytesConfig with the
# same single option is the supported equivalent.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the model with 4-bit quantization for QLoRA
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    torch_dtype=torch.bfloat16, # Better precision with 4-bit quantization on compatible GPUs
    quantization_config=quantization_config,
)

# Prepare model for k-bit training (QLoRA specific)
model = prepare_model_for_kbit_training(model)

# Configure QLoRA: rank-16 adapters on the modules named "query" and "value"
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)

# Wrap the model with the LoRA adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Tokenize the Datasets
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Args:
        examples: A mapping with a "text" key, as passed by ``Dataset.map``.

    Returns:
        The tokenizer's output for those texts, truncated to at most 128
        tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=128)

# Run the same batched tokenization over the three splits, in
# train -> val -> test order, dropping the raw "text" column from each result.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=["text"])
    for split in (train_dataset, val_dataset, test_dataset)
)

# Show one tokenized training example.
print(tokenized_train_dataset[0])

In [None]:
# Define Training Arguments
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="../../models/v1-1/baseline/finbert_qlora_minimal-preproc",
    logging_dir="../../training_logs/v1-1/baseline/finbert_qlora_logs_minimal-preproc",
    report_to="none",
    # Optimisation schedule
    num_train_epochs=5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # 16 x 2 gives an effective batch of 32
    warmup_steps=500,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Evaluation, logging and checkpoint cadence
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=500,
    # Best-checkpoint selection, keyed on the "f1" entry of compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Data loading
    dataloader_num_workers=2,
)

# Define Metrics for Evaluation
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 1 to obtain one predicted class per row.

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    scores, labels = p
    predicted_classes = np.argmax(scores, axis=1)
    # zero_division=0 keeps the same numeric result as the default (0 for a
    # class with no predicted samples) without emitting a warning on every
    # evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_classes, average='macro', zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


# Create Trainer Instance
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Train the Model
print("\nStarting FinBERT QLoRA fine-tuning...")
trainer.train()
print("Fine-tuning complete!")

In [None]:
# Evaluate the fine-tuned model on the Trainer's evaluation dataset
print("\nEvaluating the fine-tuned FinBERT model...")
evaluation_results = trainer.evaluate()
print("Evaluation Results (from Trainer.evaluate()):")
print(evaluation_results)

# Evaluate on the test set with the project helper, which receives the
# tokenized train/test datasets and their label lists.
print("EVALUATE ON TEST SET")
evaluate_transformer(
    trainer,
    tokenized_train_dataset,
    tokenized_test_dataset,
    y_train,
    y_test,
    model,
    save_dir="../../evaluation/baseline/qlora_minimal",
)

In [None]:
# Save the fine-tuned model through the Trainer into a dedicated directory.
# NOTE(review): presumably only the small trainable LoRA adapter weights are
# written here, not the full FinBERT model, because the model is wrapped by
# get_peft_model. Confirm by inspecting the output directory.
save_path = "../../models/v1-1/baseline/finbert_qlora_minimal_finetuned_adapters"
trainer.save_model(save_path)
print(f"\nFine-tuned LoRA adapters saved to {save_path}")