<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/notebooks/phi_2_7_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using A100 GPU

In [None]:
# Colab-only setup: make Google Drive visible under /content/drive and
# authenticate the current user for Google services.
from google.colab import auth, drive

drive.mount('/content/drive')
auth.authenticate_user()

In [None]:
# --------------------------------------------------------------------------
# 1. INSTALL REQUIRED PACKAGES
# --------------------------------------------------------------------------
!pip install transformers==4.44.0 datasets scikit-learn matplotlib torch torchvision torchaudio accelerate bitsandbytes -q

In [None]:
# --------------------------------------------------------------------------
# 2. IMPORTS & INITIAL SETUP
# --------------------------------------------------------------------------
import torch
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import Dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    PhiForSequenceClassification, # Directly import the specific model class for Phi-2
    Trainer,
    TrainingArguments,
    TrainerCallback
)

# --- Check GPU Availability ---
# Query CUDA once and reuse the answer for both the device choice and the
# optional GPU name report.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"Using device: {device}")
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

# --- Time Tracker Callback ---
class TimeTrackerCallback(TrainerCallback):
    """Trainer callback that prints the wall-clock duration of each epoch.

    The start timestamp is stored on the instance when an epoch begins and
    compared with the current time when the epoch ends.
    """

    def __init__(self):
        super().__init__()
        # Defined up front so on_epoch_end cannot fail with AttributeError when
        # it is invoked without a preceding on_epoch_begin on this instance.
        self.start_time = None

    def on_epoch_begin(self, args, state, control, model=None, **kwargs):
        """Record the time at which the current epoch starts."""
        self.start_time = time.time()

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """Print how long the epoch that just finished took."""
        if self.start_time is None:
            # No start timestamp was recorded; nothing meaningful to report.
            return
        elapsed_time = time.time() - self.start_time
        print(f"\nEpoch {state.epoch:.0f} training time: {elapsed_time:.2f} seconds")

In [None]:
# --------------------------------------------------------------------------
# 3. DATA LOADING & PREPARATION
# --------------------------------------------------------------------------

# --- Define the Classification Prompt ---
# Instruction template wrapped around every Reddit post before tokenization.
# It is filled with str.format, and `{post}` is its only placeholder; any
# literal brace added to this text would have to be doubled.
# The instructions describe the labels in the order 0 (deflation),
# 2 (inflation), 1 (neither), and ask for the stronger stance when a post
# contains both, falling back to 1 when they are equally strong.
# NOTE(review): "shortage of goods of services" looks like a typo for
# "goods or services". It is left unchanged here because this text is part of
# the model input; confirm before editing.
INFLATION_PROMPT = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., "the prices are not bad"), affordable services (e.g., "this champagne is cheap and delicious"), sales information (e.g., "you can get it for only 10 dollars."), or a declining and buyer's market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., "it's not cheap"), the unreasonable cost of goods or services (e.g., "the food is overpriced and cold"), consumers struggling to afford necessities (e.g., "items are too expensive to buy"), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., "a gorgeous and costly dinner" or "an affordable Civic"), website promotion, authors' wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.
Reddit Post: {post}
Classification:"""

# --- Load and Process the Dataset ---
def format_with_prompt(post):
    """Return the classification prompt with `post` substituted for `{post}`."""
    return INFLATION_PROMPT.format(post=post)


# Only the file read is inside the try block; everything after it is shared by
# the real dataset and the fallback so both take exactly the same path.
try:
    df = pd.read_csv('/content/drive/MyDrive/world-inflation/data/reddit/production/main-prod-1040.csv', sep=',')
    print(f"Dataset successfully loaded. Shape: {df.shape}")
except FileNotFoundError:
    print("Error: Dataset file not found.")
    print("Please upload your CSV file and update the path in the 'pd.read_csv' function.")

    # Each example is repeated four times (12 rows). A stratified 75/25 split
    # needs at least two rows per class and a validation split holding at
    # least one row per class; the former three-row frame made
    # train_test_split raise ValueError, so the fallback itself crashed.
    dummy_posts = [
        "The price of gas is finally going down.",
        "I can't believe how much a coffee costs now.",
        "What's the weather like today?",
    ]
    dummy_labels = [0, 2, 1]
    dummy_repeats = 4
    df = pd.DataFrame({
        'body': dummy_posts * dummy_repeats,
        'inflation': dummy_labels * dummy_repeats,
    })
    print("\nCreated a dummy dataset to allow the script to run.")

print(f"Class distribution:\n{df['inflation'].value_counts(normalize=True)}")

# Wrap every post in the instruction prompt.
df['formatted_body'] = df['body'].apply(format_with_prompt)

# --- Split Data into Training and Validation Sets ---
# Stratified on the label so both splits keep the class proportions.
train_df, val_df = train_test_split(df, test_size=0.25, random_state=42, stratify=df['inflation'])
print(f"\nTraining set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# --- Convert to Hugging Face Datasets ---
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [None]:
# --------------------------------------------------------------------------
# 4. MODEL & TOKENIZER INITIALIZATION
# --------------------------------------------------------------------------
# --- Define Model ---
model_name = "microsoft/phi-2"
print(f"\nInitializing model: {model_name}")

# --- Initialize Tokenizer ---
# trust_remote_code is intentionally not passed anywhere in this cell: the
# model class is imported directly from transformers, so no code from the Hub
# repository needs to be executed.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding requires a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Load and Configure Model for Sequence Classification ---
# Three output labels matching the 0/1/2 coding used in the prompt.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=3,
    id2label={0: "DEFLATION", 1: "NEUTRAL", 2: "INFLATION"},
    label2id={"DEFLATION": 0, "NEUTRAL": 1, "INFLATION": 2},
)

model = PhiForSequenceClassification.from_pretrained(
    model_name,
    config=config, # Pass the explicit config
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Keep the model's pad token id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
print("Model and tokenizer initialized successfully.")

# --- Tokenization Function ---
def tokenize_function(examples):
    """Tokenize a batch of prompt-formatted posts and attach their labels.

    Args:
        examples: Batch of columns as passed by ``Dataset.map(batched=True)``;
            the 'formatted_body' and 'inflation' columns are read.

    Returns:
        The tokenizer output with an added 'labels' entry copied from the
        'inflation' column.
    """
    # No return_tensors: Dataset.map stores the result in its own columnar
    # format, so building torch tensors here only adds a round-trip conversion.
    tokenized = tokenizer(
        examples['formatted_body'],
        padding="max_length",
        truncation=True,
        # The 512-token budget covers the instruction prompt AND the post.
        # NOTE(review): the post and the trailing "Classification:" cue sit at
        # the end of the text, so if the tokenizer truncates on the right
        # (TODO confirm) long posts lose their tail and the cue.
        max_length=512,
    )
    tokenized['labels'] = examples['inflation']
    return tokenized

# --- Apply Tokenization ---
# Both splits get the same treatment: batched tokenization, with every original
# column dropped so only the tokenizer output and labels remain.
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset)
)
print("\nTokenization complete.")

In [None]:
# --------------------------------------------------------------------------
# 5. TRAINING CONFIGURATION
# --------------------------------------------------------------------------
# --- Define Evaluation Metrics ---
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            score per class for every example (or a tuple whose first element
            does).

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    predictions, labels = eval_pred
    # Tolerate a tuple of model outputs; the class scores are assumed to be
    # its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Predicted class = index of the highest score.
    preds = np.argmax(predictions, axis=1)

    # zero_division=0 yields the same values as the default but without an
    # UndefinedMetricWarning whenever a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# --- Set Up Training Arguments ---
# NOTE(review): both directories are named "Phi-3.5-fine-tuning" although this
# notebook fine-tunes microsoft/phi-2. The paths are left unchanged because
# other code may read checkpoints from them; confirm before renaming.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/world-inflation/data/model/Phi-3.5-fine-tuning",
    logging_dir="/content/phi-2-inflation-finetune/Phi-3.5-fine-tuning/logs",

    # Training parameters
    num_train_epochs=4,
    learning_rate=5e-5,
    per_device_train_batch_size=4, # Adjust based on GPU memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 4 * 4 = 16
    weight_decay=0.01,
    warmup_ratio=0.1,

    # Evaluation and saving
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument and is accepted by the pinned
    # transformers 4.44.0.
    eval_strategy="epoch",
    save_strategy="epoch", # Must match the evaluation schedule for load_best_model_at_end
    save_total_limit=2, # Saves the best and the latest checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,

    # Efficiency
    bf16=True, # bfloat16 mixed precision; needs a GPU with bf16 support (e.g. A100)
    dataloader_pin_memory=False, # Set to False, can sometimes cause issues

    # Other settings
    remove_unused_columns=True,
    logging_steps=10,
    seed=42,
    report_to="none" # Disable reporting to external services like wandb
)

In [None]:
# --------------------------------------------------------------------------
# 6. INITIALIZE AND START TRAINING
# --------------------------------------------------------------------------
# --- Initialize Trainer ---
# The callback instance is created first so the Trainer call reads as a plain
# list of ingredients.
epoch_timer = TimeTrackerCallback()
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
    callbacks=[epoch_timer],
)

# --- Start Training ---
print("\nStarting model training...")
trainer.train()
print("\nTraining finished.")

# --- Evaluate Final Model ---
# load_best_model_at_end is enabled in training_args, so this evaluates the
# best checkpoint rather than the last one.
print("\nEvaluating the best model on the validation set...")
eval_results = trainer.evaluate()
print("\nFinal Evaluation Results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

In [None]:
# --------------------------------------------------------------------------
# 5. TRAINING CONFIGURATION
# --------------------------------------------------------------------------
# --- Define Evaluation Metrics ---
# NOTE(review): this cell repeats the earlier "5. TRAINING CONFIGURATION" cell;
# the definition below shadows the earlier compute_metrics. Consider deleting
# one of the two cells.
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            score per class for every example (or a tuple whose first element
            does).

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    predictions, labels = eval_pred
    # Tolerate a tuple of model outputs; the class scores are assumed to be
    # its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Predicted class = index of the highest score.
    preds = np.argmax(predictions, axis=1)

    # zero_division=0 yields the same values as the default but without an
    # UndefinedMetricWarning whenever a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# --- Set Up Training Arguments ---
# NOTE(review): this cell repeats the earlier "5. TRAINING CONFIGURATION" cell
# with identical values. Also, both directories are named
# "Phi-3.5-fine-tuning" although the notebook fine-tunes microsoft/phi-2; the
# paths are left unchanged because other code may read checkpoints from them.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/world-inflation/data/model/Phi-3.5-fine-tuning",
    logging_dir="/content/phi-2-inflation-finetune/Phi-3.5-fine-tuning/logs",

    # Training parameters
    num_train_epochs=4,
    learning_rate=5e-5,
    per_device_train_batch_size=4, # Adjust based on GPU memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 4 * 4 = 16
    weight_decay=0.01,
    warmup_ratio=0.1,

    # Evaluation and saving
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument and is accepted by the pinned
    # transformers 4.44.0.
    eval_strategy="epoch",
    save_strategy="epoch", # Must match the evaluation schedule for load_best_model_at_end
    save_total_limit=2, # Saves the best and the latest checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,

    # Efficiency
    bf16=True, # bfloat16 mixed precision; needs a GPU with bf16 support (e.g. A100)
    dataloader_pin_memory=False, # Set to False, can sometimes cause issues

    # Other settings
    remove_unused_columns=True,
    logging_steps=10,
    seed=42,
    report_to="none" # Disable reporting to external services like wandb
)

In [None]:
# --------------------------------------------------------------------------
# 6. INITIALIZE AND START TRAINING
# --------------------------------------------------------------------------
# --- Initialize Trainer ---
# NOTE: this cell repeats the earlier "6. INITIALIZE AND START TRAINING" cell.
# Under "Run All", an unconditional second trainer.train() would fine-tune the
# already fine-tuned `model` for another num_train_epochs and write checkpoints
# into the same output_dir. Training is therefore skipped when an earlier cell
# has already produced a trainer that has taken optimisation steps.
already_trained = "trainer" in globals() and trainer.state.global_step > 0

if already_trained:
    print("\nTraining already completed in a previous cell; skipping duplicate training run.")
else:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[TimeTrackerCallback()]
    )

    # --- Start Training ---
    print("\nStarting model training...")
    trainer.train()
    print("\nTraining finished.")

# --- Evaluate Final Model ---
# Evaluation is cheap and idempotent, so it always runs.
print("\nEvaluating the best model on the validation set...")
eval_results = trainer.evaluate()
print("\nFinal Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")