<a href="https://colab.research.google.com/github/Preetam314/CS101/blob/main/NLP_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Install libraries (run in Colab). If you're running locally, you may not need installs.
!pip install -q datasets transformers evaluate seqeval accelerate gradio scikit-learn matplotlib torch>=1.12.0
!pip install -U transformers accelerate


# GPU check (optional)
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: Tesla T4


In [None]:
# Cell 2: Imports and global config
import os
import random
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import torch

# Reproducibility: feed the same seed to every RNG this notebook relies on
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# Use the GPU when CUDA is visible, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)


In [None]:
# Cell 3 (New): Load multiple languages
from datasets import load_dataset, DatasetDict

# 1. Load English data for training
# No `split` argument is given here; the result is indexed by split name further down
# ('train' and 'test').
# NOTE(review): confirm that "amazon_reviews_multi" can still be downloaded with the
# installed `datasets` version before relying on this cell.
ds_en = load_dataset("amazon_reviews_multi", "en")
# Note: amazon_reviews_multi labels are 1-5 stars. We must make them binary (0, 1)
# like amazon_polarity. We'll map 1,2 -> 0 (neg) and 4,5 -> 1 (pos). We'll discard 3.
def process_data(dataset):
    """Turn 1-5 star ratings into binary sentiment labels.

    Ratings below 3 get label 0 (negative) and ratings above 3 get label 1
    (positive). Rows with exactly 3 stars are marked -1 and then filtered out.

    Args:
        dataset: object exposing `map` and `filter`, whose rows carry a "stars" field.

    Returns:
        The mapped and filtered dataset.
    """
    def _stars_to_label(example):
        stars = example["stars"]
        if stars < 3:
            return {"label": 0}
        if stars > 3:
            return {"label": 1}
        return {"label": -1}

    def _is_polar(example):
        return example["label"] != -1

    labelled = dataset.map(_stars_to_label)
    return labelled.filter(_is_polar)

ds_en = process_data(ds_en)

# 2. Other languages are loaded for *testing only* (test split, same label processing)
ds_de, ds_fr, ds_ja = (
    process_data(load_dataset("amazon_reviews_multi", lang_code, split="test"))
    for lang_code in ("de", "fr", "ja")
)


def _shuffled_subset(split, size):
    """Shuffle `split` with the fixed seed 42 and keep its first `size` rows."""
    return split.shuffle(seed=42).select(range(size))


# 3. Smaller subsets keep the demo fast
train_dataset = _shuffled_subset(ds_en['train'], 20000)
eval_dataset_en = _shuffled_subset(ds_en['test'], 2000)  # English test
eval_dataset_de = _shuffled_subset(ds_de, 2000)          # German test
eval_dataset_fr = _shuffled_subset(ds_fr, 2000)          # French test
eval_dataset_ja = _shuffled_subset(ds_ja, 2000)          # Japanese test

# The English splits form the main "dataset" that gets tokenized for training
dataset = DatasetDict({'train': train_dataset, 'test': eval_dataset_en})

# Non-English test sets stay separate, keyed by language name
test_sets_other_langs = dict(
    German=eval_dataset_de,
    French=eval_dataset_fr,
    Japanese=eval_dataset_ja,
)


In [None]:
# Cell 5: Light model selection + tokenizer setup (memory-friendly)
import os
from transformers import AutoTokenizer

# Turn off Weights & Biases tracking
os.environ["WANDB_DISABLED"] = "true"

# Distilled checkpoints keep RAM/VRAM usage low
model_name_mono = "distilbert-base-uncased"              # English only
model_name_multi = "distilbert-base-multilingual-cased"  # Multilingual

# One tokenizer per checkpoint, loaded in the same order as the names above
tokenizer_mono, tokenizer_multi = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (model_name_mono, model_name_multi)
)

# Upper bound on tokens per example, used by the tokenize functions when truncating
max_length = 256
print("Tokenizers loaded successfully ✅")


In [None]:
# Cell 6 (Corrected): Tokenize the correct columns
from datasets import DatasetDict

# --- Limit dataset size for Colab (adjust numbers if you have more RAM) ---
# Re-subsets the English 'dataset' built in Cell 3: shuffle with seed 42, keep the first N rows.
_subset_sizes = {"train": 2000, "test": 500}
dataset = DatasetDict({
    split_name: dataset[split_name].shuffle(seed=42).select(range(n_rows))
    for split_name, n_rows in _subset_sizes.items()
})

# [FIX] Tokenize 'review_body' instead of 'content'
def tokenize_mono(examples):
    """Tokenize the 'review_body' field with the monolingual tokenizer.

    Sequences are truncated to `max_length` tokens; no padding is applied here.
    """
    texts = examples["review_body"]
    return tokenizer_mono(texts, truncation=True, max_length=max_length)

def tokenize_multi(examples):
    """Tokenize the 'review_body' field with the multilingual tokenizer.

    Sequences are truncated to `max_length` tokens; no padding is applied here.
    """
    texts = examples["review_body"]
    return tokenizer_multi(texts, truncation=True, max_length=max_length)

print("Tokenizing English dataset for training/evaluation...")
tokenized_mono, tokenized_multi = (
    dataset.map(tokenize_fn, batched=True)
    for tokenize_fn in (tokenize_mono, tokenize_multi)
)

# Every column of the untokenized dataset is dropped except the label;
# the tokenizer outputs added by `map` are not in this list, so they are kept.
_model_columns = {"input_ids", "attention_mask", "label"}
original_cols = dataset["train"].column_names
cols_to_remove = [col for col in original_cols if col not in _model_columns]

print(f"Removing columns: {cols_to_remove}")
tokenized_mono = tokenized_mono.remove_columns(cols_to_remove)
tokenized_multi = tokenized_multi.remove_columns(cols_to_remove)

print("✅ Tokenization done")
print("Columns:", tokenized_mono["train"].column_names)

In [None]:
# Cell 7: model + Trainer factory (needs transformers 4.41+, where `eval_strategy` replaced `evaluation_strategy`)
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np, torch, os

def make_model_and_trainer(model_name, tokenized_dataset, tokenizer, output_dir,
                           epochs=1, batch_size=8, learning_rate=2e-5,
                           **training_kwargs):
    """Build a 2-class sequence-classification model and a Trainer for it.

    Args:
        model_name: checkpoint name handed to `from_pretrained`.
        tokenized_dataset: mapping with "train" and "test" splits.
        tokenizer: tokenizer matching `model_name`; used for dynamic padding and
            handed to the Trainer.
        output_dir: directory given to `TrainingArguments`; logs go to `<output_dir>/logs`.
        epochs: value for `num_train_epochs`.
        batch_size: per-device batch size for both training and evaluation.
        learning_rate: optimizer learning rate.
        **training_kwargs: extra `TrainingArguments` fields. They override the defaults
            set in this function, so settings such as `eval_strategy` or
            `gradient_accumulation_steps` can be fixed before the Trainer is built.

    Returns:
        Tuple `(model, trainer)`.
    """
    import inspect

    os.environ["WANDB_DISABLED"] = "true"
    num_labels = 2

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    ).to(device)

    arg_values = dict(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="no",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        gradient_checkpointing=True,
        fp16=torch.cuda.is_available(),
        report_to="none",          # disable wandb/tensorboard
        logging_dir=f"{output_dir}/logs",
    )
    # Caller-supplied fields win over the defaults above.
    arg_values.update(training_kwargs)
    args = TrainingArguments(**arg_values)

    data_collator = DataCollatorWithPadding(tokenizer)

    def compute_metrics(pred):
        """Accuracy and weighted F1 from the argmax of the predicted logits."""
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds, average="weighted")
        }

    # Newer transformers releases take the tokenizer as `processing_class` and deprecate
    # the `tokenizer` keyword; pick whichever name the installed Trainer declares.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer}
    )

    return model, trainer

In [None]:
mono_out = "runs/distilbert_fastdemo"

# The small per-device batch (2) is passed to the factory so it is already in place when
# the Trainer is constructed, instead of being patched onto trainer.args afterwards.
model_mono, trainer_mono = make_model_and_trainer(
    model_name_mono, tokenized_mono, tokenizer_mono,
    output_dir=mono_out, epochs=0.5, batch_size=2
)

# Override trainer args for speed.
# The factory configures the `eval_strategy` field; assigning to `evaluation_strategy`
# after construction does not change it and leaves per-epoch evaluation switched on.
trainer_mono.args.eval_strategy = "no"
trainer_mono.args.logging_steps = 200
trainer_mono.args.save_strategy = "no"
# NOTE(review): this is set after the Trainer was built — confirm that the installed
# transformers version honours a late change to gradient accumulation.
trainer_mono.args.gradient_accumulation_steps = 4

trainer_mono.train()
print("✅ Training finished (fast demo mode)")


eval_mono = trainer_mono.evaluate()
print("Monolingual eval:", eval_mono)

In [None]:
# Cell 9: Train the multilingual model (distilbert-base-multilingual-cased) on the same
# English training data
multi_out = "runs/xlm_roberta_multi_demo"

# The small per-device batch (2) is passed to the factory so it is already in place when
# the Trainer is constructed, instead of being patched onto trainer.args afterwards.
model_multi, trainer_multi = make_model_and_trainer(
    model_name_multi, tokenized_multi, tokenizer_multi, multi_out,
    epochs=0.5, batch_size=2, learning_rate=2e-5
)

# The factory configures the `eval_strategy` field; assigning to `evaluation_strategy`
# after construction does not change it and leaves per-epoch evaluation switched on.
trainer_multi.args.eval_strategy = "no"
trainer_multi.args.logging_steps = 200
trainer_multi.args.save_strategy = "no"
# NOTE(review): this is set after the Trainer was built — confirm that the installed
# transformers version honours a late change to gradient accumulation.
trainer_multi.args.gradient_accumulation_steps = 4

trainer_multi.train()
eval_multi = trainer_multi.evaluate()
print("Multilingual eval:", eval_multi)


In [None]:
# Cell 10 (Corrected): Evaluation loop with correct column names

# [FIX] Make sure this function is defined *before* you call it.
# You can move this function to Cell 7 or run it here.
def get_preds_and_report(trainer, dataset, tokenizer):
    """Predict on `dataset`, print a classification report, and return the results.

    Args:
        trainer: object whose `predict(dataset)` result exposes `predictions` and `label_ids`.
        dataset: tokenized dataset to score.
        tokenizer: accepted for call-site symmetry; not used in the body.

    Returns:
        Tuple `(preds, labels)`, where `preds` is the argmax over axis 1 of the predictions.
    """
    output = trainer.predict(dataset)
    y_true = output.label_ids
    y_pred = np.argmax(output.predictions, axis=1)
    report = classification_report(y_true, y_pred, digits=4)
    print(report)
    return y_pred, y_true

results = {}


def _weighted_f1(labels, preds):
    """Weighted-average F1 for one set of predictions."""
    return f1_score(labels, preds, average="weighted")


# 1. English test split (the language both models were trained on)
print("--- Monolingual model on English ---")
mono_preds_en, mono_labels_en = get_preds_and_report(trainer_mono, tokenized_mono['test'], tokenizer_mono)
results["Mono (en)"] = _weighted_f1(mono_labels_en, mono_preds_en)

print("--- Multilingual model on English ---")
multi_preds_en, multi_labels_en = get_preds_and_report(trainer_multi, tokenized_multi['test'], tokenizer_multi)
results["Multi (en)"] = _weighted_f1(multi_labels_en, multi_preds_en)


# 2. Zero-shot evaluation on the languages the models were never trained on
_model_columns = ("input_ids", "attention_mask", "label")
for lang in test_sets_other_langs:
    test_data = test_sets_other_langs[lang]
    print(f"--- Evaluating on {lang} (Zero-Shot) ---")

    # Raw columns of this test set that must go after tokenization (the label stays)
    test_cols = test_data.column_names
    cols_to_remove_test = [col for col in test_cols if col not in _model_columns]

    # Tokenize once per model, reusing the tokenize functions from Cell 6
    tok_mono_lang = test_data.map(tokenize_mono, batched=True)
    tok_mono_lang = tok_mono_lang.remove_columns(cols_to_remove_test)
    tok_multi_lang = test_data.map(tokenize_multi, batched=True)
    tok_multi_lang = tok_multi_lang.remove_columns(cols_to_remove_test)

    # A) Monolingual model (expected to do poorly on non-English text)
    print(f"Monolingual model ({lang}):")
    mono_preds, mono_labels = get_preds_and_report(trainer_mono, tok_mono_lang, tokenizer_mono)
    results[f"Mono ({lang})"] = _weighted_f1(mono_labels, mono_preds)

    # B) Multilingual model (expected to do much better)
    print(f"Multilingual model ({lang}):")
    multi_preds, multi_labels = get_preds_and_report(trainer_multi, tok_multi_lang, tokenizer_multi)
    results[f"Multi ({lang})"] = _weighted_f1(multi_labels, multi_preds)

print("--- Summary F1 Scores ---")
print(results)

In [None]:
# Cell 11 (Corrected): Visualize the *actual* cross-lingual results
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay
import pandas as pd

# 1. Create a DataFrame from the 'results' dict for easy plotting
df_results = pd.DataFrame.from_dict(results, orient='index', columns=['F1-Score'])
df_results = df_results.reset_index().rename(columns={'index': 'Test Case'})

# Split "Mono (German)"-style keys into a model column and a language column
df_results[['Model', 'Language']] = df_results['Test Case'].str.extract(r'(Mono|Multi) \((.*)\)')

# Bar order is taken from the data (first appearance = evaluation order). A hard-coded
# list of language codes would not match the language names stored in `results`, and
# seaborn draws nothing for `order` entries that are absent from the data.
language_order = df_results['Language'].dropna().drop_duplicates().tolist()

# 2. Create the bar chart comparing language performance
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Language',
    y='F1-Score',
    hue='Model',
    data=df_results,
    order=language_order
)
plt.title('Zero-Shot Cross-Lingual Performance (Trained on English)')
plt.ylim(0, 1)
plt.ylabel('F1-Score (Weighted)')
plt.xlabel('Test Language')
plt.legend(loc='upper right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.savefig("cross_lingual_performance.png")
print("Saved cross-lingual performance plot to cross_lingual_performance.png")
plt.show()


# 3. Confusion Matrix
# Plotted for the *English* test set as a baseline, using the predictions saved
# before the per-language loop.
print("\nMultilingual Model Confusion Matrix (on English Test Set)")
# labels=[0, 1] keeps the matrix 2x2 even if one class is missing from the labels or
# predictions, so it always lines up with the two display labels.
cm_multi_en = confusion_matrix(multi_labels_en, multi_preds_en, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm_multi_en, display_labels=["neg","pos"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Multilingual Model (on English)")
plt.savefig("confusion_matrix_multi_en.png")
print("Saved confusion matrix to confusion_matrix_multi_en.png")
plt.show()

In [None]:
# Cell 12: Show examples where the models disagree
def decode_input(tokenizer, tokenized_batch, idx):
    """Rebuild readable text for example `idx` from its stored token ids.

    The ids under 'input_ids' are decoded with special tokens skipped; the original
    text column is not consulted.
    """
    token_ids = tokenized_batch['input_ids'][idx]
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    return decoded_text

# We'll compare predictions on the first 200 test examples
def _predict_label(model, example):
    """Run one tokenized example through `model` and return the argmax class index.

    `example` is a single dataset row; each field is wrapped in a list so the model
    receives a batch of size one containing the *whole* token sequence.
    """
    batch = {
        key: torch.tensor([example[key]], device=device)
        for key in ("input_ids", "attention_mask")
        if key in example
    }
    with torch.no_grad():
        logits = model(**batch).logits
    return int(logits.argmax(dim=1).item())


# Make sure dropout is off before running the models by hand
model_mono.eval()
model_multi.eval()

mono_test = tokenized_mono['test']
multi_test = tokenized_multi['test']

disagreements = []
for i in range(min(200, len(mono_test), len(multi_test))):
    m_pred = _predict_label(model_mono, mono_test[i])
    x_pred = _predict_label(model_multi, multi_test[i])

    true_label = mono_test[i]['label']
    if m_pred != x_pred:
        text = decode_input(tokenizer_mono, mono_test, i)
        disagreements.append((i, text, true_label, m_pred, x_pred))
        # Ten examples are enough for a qualitative look
        if len(disagreements) >= 10:
            break

for d in disagreements:
    idx, text, true, mono_p, multi_p = d
    print(f"Idx {idx} | True: {true} | Mono: {mono_p} | Multi: {multi_p}\n{text}\n---\n")


In [None]:
# Cell 13: Gradio demo to try both models interactively
import gradio as gr

# The demo reuses the tokenizers and the models already in memory (nothing is reloaded)
mono_pipeline_tokenizer, multi_pipeline_tokenizer = tokenizer_mono, tokenizer_multi

# Inference only: switch both models to eval mode
for _demo_model in (model_mono, model_multi):
    _demo_model.eval()

def predict_both(text):
    """Classify `text` with the monolingual and the multilingual model.

    Args:
        text: review text typed into the demo; `None` is treated as an empty string.

    Returns:
        Tuple `(mono_label, multi_label)`, each either "Negative" or "Positive".
    """
    label_map = {0: "Negative", 1: "Positive"}
    text = "" if text is None else str(text)

    def _classify(tokenizer, model):
        """Tokenize `text` for one model and map its argmax class to a label name."""
        encoded = tokenizer(text, truncation=True, padding=True,
                            return_tensors="pt", max_length=max_length).to(device)
        with torch.no_grad():
            logits = model(**encoded).logits
        # .item() reads the single class index directly; int() on a 1-D NumPy array
        # is deprecated in recent NumPy releases.
        return label_map[logits.argmax(dim=1).item()]

    return (_classify(mono_pipeline_tokenizer, model_mono),
            _classify(multi_pipeline_tokenizer, model_multi))

# Output labels name the checkpoints actually configured in Cell 5
# (distilbert-base-uncased and distilbert-base-multilingual-cased).
iface = gr.Interface(fn=predict_both,
                     inputs=gr.Textbox(lines=4, placeholder="Enter review here..."),
                     outputs=[gr.Label(num_top_classes=1, label="Monolingual (DistilBERT)"),
                              gr.Label(num_top_classes=1, label="Multilingual (DistilBERT multilingual)")],
                     title="Monolingual vs Multilingual Sentiment Demo",
                     description="Enter text (any language). Models trained on English demo subset; multilingual model may generalize better to other languages.")
iface.launch(share=False)


In [None]:
# Cell 14: Save the fine-tuned models & tokenizers locally
# Directory names are kept as they were; the checkpoints written here are the two
# DistilBERT models selected in Cell 5.
save_dir_mono = "saved_models/bert_mono"
save_dir_multi = "saved_models/xlm_roberta_multi"

# Each model is stored next to its tokenizer so the folder can be reloaded on its own
_artifacts = (
    (save_dir_mono, model_mono, tokenizer_mono),
    (save_dir_multi, model_multi, tokenizer_multi),
)
for _save_dir, _model, _tokenizer in _artifacts:
    os.makedirs(_save_dir, exist_ok=True)
    _model.save_pretrained(_save_dir)
    _tokenizer.save_pretrained(_save_dir)

print("Saved models to:", save_dir_mono, save_dir_multi)
