In [1]:
import os
import shutil

import numpy as np
import pandas as pd
import regex as re
import torch
from datasets import Dataset
from IPython.display import FileLink
from sklearn.metrics import accuracy_score, classification_report
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)




In [2]:
# Load datasets: read the train / test / validation splits, in that order
train_df, test_df, validation_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", "validation.csv")
)

In [3]:
# Inspect the column names of the raw training split
train_df.columns

Index(['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars',
       'review_body', 'review_title', 'language', 'product_category'],
      dtype='object')

In [4]:
# Distinct language codes in the training split, in order of first appearance
unique_languages = pd.unique(train_df['language'])
print(unique_languages)

['de' 'en' 'es' 'fr' 'ja' 'zh']


In [5]:
# Number of training reviews per language code
language_counts = train_df['language'].value_counts()
print(language_counts)

language
de    200000
en    200000
es    200000
fr    200000
ja    200000
zh    200000
Name: count, dtype: int64


In [6]:
# Number of training reviews per star rating
stars_counts = train_df['stars'].value_counts()
print(stars_counts)

stars
1    240000
2    240000
3    240000
4    240000
5    240000
Name: count, dtype: int64


In [7]:
# Data Preprocessing Function
def preprocess_data(df, sample_frac=0.20, max_samples=10000):
    """Clean a raw review frame and return a per-language subsample for training.

    Steps: drop identifier columns, fill missing text/stars, merge title and
    body into ``full_review`` (letters and whitespace only, lower-cased), drop
    reviews with fewer than three whitespace-separated tokens and duplicate
    texts, derive a 3-class sentiment label from ``stars``, sample
    ``sample_frac`` of each language group, then cap the result at
    ``max_samples`` rows.

    Args:
        df: Raw reviews with at least ``review_title``, ``review_body`` and
            ``stars`` columns. The caller's frame is not modified.
        sample_frac: Fraction of each language group to keep.
        max_samples: Upper bound on the number of returned rows.

    Returns:
        DataFrame with ``full_review``, ``label`` (0 = Negative, 1 = Neutral,
        2 = Positive), ``language`` (only when the input has that column) and
        ``sentiment`` columns.
    """
    # Drop unnecessary columns; errors='ignore' already skips absent ones, and
    # drop() returns a new frame so the caller's frame stays untouched.
    cols_to_drop = ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'product_category']
    df = df.drop(columns=cols_to_drop, errors='ignore')

    # Handle missing values.
    # NOTE(review): a missing rating is imputed as 3 stars, i.e. it becomes a
    # "Neutral" training label - confirm this is intended rather than dropping the row.
    df['review_title'] = df['review_title'].fillna('')
    df['review_body'] = df['review_body'].fillna('')
    df['stars'] = df['stars'].fillna(3).astype(int)

    # Combine and clean text: keep only letters and whitespace, lower-case, trim.
    # `re` is the third-party `regex` package imported at the top of the notebook;
    # the \p{L} class used here is not supported by the standard-library `re`.
    df['full_review'] = (df['review_title'] + ' ' + df['review_body']).apply(
        lambda text: re.sub(r'[^\p{L}\s]', '', text).lower().strip()
    )

    # Keep reviews with at least three whitespace-separated tokens, then de-duplicate.
    # NOTE(review): scripts written without spaces (e.g. the 'ja' and 'zh' reviews)
    # rarely reach three whitespace tokens, so this filter presumably removes a
    # disproportionate share of those languages - confirm, and consider a
    # character-length threshold for them.
    df = df[df['full_review'].str.split().str.len() >= 3]
    df = df.drop_duplicates(subset=['full_review'])

    # Create sentiment labels (>= 4 stars Positive, 3 Neutral, otherwise Negative)
    sentiment_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}
    sentiment = df["stars"].apply(
        lambda x: "Positive" if x >= 4 else "Neutral" if x == 3 else "Negative"
    )
    df = df.assign(sentiment=sentiment, label=sentiment.map(sentiment_mapping))

    # Subsample the same fraction of every language.
    # Sampling each group explicitly replaces `groupby(...).apply(...)`, whose
    # handling of the grouping column is deprecated in pandas (a future version
    # would drop `language` from the result). Each group is still sampled with
    # random_state=42 and concatenated in group order, so the selected rows
    # are unchanged.
    if 'language' in df.columns:
        sampled_groups = [
            group.sample(frac=sample_frac, random_state=42)
            for _, group in df.groupby('language')
        ]
        # No groups (empty frame): keep the empty frame rather than concat([]) raising.
        if sampled_groups:
            df = pd.concat(sampled_groups)

    # Limit total samples to max_samples
    df = df.sample(n=min(len(df), max_samples), random_state=42)

    # `language` is optional above, so only select it when it is present.
    output_cols = [
        col for col in ('full_review', 'label', 'language', 'sentiment')
        if col in df.columns
    ]
    return df[output_cols]

# Apply preprocessing to every split with the default sampling settings.
# Each name is rebound to its processed frame; re-running this cell on the
# already processed frames is not supported because the raw text columns are gone.
train_df, test_df, validation_df = map(
    preprocess_data, (train_df, test_df, validation_df)
)

  df = df.groupby('language', group_keys=False).apply(
  df = df.groupby('language', group_keys=False).apply(
  df = df.groupby('language', group_keys=False).apply(


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 739386 to 829894
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   full_review  10000 non-null  object
 1   label        10000 non-null  int64 
 2   language     10000 non-null  object
 3   sentiment    10000 non-null  object
dtypes: int64(1), object(3)
memory usage: 390.6+ KB


In [9]:
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
# NOTE(review): this was originally described as "a smaller, faster model", yet the
# recorded training run took ~17.5 hours on CPU - confirm whether a lighter
# checkpoint was intended.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
# Convert data to Hugging Face dataset format.
# preserve_index=False: after sampling, the frames carry a shuffled, non-default
# pandas index, which from_pandas would otherwise store as an extra
# "__index_level_0__" column that the model never consumes.
train_dataset = Dataset.from_pandas(train_df[['full_review', 'label']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['full_review', 'label']], preserve_index=False)
val_dataset = Dataset.from_pandas(validation_df[['full_review', 'label']], preserve_index=False)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of reviews, padded/truncated to 128 tokens.

    Args:
        examples: Batch mapping supplied by ``Dataset.map``; only its
            ``'full_review'`` entry is read.

    Returns:
        The output of the module-level ``tokenizer`` for those texts.
    """
    review_texts = examples['full_review']
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(review_texts, **encode_options)

# Apply tokenization to each split in batches (train, then test, then validation)
train_dataset, test_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4765 [00:00<?, ? examples/s]

Map:   0%|          | 0/4770 [00:00<?, ? examples/s]

In [11]:
# Load model with a 3-way classification head; both label maps are derived
# from one ordered tuple so the id <-> name pairs cannot drift apart.
sentiment_names = ("Negative", "Neutral", "Positive")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(sentiment_names),
    id2label=dict(enumerate(sentiment_names)),
    label2id={name: idx for idx, name in enumerate(sentiment_names)},
)


# Training configuration: 3 epochs, evaluating and checkpointing after each one
# so the best checkpoint can be restored when training ends.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
gpu_available = torch.cuda.is_available()
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    # Schedule and regularisation
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save on the same per-epoch cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is present
    fp16=gpu_available,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def compute_metrics(p):
    """Compute accuracy and macro-F1 for a 3-class evaluation pass.

    Args:
        p: Evaluation output exposing ``predictions`` (per-class scores, or a
            tuple containing them) and ``label_ids`` (integer class ids).

    Returns:
        dict with ``"accuracy"`` and ``"macro_f1"`` entries.
    """
    class_names = ["Negative", "Neutral", "Positive"]

    # When `predictions` is a tuple, assume the class scores are its first
    # element - TODO confirm for models that return extra outputs.
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = scores.argmax(-1)

    report = classification_report(
        p.label_ids, preds,
        # Pin the label set: if it is inferred from the data and a class is
        # absent from both labels and predictions, it no longer matches
        # target_names and classification_report raises ValueError.
        labels=list(range(len(class_names))),
        target_names=class_names,
        output_dict=True,
        # Score classes with no predictions/support as 0 without emitting warnings.
        zero_division=0,
    )
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "macro_f1": report["macro avg"]["f1-score"],
    }

# Initialize Trainer: fine-tune on the training split and score each epoch on
# the validation split with compute_metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Train the model; the returned TrainOutput is the cell's displayed value
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.7478,0.652451,0.73501,0.608442
2,0.58,0.601879,0.731656,0.686679
3,0.4707,0.603996,0.762055,0.698636


TrainOutput(global_step=1875, training_loss=0.6478162394205729, metrics={'train_runtime': 63061.9871, 'train_samples_per_second': 0.476, 'train_steps_per_second': 0.03, 'total_flos': 1973350632960000.0, 'train_loss': 0.6478162394205729, 'epoch': 3.0})

### **Analysis of Model Performance**  

#### ✅ **Improvements**  
1. **Training Loss Decreasing** – From **0.7478 → 0.5800 → 0.4707**  
   - This means the model is fitting the training data better; on its own it says nothing about generalization, which the validation metrics measure.  
2. **Macro F1 Score Increasing** – From **0.6084 → 0.6867 → 0.6986**  
   - A steady increase in F1 indicates that the model is improving in terms of balanced performance across all classes.  
3. **Validation Loss Fluctuating** – Slight variation (**0.652 → 0.602 → 0.604**)  
   - It decreased after epoch 1 and then rose slightly in epoch 3 while the training loss kept falling, which suggests the onset of overfitting.  

The model has completed **3 epochs** with the following results:

### **Key Metrics**  
- **Training Loss:** **0.6478** (average over all 1,875 training steps; the logged loss decreased every epoch)  
- **Training Time:** **~63,062 seconds (~17.5 hours)** (long because training ran on CPU; no GPU was available)  
- **Samples per second:** **0.476** (Very slow, likely due to large model size and CPU usage)  
- **Steps per second:** **0.03** (Very low, may need optimization)  
- **Total FLOPs:** **~1.97 quadrillion** (High computational cost)  

In [None]:
# Final evaluation on the held-out test split.
# Trainer, AutoTokenizer and AutoModelForSequenceClassification are already
# imported in the notebook's first cell, so nothing is re-imported here.
results = trainer.evaluate(eval_dataset=test_dataset)
print("\n🔹 Test Set Results:")
print(results)

In [22]:
# Save the trained model and tokenizer into the same "sentiment_model" directory
trainer.save_model("sentiment_model")
# Last expression of the cell: its return value (the tokenizer file paths
# written) is what the cell displays.
tokenizer.save_pretrained("sentiment_model")

('sentiment_model\\tokenizer_config.json',
 'sentiment_model\\special_tokens_map.json',
 'sentiment_model\\tokenizer.json')

# Directory the model and tokenizer were saved into by the cell above
model_dir = "sentiment_model"

# Ensure the 'sentiment_model' folder exists: fail early with a clear message
# instead of attempting to archive a missing directory.
if not os.path.isdir(model_dir):
    raise FileNotFoundError(
        f"'{model_dir}' not found - save the model before creating the archive."
    )

# Zip the 'sentiment_model' folder (creates sentiment_model.zip in the working directory)
shutil.make_archive(model_dir, 'zip', model_dir)

# Provide a download link to the zip file
FileLink(f"{model_dir}.zip")
