In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    XLMRobertaTokenizer, 
    XLMRobertaForSequenceClassification,
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import warnings
warnings.filterwarnings('ignore')

# Banner for this notebook run.
print("="*80)
print("🤖 MULTILINGUAL FAKE NEWS DETECTION - MODEL TRAINING")
print("="*80)

# Check GPU availability: use CUDA when torch can see it, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n💻 Device: {device}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
else:
    print("   Note: Training on CPU will be slower. Consider using Google Colab for GPU access.")

# Load the combined dataset
print("\n📂 Loading dataset...")
df = pd.read_csv('../data/processed/multilingual_combined.csv')
# Report the language count from the data rather than a hard-coded number,
# so the message stays true if the CSV gains or loses a language.
print(f"✓ Loaded {len(df)} articles across {df['language'].nunique()} languages")
print("\nDataset breakdown:")
# One grouped pass instead of three full-frame filters per language.
# sort=False keeps first-appearance order (the order .unique() would give);
# rows with a missing language are skipped instead of crashing .capitalize().
for lang, lang_rows in df.groupby('language', sort=False):
    fake = int((lang_rows['label'] == 1).sum())
    real = int((lang_rows['label'] == 0).sum())
    print(f"  {lang.capitalize():8s}: {len(lang_rows):5d} total ({fake} fake, {real} real)")


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
W1019 15:50:54.828105 11432 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(


🤖 MULTILINGUAL FAKE NEWS DETECTION - MODEL TRAINING

💻 Device: cpu
   Note: Training on CPU will be slower. Consider using Google Colab for GPU access.

📂 Loading dataset...
✓ Loaded 21414 articles across 3 languages

Dataset breakdown:
  English : 10000 total (5000 fake, 5000 real)
  Hindi   : 10000 total (5000 fake, 5000 real)
  Marathi :  1414 total (707 fake, 707 real)


In [2]:
print("\n" + "="*80)
print("📊 SPLITTING DATASET")
print("="*80)


def _three_way_split(frame):
    """Split one language's rows into (train, val, test), stratified by label.

    15% is held out as the test set first; 0.176 of the remaining 85%
    (about 15% of the total) then becomes validation, which leaves roughly
    70% for training. Both splits use random_state=42.
    """
    remainder, held_out = train_test_split(
        frame, test_size=0.15, random_state=42, stratify=frame['label']
    )
    fit_part, val_part = train_test_split(
        remainder, test_size=0.176, random_state=42, stratify=remainder['label']
    )
    return fit_part, val_part, held_out


def _merge_and_shuffle(parts):
    """Concatenate per-language frames and shuffle the rows (random_state=42)."""
    merged = pd.concat(parts, ignore_index=True)
    return merged.sample(frac=1, random_state=42)


# Split language by language so every language lands in train, val and test.
train_dfs, val_dfs, test_dfs = [], [], []

for language in df['language'].unique():
    lang_df = df[df['language'] == language]
    train_lang, val_lang, test_lang = _three_way_split(lang_df)

    train_dfs.append(train_lang)
    val_dfs.append(val_lang)
    test_dfs.append(test_lang)

    print(f"\n{language.capitalize()}:")
    print(f"  Train: {len(train_lang):4d} | Val: {len(val_lang):4d} | Test: {len(test_lang):4d}")

# Combine all languages into one shuffled frame per partition.
train_df = _merge_and_shuffle(train_dfs)
val_df = _merge_and_shuffle(val_dfs)
test_df = _merge_and_shuffle(test_dfs)

print("\n" + "="*80)
print("TOTAL SPLIT:")
# Labels are pre-padded so the counts line up in one column.
for caption, part in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    print(f"  {caption} {len(part):5d} ({len(part)/len(df)*100:.1f}%)")
print("="*80)



📊 SPLITTING DATASET

English:
  Train: 7004 | Val: 1496 | Test: 1500

Hindi:
  Train: 7004 | Val: 1496 | Test: 1500

Marathi:
  Train:  989 | Val:  212 | Test:  213

TOTAL SPLIT:
  Train: 14997 (70.0%)
  Val:    3204 (15.0%)
  Test:   3213 (15.0%)


In [3]:
print("\n" + "="*80)
print("🔤 TOKENIZATION")
print("="*80)

# Checkpoint name for the XLM-RoBERTa tokenizer loaded below.
model_name = "xlm-roberta-base"
print(f"\n📥 Loading tokenizer: {model_name}")
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
print("✓ Tokenizer loaded")


def _as_hf_dataset(frame, columns):
    """Build a Hugging Face Dataset from the chosen columns with a fresh 0..n-1 index."""
    return Dataset.from_pandas(frame[columns].reset_index(drop=True))


# Convert to Hugging Face Dataset format.
# The test split additionally keeps the 'language' column.
train_dataset = _as_hf_dataset(train_df, ['text', 'label'])
val_dataset = _as_hf_dataset(val_df, ['text', 'label'])
test_dataset = _as_hf_dataset(test_df, ['text', 'label', 'language'])

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 256 tokens.

    Args:
        examples: a batch mapping whose 'text' entry holds the raw strings
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer's encoding for the batch, with sequences left at their
        own (truncated) length.

    No padding is applied here on purpose. The Trainer in this notebook is
    given a `DataCollatorWithPadding`, which pads each batch to the longest
    sequence in that batch. Padding every example to 256 tokens up front made
    that collator a no-op and forced every training step to process
    full-length sequences, which is especially costly on CPU.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=256  # Reduced for faster training
    )

print("\n🔄 Tokenizing datasets...")
print("  This may take a few minutes...")

# Tokenize the three splits in order (train, val, test). The raw 'text'
# column is dropped from each result once it has been encoded.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True, remove_columns=['text'])
    for split in (train_dataset, val_dataset, test_dataset)
)

print("✓ Tokenization complete!")
# Captions are pre-padded so the sample counts line up.
for caption, split in (
    ("Train:", train_tokenized),
    ("Val:  ", val_tokenized),
    ("Test: ", test_tokenized),
):
    print(f"  {caption} {len(split)} samples")



🔤 TOKENIZATION

📥 Loading tokenizer: xlm-roberta-base
✓ Tokenizer loaded

🔄 Tokenizing datasets...
  This may take a few minutes...


Map:   0%|          | 0/14997 [00:00<?, ? examples/s]

Exception in thread Thread-3:
Traceback (most recent call last):
  File "C:\Users\vedan\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "D:\Projects\Multilingual_Misinformation_Detection\env\lib\site-packages\tqdm\_monitor.py", line 84, in run
    instance.refresh(nolock=True)
  File "D:\Projects\Multilingual_Misinformation_Detection\env\lib\site-packages\tqdm\std.py", line 1347, in refresh
    self.display()
  File "D:\Projects\Multilingual_Misinformation_Detection\env\lib\site-packages\tqdm\notebook.py", line 171, in display
    rtext.value = right
  File "D:\Projects\Multilingual_Misinformation_Detection\env\lib\site-packages\traitlets\traitlets.py", line 716, in __set__
    self.set(obj, value)
  File "D:\Projects\Multilingual_Misinformation_Detection\env\lib\site-packages\traitlets\traitlets.py", line 706, in set
    obj._notify_trait(self.name, old_value, new_value)
  File "D:\Projects\Multilingual_Misinformation_Det

Map:   0%|          | 0/3204 [00:00<?, ? examples/s]

Map:   0%|          | 0/3213 [00:00<?, ? examples/s]

✓ Tokenization complete!
  Train: 14997 samples
  Val:   3204 samples
  Test:  3213 samples


In [4]:
print("\n" + "="*80)
print("🤖 MODEL INITIALIZATION")
print("="*80)

# Load the pre-trained XLM-RoBERTa checkpoint with a 2-class head:
# fake (1) vs real (0).
print(f"\n📥 Loading model: {model_name}")
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    problem_type="single_label_classification",
)
print("✓ Model loaded")

# Tally parameter counts in a single walk over the model's parameters.
total_params = 0
trainable_params = 0
for param in model.parameters():
    size = param.numel()
    total_params += size
    if param.requires_grad:
        trainable_params += size

print(f"\n📊 Model parameters:")
print(f"  Total: {total_params:,}")
print(f"  Trainable: {trainable_params:,}")

# Move model to the device selected earlier in the notebook.
model.to(device)
print(f"  Device: {device}")

# Define metrics
def compute_metrics(eval_pred):
    """Score predictions with accuracy, F1, precision and recall.

    Args:
        eval_pred: a pair that unpacks into (scores, labels). The predicted
            class for each row is the argmax of `scores` along axis 1.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 are computed with sklearn's
        `average='binary'`.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    # Tuple layout: (precision, recall, f1, support); support is unused here.
    prf = precision_recall_fscore_support(labels, predicted, average='binary')

    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

print("\n✓ Metrics function ready")



🤖 MODEL INITIALIZATION

📥 Loading model: xlm-roberta-base


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded

📊 Model parameters:
  Total: 278,045,186
  Trainable: 278,045,186
  Device: cpu

✓ Metrics function ready


In [5]:
print("\n" + "="*80)
print("⚙️ TRAINING CONFIGURATION")
print("="*80)

# Training arguments, grouped by concern and merged into one
# TrainingArguments call below.
optimization_opts = dict(
    num_train_epochs=3,
    per_device_train_batch_size=16,  # Adjust based on your GPU memory
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    fp16=torch.cuda.is_available(),  # Mixed precision training if GPU available
)
# Evaluate and checkpoint every 500 steps; keep at most 2 checkpoints and
# reload the one with the highest validation F1 when training ends.
checkpoint_opts = dict(
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
reporting_opts = dict(
    logging_dir='../models/logs',
    logging_steps=100,
    report_to="none",  # Disable wandb/tensorboard
)
training_args = TrainingArguments(
    output_dir='../models/xlm-roberta-fakenews',
    **optimization_opts,
    **checkpoint_opts,
    **reporting_opts,
)

print("\n📋 Training Configuration:")
for caption, value in (
    ("Epochs", training_args.num_train_epochs),
    ("Batch size", training_args.per_device_train_batch_size),
    ("Learning rate", training_args.learning_rate),
    ("Warmup steps", training_args.warmup_steps),
    ("Mixed precision (FP16)", training_args.fp16),
):
    print(f"  {caption}: {value}")

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize Trainer: train on the train split, evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n✓ Trainer initialized and ready!")



⚙️ TRAINING CONFIGURATION

📋 Training Configuration:
  Epochs: 3
  Batch size: 16
  Learning rate: 2e-05
  Warmup steps: 500
  Mixed precision (FP16): False

✓ Trainer initialized and ready!


In [6]:
print("\n" + "="*80)
print("🚀 STARTING TRAINING")
print("="*80)
print("\n⏰ This will take 20-40 minutes depending on your hardware...")
print("📊 Training progress:\n")

# Train!
train_result = trainer.train()

print("\n" + "="*80)
print("✅ TRAINING COMPLETE!")
print("="*80)
print(f"\n📊 Training Results:")
# `training_loss` on the object returned by Trainer.train() is the loss
# averaged over the whole run, not the value at the last step, so it is
# labelled as an average.
print(f"  Average training loss: {train_result.training_loss:.4f}")
print(f"  Training time: {train_result.metrics['train_runtime']:.2f} seconds")

# Save the model and tokenizer side by side. One path variable keeps the
# two save calls and the confirmation message in agreement.
final_model_dir = '../models/xlm-roberta-fakenews-final'
print("\n💾 Saving model...")
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print(f"✓ Model saved to: {final_model_dir}/")



🚀 STARTING TRAINING

⏰ This will take 20-40 minutes depending on your hardware...
📊 Training progress:



Step,Training Loss,Validation Loss


KeyboardInterrupt: 