In [5]:
# ===== CELL 1: MOUNT DRIVE =====

from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [6]:
"""
===========================================================================
CODE EXPLAINED
===========================================================================

1. Install libraries, mount Google Drive, import data files for all languages.
2. Check device: GPU if available, else CPU.
3. Load 5-fold train/test data for each language, merge into one DataFrame.
4. Tokenize text using BERT tokenizer, prepare for model input.
5. Load BERT classification model, define training arguments, train with Trainer.
6. Predict on test set, compute and print metrics (Accuracy, Precision, Recall, F1).
Example output for small subset :

Bangla -> Accuracy: 0.9097, Precision: 0.9718, Recall: 0.8981, F1: 0.9335
Hindi -> Accuracy: 0.9316, Precision: 0.9390, Recall: 0.9890, F1: 0.9634
Malayalam -> Accuracy: 0.9933, Precision: 0.9933, Recall: 1.0000, F1: 0.9967
Tamil -> Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1: 1.0000
Telugu -> Accuracy: 0.9215, Precision: 0.9215, Recall: 1.0000, F1: 0.9592


===========================================================================
BERT-BASE-UNCASED FULL DATASET TRAINING
===========================================================================
1. Uses HuggingFace bert-base-uncased for sequence classification.
2. Loads 5-fold train/test data for each language and merges into one DataFrame.
3. Tokenizes text using BERT tokenizer.
4. Trains classification layer (random initialized) on full dataset.
5. Predicts on test set and prints metrics for each language:
   - Accuracy, Precision, Recall, F1
"""
# ===== IMPORTS =====
import os
import pickle
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TextClassificationPipeline
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

warnings.filterwarnings("ignore", category=UserWarning)  # ignore HuggingFace warnings

# ===== DEVICE =====
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

# ===== PATHS & VARIABLES =====
base_path = '/content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data'
kfold_folder = os.path.join(base_path, 'K_Fold')

languages = ["Bangla", "Hindi", "Malayalam", "Tamil", "Telugu"]
n_splits = 5
num_epochs = 2  # reduced from 3 to 2 for faster training
model_name = "bert-base-uncased"

# ===== HELPER FUNCTIONS =====
def load_all_folds(lang):
    X_train_list, y_train_list, X_test_list, y_test_list = [], [], [], []
    for fold_num in range(1, n_splits+1):
        fold_path = os.path.join(kfold_folder, lang, f'fold_{fold_num}')
        with open(os.path.join(fold_path, 'X_train_raw.pkl'), 'rb') as f:
            X_train_list.extend(pickle.load(f))
        with open(os.path.join(fold_path, 'y_train.pkl'), 'rb') as f:
            y_train_list.extend(pickle.load(f))
        with open(os.path.join(fold_path, 'X_test_raw.pkl'), 'rb') as f:
            X_test_list.extend(pickle.load(f))
        with open(os.path.join(fold_path, 'y_test.pkl'), 'rb') as f:
            y_test_list.extend(pickle.load(f))
    train_df = pd.DataFrame({'text': X_train_list, 'label': y_train_list})
    test_df = pd.DataFrame({'text': X_test_list, 'label': y_test_list})
    return train_df, test_df

def tokenize_data(data):
    return tokenizer(data['text'], truncation=True, padding=True, max_length=128)

def compute_metrics(preds, labels):
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, zero_division=0)
    rec = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)
    return acc, prec, rec, f1

# ===== MODEL & TOKENIZER =====
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=128)

# ===== TRAINING LOOP =====
for lang in languages:
    train_df, test_df = load_all_folds(lang)

    print(f"\n===== TRAINING LANGUAGE: {lang} =====")
    print(f"ðŸ“˜ {lang}: Train={len(train_df)}, Test={len(test_df)}, Epochs={num_epochs}, Device={device.upper()}")

    # Convert to HuggingFace Dataset
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    # Tokenize
    train_tokenized = train_dataset.map(tokenize_data, batched=True)
    test_tokenized = test_dataset.map(tokenize_data, batched=True)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"/tmp/{lang}_full",
        overwrite_output_dir=True,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_strategy="no",    # hide step logs
        disable_tqdm=True,        # hide progress bars
        report_to=[]              # disables wandb
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        tokenizer=tokenizer
    )

    # Train
    trainer.train()

    # Prediction pipeline
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0 if device=='cuda' else -1)
    predictions = pipe(test_df['text'].tolist())
    pred_labels = [int(pred['label'].split('_')[1]) for pred in predictions]

    # Compute metrics
    acc, prec, rec, f1 = compute_metrics(pred_labels, test_df['label'].tolist())
    print(f"âœ… {lang} Results â†’ Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

print("\nAll languages processed successfully on FULL dataset using GPU if available!")



Using device: cuda

===== TRAINING LANGUAGE: Bangla =====
ðŸ“˜ Bangla: Train=2392, Test=598, Epochs=2, Device=CUDA


Map:   0%|          | 0/2392 [00:00<?, ? examples/s]

Map:   0%|          | 0/598 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Device set to use cuda:0


{'train_runtime': 120.5454, 'train_samples_per_second': 39.686, 'train_steps_per_second': 2.489, 'train_loss': 0.39114484151204426, 'epoch': 2.0}
âœ… Bangla Results â†’ Accuracy: 0.9097, Precision: 0.9718, Recall: 0.8981, F1: 0.9335

===== TRAINING LANGUAGE: Hindi =====
ðŸ“˜ Hindi: Train=2396, Test=599, Epochs=2, Device=CUDA


Map:   0%|          | 0/2396 [00:00<?, ? examples/s]

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Device set to use cuda:0


{'train_runtime': 123.6214, 'train_samples_per_second': 38.764, 'train_steps_per_second': 2.427, 'train_loss': 0.2704199727376302, 'epoch': 2.0}
âœ… Hindi Results â†’ Accuracy: 0.9316, Precision: 0.9390, Recall: 0.9890, F1: 0.9634

===== TRAINING LANGUAGE: Malayalam =====
ðŸ“˜ Malayalam: Train=1200, Test=300, Epochs=2, Device=CUDA


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Device set to use cuda:0


{'train_runtime': 43.1593, 'train_samples_per_second': 55.608, 'train_steps_per_second': 3.475, 'train_loss': 0.04533860842386882, 'epoch': 2.0}
âœ… Malayalam Results â†’ Accuracy: 0.9933, Precision: 0.9933, Recall: 1.0000, F1: 0.9967

===== TRAINING LANGUAGE: Tamil =====
ðŸ“˜ Tamil: Train=364, Test=91, Epochs=2, Device=CUDA


Map:   0%|          | 0/364 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Device set to use cuda:0


{'train_runtime': 39.1122, 'train_samples_per_second': 18.613, 'train_steps_per_second': 1.176, 'train_loss': 0.04916486014490542, 'epoch': 2.0}
âœ… Tamil Results â†’ Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1: 1.0000

===== TRAINING LANGUAGE: Telugu =====
ðŸ“˜ Telugu: Train=2396, Test=599, Epochs=2, Device=CUDA


Map:   0%|          | 0/2396 [00:00<?, ? examples/s]

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Device set to use cuda:0


{'train_runtime': 55.1886, 'train_samples_per_second': 86.83, 'train_steps_per_second': 5.436, 'train_loss': 0.2552701314290365, 'epoch': 2.0}
âœ… Telugu Results â†’ Accuracy: 0.9215, Precision: 0.9215, Recall: 1.0000, F1: 0.9592

All languages processed successfully on FULL dataset using GPU if available!
