In [None]:
# ===== CELL 1: MOUNT DRIVE =====

from google.colab import drive

# Location inside the Colab VM where Google Drive is attached.
DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True re-attaches the Drive even when it is already mounted.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [3]:
"""
===========================================================================
MURIL SMALL DATASET TRAINING - BEGINNER FRIENDLY
===========================================================================

1. Uses the HuggingFace google/muril-base-cased model for sequence classification.
2. Loads the 5-fold train/test data for each language and merges it into one train and one test DataFrame.

3. Tokenizes the text with the MuRIL tokenizer.
4. Fine-tunes the model, including its randomly initialized classification layer, on the merged training data.
5. Predicts on the merged test set and prints Accuracy, Precision, Recall, and F1.


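Results after 1 epoch:

Bangla  -> Accuracy: 0.7057, Precision: 0.7057, Recall: 1.0000, F1: 0.8275
Hindi   -> Accuracy: 0.9098, Precision: 0.9098, Recall: 1.0000, F1: 0.9528
Malayalam -> Accuracy: 0.9933, Precision: 0.9933, Recall: 1.0000, F1: 0.9967
Tamil   -> Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1: 1.0000
Telugu  -> Accuracy: 0.9215, Precision: 0.9215, Recall: 1.0000, F1: 0.9592

NOTE: Recall is 1.0 and Precision equals Accuracy for every language. For Bangla,
Hindi, Malayalam and Telugu (Precision < 1) this can only happen when every test
sample is predicted as the positive class, so these numbers reflect the share of
positive labels in the test data rather than learned performance. The Tamil row is
consistent with the same behaviour on a test set that contains only positive labels.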

"""

# ===== CELL 2: FULL DATA TRAINING  =====
# Quietly install/upgrade the third-party libraries imported below
# (transformers, datasets, scikit-learn). No versions are pinned, so a later
# run may pick up newer releases than the ones that produced the saved output.
!pip install -q --upgrade transformers datasets scikit-learn

import gc
import os
import pickle
import warnings

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TextClassificationPipeline
from transformers import DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ===== DEVICE =====
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

# ===== PATHS =====
# Root of the shared-task data on the mounted Drive; the per-language fold
# directories are read from <base_path>/K_Fold/<language>/fold_<k>.
base_path = '/content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data'
kfold_folder = os.path.join(base_path, 'K_Fold')

languages = ["Bangla", "Hindi", "Malayalam", "Tamil", "Telugu"]
n_splits = 5    # number of fold_<k> directories read per language
num_epochs = 1  # change if you want more

# ===== MODEL & TOKENIZER =====
model_name = "google/muril-base-cased"
# `model_max_length` is the tokenizer attribute that caps the sequence length.
# The previously used `max_length=` keyword is a per-call argument, not a
# tokenizer-construction option, so it did not configure the limit here.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=128)

# ===== HELPER FUNCTIONS =====
def load_all_folds(lang, folds_root=None, num_folds=None):
    """Load every fold of one language and merge them into train/test frames.

    For each fold ``k`` in ``1..num_folds`` the four pickles
    ``X_train_raw.pkl``, ``y_train.pkl``, ``X_test_raw.pkl`` and ``y_test.pkl``
    are read from ``<folds_root>/<lang>/fold_<k>`` and appended, in fold order,
    to the corresponding merged list.

    Args:
        lang: Name of the language sub-directory to read.
        folds_root: Directory that contains the per-language fold folders.
            Defaults to the module-level ``kfold_folder``.
        num_folds: How many ``fold_<k>`` directories to read.
            Defaults to the module-level ``n_splits``.

    Returns:
        ``(train_df, test_df)``: two DataFrames with the columns ``text`` and
        ``label``.

    Warns:
        UserWarning: when at least one test text also occurs in the merged
            training data. Merging the folds of a K-fold split can cause this,
            and metrics computed on such a test set overstate generalisation.
    """
    root = kfold_folder if folds_root is None else folds_root
    fold_count = n_splits if num_folds is None else num_folds

    # One accumulator per pickle file; dict order is the order the files are read.
    merged = {
        'X_train_raw.pkl': [],
        'y_train.pkl': [],
        'X_test_raw.pkl': [],
        'y_test.pkl': [],
    }
    for fold_num in range(1, fold_count + 1):
        fold_path = os.path.join(root, lang, f'fold_{fold_num}')
        for file_name, bucket in merged.items():
            # NOTE(security): pickle.load can execute arbitrary code, so these
            # files must come from a trusted source.
            with open(os.path.join(fold_path, file_name), 'rb') as f:
                bucket.extend(pickle.load(f))

    train_df = pd.DataFrame({'text': merged['X_train_raw.pkl'], 'label': merged['y_train.pkl']})
    test_df = pd.DataFrame({'text': merged['X_test_raw.pkl'], 'label': merged['y_test.pkl']})

    # Surface train/test overlap instead of silently evaluating on seen data.
    try:
        overlap = set(train_df['text']) & set(test_df['text'])
    except TypeError:
        # Unhashable text entries: the overlap check cannot be performed.
        overlap = set()
    if overlap:
        warnings.warn(
            f"{lang}: {len(overlap)} distinct test texts also appear in the merged "
            "training data; evaluation on this test set is affected by data leakage.",
            UserWarning,
            stacklevel=2,
        )

    return train_df, test_df

def tokenize_data(data):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated to at most 128 tokens and padded to the longest
    sequence in the batch that is passed in.

    Args:
        data: Mapping with a ``text`` entry holding the raw strings.

    Returns:
        The tokenizer's output for ``data['text']``.
    """
    tokenizer_options = dict(truncation=True, padding=True, max_length=128)
    encoded = tokenizer(data['text'], **tokenizer_options)
    return encoded

def compute_metrics(preds, labels):
    """Score predictions against the reference labels.

    Args:
        preds: Predicted class ids.
        labels: Reference class ids, in the same order as ``preds``.

    Returns:
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are
        computed with ``zero_division=0``, so an undefined score is reported
        as 0 instead of raising a warning.
    """
    acc = accuracy_score(labels, preds)
    # The remaining three scorers share the same call signature and options.
    prec, rec, f1 = (
        scorer(labels, preds, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return acc, prec, rec, f1

# ===== MAIN LOOP =====
# For each language: load the merged folds, fine-tune a fresh MuRIL classifier
# on the training frame, then score its predictions on the test frame.
for lang in languages:
    print(f"\n===== LANGUAGE: {lang} =====")

    # Release the previous language's model, trainer and pipeline before the
    # next model is loaded, so two models never sit on the GPU at once. The
    # objects of the last language stay available after the loop.
    model = trainer = pipe = None
    gc.collect()
    if device == 'cuda':
        torch.cuda.empty_cache()

    train_df, test_df = load_all_folds(lang)

    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    train_tokenized = train_dataset.map(tokenize_data, batched=True)
    test_tokenized = test_dataset.map(tokenize_data, batched=True)

    # A new model per language: the classification head starts from random weights.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    # ===== CLEAN TRAINING ARGS =====
    training_args = TrainingArguments(
        output_dir="/tmp/muril_full",
        overwrite_output_dir=True,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_strategy="epoch",   # log only after each epoch
        save_strategy="no",         # do not save checkpoints
        report_to=[],               # disable wandb
        disable_tqdm=True           # remove progress bars
    )

    # The collator is passed explicitly instead of via the deprecated
    # `tokenizer=` argument. It pads every training batch to a common length,
    # which is required because the mapped dataset is padded per map-batch only.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print(f"Training {lang} for {num_epochs} epoch(s)...")
    trainer.train()

    # ===== EVALUATION =====
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0 if device=='cuda' else -1)
    # Truncate exactly as during training (128 tokens); without it a long text
    # is fed to the model untruncated and can exceed its maximum input length.
    predictions = pipe(test_df['text'].tolist(), truncation=True, max_length=128)
    # Map the pipeline's label names back to class ids through the model config
    # rather than parsing the "LABEL_<id>" string.
    label2id = model.config.label2id
    pred_labels = [int(label2id[pred['label']]) for pred in predictions]

    # Sanity check: a classifier that always outputs the same class still
    # produces a high Recall/F1 when that class dominates the test set.
    if len(set(pred_labels)) == 1:
        print(f"WARNING: every {lang} test sample was predicted as class {pred_labels[0]}; "
              "the metrics below only reflect the label distribution.")

    acc, prec, rec, f1 = compute_metrics(pred_labels, test_df['label'].tolist())
    print(f"Metrics -> Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

print("\nAll languages processed. Training complete!")





Using device: cuda

===== LANGUAGE: Bangla =====


Map:   0%|          | 0/2392 [00:00<?, ? examples/s]

Map:   0%|          | 0/598 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training Bangla for 1 epoch(s)...


Device set to use cuda:0


{'loss': 0.6178, 'grad_norm': 0.33903831243515015, 'learning_rate': 1.3333333333333336e-07, 'epoch': 1.0}
{'train_runtime': 58.6711, 'train_samples_per_second': 40.77, 'train_steps_per_second': 2.557, 'train_loss': 0.6178435262044271, 'epoch': 1.0}
Metrics -> Accuracy: 0.7057, Precision: 0.7057, Recall: 1.0000, F1: 0.8275

===== LANGUAGE: Hindi =====


Map:   0%|          | 0/2396 [00:00<?, ? examples/s]

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training Hindi for 1 epoch(s)...


Device set to use cuda:0


{'loss': 0.5573, 'grad_norm': 0.9824178814888, 'learning_rate': 1.3333333333333336e-07, 'epoch': 1.0}
{'train_runtime': 57.4115, 'train_samples_per_second': 41.734, 'train_steps_per_second': 2.613, 'train_loss': 0.5573468017578125, 'epoch': 1.0}
Metrics -> Accuracy: 0.9098, Precision: 0.9098, Recall: 1.0000, F1: 0.9528

===== LANGUAGE: Malayalam =====


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training Malayalam for 1 epoch(s)...


Device set to use cuda:0


{'loss': 0.6043, 'grad_norm': 0.9457244873046875, 'learning_rate': 2.666666666666667e-07, 'epoch': 1.0}
{'train_runtime': 16.3416, 'train_samples_per_second': 73.432, 'train_steps_per_second': 4.59, 'train_loss': 0.604272206624349, 'epoch': 1.0}
Metrics -> Accuracy: 0.9933, Precision: 0.9933, Recall: 1.0000, F1: 0.9967

===== LANGUAGE: Tamil =====


Map:   0%|          | 0/364 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training Tamil for 1 epoch(s)...


Device set to use cuda:0


{'loss': 0.677, 'grad_norm': 0.8847945332527161, 'learning_rate': 8.695652173913044e-07, 'epoch': 1.0}
{'train_runtime': 4.1684, 'train_samples_per_second': 87.324, 'train_steps_per_second': 5.518, 'train_loss': 0.6769596597422725, 'epoch': 1.0}
Metrics -> Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1: 1.0000

===== LANGUAGE: Telugu =====


Map:   0%|          | 0/2396 [00:00<?, ? examples/s]

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training Telugu for 1 epoch(s)...


Device set to use cuda:0


{'loss': 0.5514, 'grad_norm': 0.747632622718811, 'learning_rate': 1.3333333333333336e-07, 'epoch': 1.0}
{'train_runtime': 37.4276, 'train_samples_per_second': 64.017, 'train_steps_per_second': 4.008, 'train_loss': 0.5514030456542969, 'epoch': 1.0}
Metrics -> Accuracy: 0.9215, Precision: 0.9215, Recall: 1.0000, F1: 0.9592

All languages processed. Training complete!
