In [3]:
from google.colab import drive
drive.mount('/content/drive')  # Mount Drive


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# ===== CELL 2: FAST AUG_MURIL TRAINING FOR HINDI =====
!pip install -q --upgrade transformers datasets scikit-learn

import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ===== DEVICE =====
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("✅ Using device:", device)

# ===== PATHS =====
data_path = "/content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Augmented_Data_Split/Hindi/aug_train.csv"
df = pd.read_csv(data_path)

# ===== PREP DATA =====
train_df = df.sample(frac=0.8, random_state=42)
test_df  = df.drop(train_df.index)

train_dataset = Dataset.from_pandas(train_df[['Input Sentences','Grammatical Error']].rename(columns={'Input Sentences':'text','Grammatical Error':'label'}))
test_dataset  = Dataset.from_pandas(test_df[['Input Sentences','Grammatical Error']].rename(columns={'Input Sentences':'text','Grammatical Error':'label'}))

# ===== MODEL & TOKENIZER =====
model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=128)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# ===== TOKENIZE =====
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding=True, max_length=128)

# Fast mapping
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=128, num_proc=1)
test_dataset  = test_dataset.map(tokenize, batched=True, batch_size=128, num_proc=1)

# ===== METRICS =====
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.argmax(-1)
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, zero_division=0)
    rec = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}

# ===== TRAINER =====
training_args = TrainingArguments(
    output_dir="/tmp/aug_muril_hindi",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_strategy="no",
    save_strategy="no",
    disable_tqdm=True,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ===== TRAIN =====
trainer.train()

# ===== EVALUATE =====
metrics = trainer.evaluate()
num_train_err = train_df['Grammatical Error'].sum()
num_test_err  = test_df['Grammatical Error'].sum()

print(f"\n✅ Epochs used: {training_args.num_train_epochs}")
print(f"✅ Sentences with errors -> Train: {num_train_err}, Test: {num_test_err}")
print(f"✅ Metrics -> Accuracy: {metrics['eval_accuracy']:.4f}, Precision: {metrics['eval_precision']:.4f}, Recall: {metrics['eval_recall']:.4f}, F1: {metrics['eval_f1']:.4f}")


✅ Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=1):   0%|          | 0/6400 [00:00<?, ? examples/s]

Map (num_proc=1):   0%|          | 0/1600 [00:00<?, ? examples/s]

  trainer = Trainer(


{'train_runtime': 135.6734, 'train_samples_per_second': 47.172, 'train_steps_per_second': 1.474, 'train_loss': 0.6717963409423828, 'epoch': 1.0}
{'eval_loss': 0.6436178684234619, 'eval_accuracy': 0.73, 'eval_precision': 0.8388278388278388, 'eval_recall': 0.571072319201995, 'eval_f1': 0.6795252225519288, 'eval_runtime': 10.4632, 'eval_samples_per_second': 152.917, 'eval_steps_per_second': 4.779, 'epoch': 1.0}

✅ Epochs used: 1
✅ Sentences with errors -> Train: 3221, Test: 802
✅ Metrics -> Accuracy: 0.7300, Precision: 0.8388, Recall: 0.5711, F1: 0.6795
