In [3]:
# ===== CELL 1: MOUNT DRIVE =====
# Colab-only cell: attaches Google Drive at /content/drive, the mount point
# under which the data path used by the next cell is rooted.

from google.colab import drive
# force_remount=True is passed so that re-running this cell requests a fresh mount.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [5]:
# ===== CELL 2: Classical ML classifiers with TF-IDF features: TRAIN & EVALUATE CLASSIFIERS =====

import os
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
import torch

# ===== PATHS & CONFIG =====
# Data root on the mounted Drive; the per-language K-fold splits live below it.
base_path = '/content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data'
kfold_folder = os.path.join(base_path, 'K_Fold')
# 'Models' is a sibling of the Data directory; it is created here if missing.
models_folder = os.path.join(base_path, '../Models')
os.makedirs(models_folder, exist_ok=True)

languages = ["Bangla", "Hindi", "Malayalam", "Tamil", "Telugu"]
n_splits = 5

# Fixed seed for the estimators that use randomness, so that re-running the
# notebook reproduces the same metrics and the same "best classifier" choice.
random_seed = 42

# Device check
# Informational only: in the code shown here `device` is printed and nothing else.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

# ===== CLASSIFIERS =====
# Display name -> estimator instance. RandomForest (bootstrap / feature sampling)
# and LinearSVC (data shuffling in its solver) accept a random_state; it is pinned
# for both. The other two are left at their defaults.
classifier_dict = {
    'LinearSVC': LinearSVC(random_state=random_seed),
    'RandomForest': RandomForestClassifier(random_state=random_seed),
    'LogisticRegression': LogisticRegression(),
    'BernoulliNB': BernoulliNB()
}

print("Metrics shown: Acc = Accuracy, Prec = Precision, Rec = Recall, AvgF = Average F1 over 5 folds\n")

# ===== HELPER: LOAD K-FOLD DATA =====
def load_kfold_data(lang):
    """Read the pickled train/test splits of every fold for one language.

    Folds are numbered 1..n_splits and are read from
    ``<kfold_folder>/<lang>/fold_<k>/``, each of which must contain
    ``X_train_raw.pkl``, ``y_train.pkl``, ``X_test_raw.pkl`` and ``y_test.pkl``.

    Args:
        lang: Name of the language sub-directory inside ``kfold_folder``.

    Returns:
        A list with one ``(X_train, X_test, y_train, y_test)`` tuple per fold,
        in fold order.
    """
    def _unpickle(directory, filename):
        # NOTE: pickle.load executes arbitrary code; only use on trusted files.
        with open(os.path.join(directory, filename), 'rb') as handle:
            return pickle.load(handle)

    collected = []
    for k in range(1, n_splits + 1):
        fold_dir = os.path.join(kfold_folder, lang, f'fold_{k}')
        train_texts = _unpickle(fold_dir, 'X_train_raw.pkl')
        train_labels = _unpickle(fold_dir, 'y_train.pkl')
        test_texts = _unpickle(fold_dir, 'X_test_raw.pkl')
        test_labels = _unpickle(fold_dir, 'y_test.pkl')
        # Tuple order is (X_train, X_test, y_train, y_test), not file-read order.
        collected.append((train_texts, test_texts, train_labels, test_labels))
    return collected

# ===== MAIN LOOP =====
# For each language: evaluate every classifier on the stored folds, average the
# per-fold (accuracy, precision, recall, F1), print a table and report the
# classifier with the highest mean F1.
for lang in languages:
    print(f"\n===== TRAINING LANGUAGE: {lang} =====")
    print(f"Model: {lang} → Using 5-fold TF-IDF + Classical ML classifiers")
    folds = load_kfold_data(lang)

    # TF-IDF features depend only on the fold, not on the classifier, so they
    # are built once per fold here instead of once per (classifier, fold) pair.
    fold_features = []
    for X_train, X_test, y_train, y_test in folds:
        if len(np.unique(y_train)) < 2:
            # Single-class training fold: no model is fitted, so no features needed.
            fold_features.append(None)
        else:
            # Convert text to numeric features for scikit-learn; the vocabulary
            # is fitted on the training split only and then applied to the test split.
            vectorizer = TfidfVectorizer(max_features=5000)
            X_train_vec = vectorizer.fit_transform(X_train)
            X_test_vec = vectorizer.transform(X_test)
            fold_features.append((X_train_vec, X_test_vec))

    lang_results = {}
    for clf_name, clf_obj in classifier_dict.items():
        fold_metrics = []
        for (_, _, y_train, y_test), features in zip(folds, fold_features):
            if features is None:
                # Handle case where train has single class: predict that class
                # for every test sample. np.asarray gives positional access for
                # lists, NumPy arrays and pandas Series alike.
                value = np.asarray(y_train)[0]
                y_pred = np.full(len(y_test), fill_value=value)
            else:
                X_train_vec, X_test_vec = features
                # The same estimator instance is refit on every fold.
                clf_obj.fit(X_train_vec, y_train)
                y_pred = clf_obj.predict(X_test_vec)

            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, zero_division=0)
            rec = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            fold_metrics.append((acc, prec, rec, f1))

        # Column-wise mean -> array of [acc, prec, rec, f1] averaged over folds.
        avg_metrics = np.mean(fold_metrics, axis=0)
        lang_results[clf_name] = avg_metrics

    # ===== PRINT RESULTS =====
    table_data = []
    for clf_name, metrics in lang_results.items():
        table_data.append([
            clf_name,
            f"{metrics[0]:.4f}",
            f"{metrics[1]:.4f}",
            f"{metrics[2]:.4f}",
            f"{metrics[3]:.4f}"
        ])

    print(tabulate(table_data, headers=["Classifier", "Acc", "Prec", "Rec", "AvgF"], tablefmt="grid"))

    # Best classifier (index 3 of the averaged metrics is the mean F1)
    best_clf = max(lang_results.items(), key=lambda x: x[1][3])
    print(f"\nBest classifier: {best_clf[0]} with AvgF: {best_clf[1][3]:.4f}")
    print("(All metrics are averages over 5 folds)")


Using device: cuda
Metrics shown: Acc = Accuracy, Prec = Precision, Rec = Recall, AvgF = Average F1 over 5 folds


===== TRAINING LANGUAGE: Bangla =====
Model: Bangla → Using 5-fold TF-IDF + Classical ML classifiers
+--------------------+--------+--------+--------+--------+
| Classifier         |    Acc |   Prec |    Rec |   AvgF |
| LinearSVC          | 0.7375 | 0.7875 | 0.8602 | 0.8222 |
+--------------------+--------+--------+--------+--------+
| RandomForest       | 0.7541 | 0.7848 | 0.8981 | 0.8374 |
+--------------------+--------+--------+--------+--------+
| LogisticRegression | 0.7307 | 0.745  | 0.9408 | 0.8314 |
+--------------------+--------+--------+--------+--------+
| BernoulliNB        | 0.7525 | 0.7866 | 0.8935 | 0.8361 |
+--------------------+--------+--------+--------+--------+

Best classifier: RandomForest with AvgF: 0.8374
(All metrics are averages over 5 folds)

===== TRAINING LANGUAGE: Hindi =====
Model: Hindi → Using 5-fold TF-IDF + Classical ML classifiers
+----