In [None]:
# ===== CONFIGURATION =====
import os

# Toggle: True when executing on Google Colab, False for a local checkout
USE_COLAB = True

if not USE_COLAB:
    # Local run: the notebook is assumed to live inside the Data/ folder
    base_path = os.getcwd()
else:
    # Colab run: (re)mount Google Drive and point at the shared-task data
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    base_path = '/content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data'

# Experiment settings
languages = ["Bangla", "Hindi", "Malayalam", "Tamil", "Telugu"]
n_splits = 5             # number of stratified folds
word_ngram = (1, 2)      # n-gram range for word-level features
char_ngram = (2, 6)      # n-gram range for char-level features

# Input (TSV) and output (K-Fold) folders under the base path;
# the output folder is created if it does not exist yet.
tsv_folder, kfold_folder = (
    os.path.join(base_path, name) for name in ('Tsv', 'K_Fold')
)
os.makedirs(kfold_folder, exist_ok=True)

# Echo the resolved paths so they can be checked by eye
print("Base Path:", base_path)
print("TSV folder:", tsv_folder)
print("K-Fold folder:", kfold_folder)

Mounted at /content/drive
Base Path: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data
TSV folder: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Tsv
K-Fold folder: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/K_Fold


In [None]:

import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold

data_dict = {}  # language name -> DataFrame read from that language's TSV

for lang in languages:
    tsv_path = os.path.join(tsv_folder, f'{lang}_labels.tsv')

    # Guard clause: report a missing file and move on to the next language.
    if not os.path.exists(tsv_path):
        print(f"{lang} TSV not found!\n")
        continue

    df = pd.read_csv(tsv_path, sep='\t')
    data_dict[lang] = df
    print(f"{lang} TSV loaded: {len(df)} rows, columns: {list(df.columns)}")

    # Label distribution; a label that never occurs is reported as 0.
    counts = df['label'].value_counts()
    n_0, n_1 = (counts.get(label, 0) for label in (0, 1))
    print(f"{lang} TSV -> Total: {len(df)}, Label 0: {n_0}, Label 1: {n_1}")
    print()


Bangla TSV loaded: 598 rows, columns: ['text', 'label']
Bangla TSV -> Total: 598, Label 0: 176, Label 1: 422

Hindi TSV loaded: 599 rows, columns: ['text', 'label']
Hindi TSV -> Total: 599, Label 0: 54, Label 1: 545

Malayalam TSV loaded: 300 rows, columns: ['text', 'label']
Malayalam TSV -> Total: 300, Label 0: 2, Label 1: 298

Tamil TSV loaded: 91 rows, columns: ['text', 'label']
Tamil TSV -> Total: 91, Label 0: 0, Label 1: 91

Telugu TSV loaded: 599 rows, columns: ['text', 'label']
Telugu TSV -> Total: 599, Label 0: 47, Label 1: 552



In [None]:
def _dump_pickle(obj, path):
    """Pickle `obj` to the file at `path` (binary mode), closing the file afterwards."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# For every loaded language: build shuffled stratified folds and write, per fold,
# the raw text, word/char n-gram count matrices, the fitted vectorizers and the labels
# under <kfold_folder>/<lang>/fold_<k>/.
for lang, df in data_dict.items():
    # Missing sentences become empty strings so the vectorizers always receive text.
    # Filling once here is equivalent to filling each train/test slice separately.
    X = df['text'].fillna('')
    y = df['label']

    # Surface degenerate label distributions explicitly instead of relying on a
    # library warning: the splits are still produced, but stratification cannot
    # place a rare class in every fold.
    class_counts = y.value_counts()
    if len(class_counts) < 2:
        print(f"WARNING: {lang} has fewer than two label classes; "
              f"stratification has no effect for this language.")
    elif class_counts.min() < n_splits:
        print(f"WARNING: {lang} least-populated class has only {class_counts.min()} "
              f"sample(s) (< n_splits={n_splits}); some test folds will not contain it.")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    lang_kfold_folder = os.path.join(kfold_folder, lang)
    os.makedirs(lang_kfold_folder, exist_ok=True)

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        fold_folder = os.path.join(lang_kfold_folder, f'fold_{fold}')
        os.makedirs(fold_folder, exist_ok=True)

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Save raw text first so it is available even if vectorization fails below.
        _dump_pickle(X_train, os.path.join(fold_folder, 'X_train_raw.pkl'))
        _dump_pickle(X_test, os.path.join(fold_folder, 'X_test_raw.pkl'))

        # Word-level n-gram counts; the vocabulary is fitted on the training split only.
        # NOTE(review): the default word token_pattern is built on `\w`, which in
        # Python's `re` does not match combining marks (e.g. Indic vowel signs). If the
        # texts are in native script, words may be fragmented or dropped -- confirm and
        # consider token_pattern=r'(?u)\S+'.
        word_vectorizer = CountVectorizer(analyzer='word', ngram_range=word_ngram)
        X_train_word = word_vectorizer.fit_transform(X_train)
        X_test_word = word_vectorizer.transform(X_test)

        # Char-level n-gram counts, likewise fitted on the training split only.
        char_vectorizer = CountVectorizer(analyzer='char', ngram_range=char_ngram)
        X_train_char = char_vectorizer.fit_transform(X_train)
        X_test_char = char_vectorizer.transform(X_test)

        # File name -> object. The fitted vectorizers are stored as well so the
        # matrix columns can be mapped back to n-grams and new text can be
        # transformed consistently with each fold.
        artifacts = {
            'X_train_word.pkl': X_train_word,
            'X_test_word.pkl': X_test_word,
            'X_train_char.pkl': X_train_char,
            'X_test_char.pkl': X_test_char,
            'y_train.pkl': y_train,
            'y_test.pkl': y_test,
            'word_vectorizer.pkl': word_vectorizer,
            'char_vectorizer.pkl': char_vectorizer,
        }
        for filename, obj in artifacts.items():
            _dump_pickle(obj, os.path.join(fold_folder, filename))

        print(f"Saved fold {fold} for {lang}")

    # Print a blank line after finishing all folds for this language
    print()


Saved fold 1 for Bangla
Saved fold 2 for Bangla
Saved fold 3 for Bangla
Saved fold 4 for Bangla
Saved fold 5 for Bangla

Saved fold 1 for Hindi
Saved fold 2 for Hindi
Saved fold 3 for Hindi
Saved fold 4 for Hindi
Saved fold 5 for Hindi





Saved fold 1 for Malayalam
Saved fold 2 for Malayalam
Saved fold 3 for Malayalam
Saved fold 4 for Malayalam
Saved fold 5 for Malayalam

Saved fold 1 for Tamil
Saved fold 2 for Tamil
Saved fold 3 for Tamil
Saved fold 4 for Tamil
Saved fold 5 for Tamil

Saved fold 1 for Telugu
Saved fold 2 for Telugu
Saved fold 3 for Telugu
Saved fold 4 for Telugu
Saved fold 5 for Telugu

