In [13]:
# ===== CONFIGURATION =====
import os
import pandas as pd

# Toggle: flip to True when the notebook is executed on Google Colab.
USE_COLAB = False

if not USE_COLAB:
    # Local run: the current working directory is the data root
    # (the notebook is expected to live inside the 'Data/' folder).
    base_path = os.getcwd()
else:
    # Colab run: mount Google Drive first, then point at the shared-task
    # data folder (edit the path to match your own Drive layout).
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    base_path = '/content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data'

# Languages processed by the cells below
languages = ["Bangla", "Hindi", "Malayalam", "Tamil", "Telugu"]

# Derived locations
# IndicGEC2025 is looked up three directory levels above base_path
# (adjust the number of parent hops if your layout differs).
indic_path = os.path.join(base_path, os.pardir, os.pardir, os.pardir, 'IndicGEC2025')

# Folder that receives the generated TSV files; created if it does not exist yet.
tsv_folder = os.path.join(base_path, "Tsv")
os.makedirs(tsv_folder, exist_ok=True)

# Echo the resolved paths so a wrong layout is visible immediately.
print(f"Base Path: {base_path}")
print(f"TSV folder: {tsv_folder}")
print(f"IndicGEC folder: {indic_path}")


Base Path: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data
TSV folder: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Tsv
IndicGEC folder: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/../../../IndicGEC2025


In [14]:
import pandas as pd

# NOTE: `languages`, `indic_path` and `tsv_folder` come from the configuration
# cell; the language list is intentionally not redefined here so the two
# cells cannot drift apart.

# Column names read from each language's train.csv
INPUT_COL = 'Input sentence'
OUTPUT_COL = 'Output sentence'


def build_label_frame(df):
    """Convert a train.csv frame into a two-column (text, label) frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Input sentence' and 'Output sentence' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' (the input sentence) and 'label'
        (0 = input equals output, 1 = input differs from output).
        Rows where either sentence is missing are left out, because a
        comparison against NaN is always "different" (even NaN vs NaN) and
        would silently produce label 1 with an empty text.

    Raises
    ------
    KeyError
        If one of the required columns is absent; the message lists the
        columns that were actually found.
    """
    missing_cols = [col for col in (INPUT_COL, OUTPUT_COL) if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"Missing expected column(s) {missing_cols}; found {list(df.columns)}"
        )

    # Only rows with both sentences present can be labelled meaningfully.
    complete = df.dropna(subset=[INPUT_COL, OUTPUT_COL])
    label = (complete[INPUT_COL] != complete[OUTPUT_COL]).astype(int)
    return pd.DataFrame({'text': complete[INPUT_COL], 'label': label})


for lang in languages:
    train_file = os.path.join(indic_path, lang, "train.csv")
    if not os.path.exists(train_file):
        print(f"Train file not found: {train_file}")
        continue

    df = pd.read_csv(train_file)
    df_final = build_label_frame(df)

    # Make any data loss visible instead of silently changing row counts.
    n_dropped = len(df) - len(df_final)
    if n_dropped:
        print(f"{lang}: skipped {n_dropped} row(s) with a missing input or output sentence")

    # Save TSV (header row kept, index column omitted)
    tsv_file = os.path.join(tsv_folder, f"{lang}_labels.tsv")
    df_final.to_csv(tsv_file, sep='\t', index=False)
    print(f"TSV created for {lang}: {tsv_file}")


TSV created for Bangla: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Tsv/Bangla_labels.tsv
TSV created for Hindi: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Tsv/Hindi_labels.tsv
TSV created for Malayalam: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Tsv/Malayalam_labels.tsv
TSV created for Tamil: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Tsv/Tamil_labels.tsv
TSV created for Telugu: /content/drive/MyDrive/svnit_shared_task/shared_task/bhasha-workshop/Task1_I/Data/Tsv/Telugu_labels.tsv


In [19]:
import pandas as pd
import os

# Loaded label frames, keyed by language name
data_dict = {}

for lang in languages:
    tsv_file = os.path.join(tsv_folder, f'{lang}_labels.tsv')

    # Guard clause: report the missing file and move on to the next language.
    if not os.path.exists(tsv_file):
        print(f"{lang} TSV not found!\n")
        continue

    df = pd.read_csv(tsv_file, sep='\t')
    data_dict[lang] = df
    print(f"{lang} TSV loaded: {len(df)} rows, columns: {list(df.columns)}")

    # Label distribution; a label value that never occurs counts as 0.
    label_counts = df['label'].value_counts()
    n_0, n_1 = label_counts.get(0, 0), label_counts.get(1, 0)

    # The doubled newline leaves one blank line between languages.
    print(f"{lang} TSV -> Total: {len(df)}, Label 0: {n_0}, Label 1: {n_1}", end="\n\n")


Bangla TSV loaded: 598 rows, columns: ['text', 'label']
Bangla TSV -> Total: 598, Label 0: 176, Label 1: 422

Hindi TSV loaded: 599 rows, columns: ['text', 'label']
Hindi TSV -> Total: 599, Label 0: 54, Label 1: 545

Malayalam TSV loaded: 300 rows, columns: ['text', 'label']
Malayalam TSV -> Total: 300, Label 0: 2, Label 1: 298

Tamil TSV loaded: 91 rows, columns: ['text', 'label']
Tamil TSV -> Total: 91, Label 0: 0, Label 1: 91

Telugu TSV loaded: 599 rows, columns: ['text', 'label']
Telugu TSV -> Total: 599, Label 0: 47, Label 1: 552

