In [1]:
# --- 1. Setup and Configuration ---

# Install necessary libraries
!pip install transformers pandas scikit-learn accelerate -U
!pip install torch torchvision torchaudio


Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m132.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m156.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, pandas
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
   

In [2]:

# Import necessary libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datetime import datetime
import os
import json
import zipfile
import re
from google.colab import drive
from sklearn.preprocessing import LabelEncoder

# Mount Google Drive to access your dataset files
if 'google.colab' in str(get_ipython()):
    drive.mount('/content/drive')


Mounted at /content/drive


In [3]:

# --- CONFIGURATION ---
# ⚠️ UPDATE THESE CONFIGURATION PARAMETERS ⚠️

# File Paths (Change these to your specific Google Drive paths)
TRAIN_FILE_PATH = "/content/drive/MyDrive/Tamil/tamil_offensive_full_train.csv"
# NEW: Path for your separate validation dataset
VAL_FILE_PATH = "/content/drive/MyDrive/Tamil/tamil_offensive_full_dev.csv"
TEST_FILE_PATH = "/content/drive/MyDrive/Tamil/tamil_offensive_test_without_labels.csv"

# Column Names (Change these to match your CSV files)
TEXT_COLUMN = 'Text'     # Your column name for text
LABEL_COLUMN = 'Labels'  # Your column name for labels (in train/val set)
ID_COLUMN = 'ID'         # Your column name for unique identifier in the test set

# Model Hyperparameters
NUM_CLASSES = 6         # UPDATE THIS to your actual number of unique labels
HIDDEN_DIM = 128
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

# Set Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define Output Directory
timestamp = datetime.now().strftime("%d_%b_%H_%M_%S")
output_dir_name = f"/content/drive/MyDrive/outputs/{timestamp}_IndicBERT_LSTM_CNN_Tamil_Separate_Val"
os.makedirs(output_dir_name, exist_ok=True)
print(f"Output will be saved to: {output_dir_name}")

Using device: cuda
Output will be saved to: /content/drive/MyDrive/outputs/02_Nov_10_25_22_IndicBERT_LSTM_CNN_Tamil_Separate_Val


In [5]:
from huggingface_hub import login
login()  # paste your hf_... token when asked


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# --- 2. Data Loading, Preprocessing, and Encoding ---

# Load Tokenizer and IndicBERT Model
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indic_bert_model = AutoModel.from_pretrained("ai4bharat/indic-bert")

# Load DataFrames
try:
    train_df = pd.read_csv(TRAIN_FILE_PATH)
    val_df = pd.read_csv(VAL_FILE_PATH) # Load separate validation data
    test_df = pd.read_csv(TEST_FILE_PATH)
    print(f"Train samples loaded: {len(train_df)}")
    print(f"Validation samples loaded: {len(val_df)}")
    print(f"Test samples loaded: {len(test_df)}")
except FileNotFoundError as e:
    print(f"Error loading files. Check paths in Cell 1: {e}")
    raise

# **2.1. Text Cleaning Function**
def preprocess_text(text):
    if pd.isna(text) or text is None:
        return ""


    # Keep only letters, spaces, and Indic characters (Tamil U+0B80 to U+0BFF)
    text = re.sub(r'[^a-zA-Z\s\u0900-\u097F\u0B80-\u0BFF]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to all datasets
train_df[TEXT_COLUMN] = train_df[TEXT_COLUMN].apply(preprocess_text)
val_df[TEXT_COLUMN] = val_df[TEXT_COLUMN].apply(preprocess_text)
test_df[TEXT_COLUMN] = test_df[TEXT_COLUMN].apply(preprocess_text)

# **2.2. Label Encoding**
le = LabelEncoder()

# Fit only on training labels and transform all (train and val)
train_df['label'] = le.fit_transform(train_df[LABEL_COLUMN])
val_df['label'] = le.transform(val_df[LABEL_COLUMN]) # Use transform, NOT fit_transform
num_unique_labels = len(le.classes_)

# Check/Update NUM_CLASSES
if num_unique_labels != NUM_CLASSES:
    print(f"INFO: Updating NUM_CLASSES from {NUM_CLASSES} to actual unique labels: {num_unique_labels}")
    NUM_CLASSES = num_unique_labels

print(f"Unique Labels found: {le.classes_}")
print(f"Numerical Class count: {NUM_CLASSES}")

# Extract texts and numerical labels
train_texts = train_df[TEXT_COLUMN].tolist()
train_labels = train_df['label'].tolist()
val_texts = val_df[TEXT_COLUMN].tolist()
val_labels = val_df['label'].tolist()

print(f"Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}")

config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

Train samples loaded: 35139
Validation samples loaded: 4388
Test samples loaded: 4392
Unique Labels found: ['Not_offensive' 'Offensive_Targeted_Insult_Group'
 'Offensive_Targeted_Insult_Individual' 'Offensive_Targeted_Insult_Other'
 'Offensive_Untargetede' 'not-Tamil']
Numerical Class count: 6
Training samples: 35139, Validation samples: 4388


In [7]:
# --- 3. Dataset and Model Definition ---

# **3.1. PyTorch Dataset Class**
class IndicBERTDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.texts)

# Create Datasets
train_dataset = IndicBERTDataset(train_texts, train_labels)
val_dataset = IndicBERTDataset(val_texts, val_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False) # Validation should not be shuffled
print("DataLoaders created for Train and Validation.")

# **3.2. LSTM-CNN-IndicBERT Model Architecture**
class LSTM_CNN_IndicBERT(nn.Module):
    def __init__(self, hidden_dim, num_classes):
        super(LSTM_CNN_IndicBERT, self).__init__()
        self.indic_bert_model = indic_bert_model

        # 768 is the output dimension of IndicBERT
        self.lstm = nn.LSTM(768, hidden_dim, batch_first=True)
        self.conv1 = nn.Conv1d(hidden_dim, 128, kernel_size=3, padding=1)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask):
        # BERT Embedding is frozen (no_grad)
        with torch.no_grad():
            bert_outputs = self.indic_bert_model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = bert_outputs.last_hidden_state

        lstm_out, _ = self.lstm(last_hidden_state)
        # Permute for Conv1D: [Batch, Sequence_Len, Hidden_Dim] -> [Batch, Hidden_Dim, Sequence_Len]
        cnn_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1)))
        pooled = torch.mean(cnn_out, dim=-1) # Global Average Pooling
        output = self.fc(pooled)
        return output

# Initialize Model
lstm_cnn_model = LSTM_CNN_IndicBERT(hidden_dim=HIDDEN_DIM, num_classes=NUM_CLASSES).to(device)
print(f"Model initialized with {NUM_CLASSES} classes on {device}.")

DataLoaders created for Train and Validation.
Model initialized with 6 classes on cuda.


In [10]:
# --- 4. Training and Evaluation Functions ---

def evaluate_model(model, loader, phase="Validation"):
    """Evaluates the model on the provided DataLoader and prints metrics."""
    model.eval()
    preds, true_labels = [], []
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)

            preds.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(loader)
    accuracy = accuracy_score(true_labels, preds)
    # Use zero_division=0 to handle potential class imbalance issues
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='weighted', zero_division=0)

    print(f"--- {phase} Metrics ---")
    print(f"Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    return f1 # Return F1 for saving the best model

def train_model(model, train_loader, val_loader, epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()
    best_f1 = -1

    print("\nStarting training loop...")
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f'\nEpoch [{epoch + 1}/{epochs}], Training Loss: {running_loss / len(train_loader):.4f}')

        # Evaluate on the separate validation set
        current_f1 = evaluate_model(model, val_loader)

        # Save best model checkpoint
        if current_f1 > best_f1:
            best_f1 = current_f1
            model_save_path = f'{output_dir_name}/best_model_epoch_{epoch+1}_f1_{best_f1:.4f}.pt'
            torch.save(model.state_dict(), model_save_path)
            print(f"---> Model checkpoint saved: {model_save_path}")
        else:
            print("No F1 improvement on validation set.")

# **4.1. Start Training**
train_model(lstm_cnn_model, train_loader, val_loader, epochs=EPOCHS)
print("\nTraining complete.")


Starting training loop...

Epoch [1/5], Training Loss: 0.9056
--- Validation Metrics ---
Loss: 0.8784, Accuracy: 0.7331, Precision: 0.5679, Recall: 0.7331, F1: 0.6253
---> Model checkpoint saved: /content/drive/MyDrive/outputs/02_Nov_10_25_22_IndicBERT_LSTM_CNN_Tamil_Separate_Val/best_model_epoch_1_f1_0.6253.pt

Epoch [2/5], Training Loss: 0.8621
--- Validation Metrics ---
Loss: 0.8531, Accuracy: 0.7416, Precision: 0.5693, Recall: 0.7416, F1: 0.6401
---> Model checkpoint saved: /content/drive/MyDrive/outputs/02_Nov_10_25_22_IndicBERT_LSTM_CNN_Tamil_Separate_Val/best_model_epoch_2_f1_0.6401.pt

Epoch [3/5], Training Loss: 0.8447
--- Validation Metrics ---
Loss: 0.8434, Accuracy: 0.7420, Precision: 0.6530, Recall: 0.7420, F1: 0.6407
---> Model checkpoint saved: /content/drive/MyDrive/outputs/02_Nov_10_25_22_IndicBERT_LSTM_CNN_Tamil_Separate_Val/best_model_epoch_3_f1_0.6407.pt

Epoch [4/5], Training Loss: 0.8338
--- Validation Metrics ---
Loss: 0.8354, Accuracy: 0.7429, Precision: 0.6198

In [13]:
# --- 5. Prediction on Test Set and Submission File Generation ---

# NOTE: If you stopped training and want to load the best model, uncomment the lines below:
# best_model_path = "PATH_TO_YOUR_BEST_MODEL.pt" # Update this path
# lstm_cnn_model.load_state_dict(torch.load(best_model_path))
# print(f"Loaded best model from: {best_model_path}")
import zipfile
from google.colab import files
test_texts = test_df[TEXT_COLUMN].tolist()
# Create test dataset without labels
test_dataset = IndicBERTDataset(test_texts)

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
lstm_cnn_model.eval()
predictions = []

print("\nStarting prediction on test set...")
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = lstm_cnn_model(input_ids, attention_mask)
        _, preds = torch.max(outputs, 1)
        predictions.extend(preds.cpu().numpy())

# Convert numerical predictions back to original labels (optional, but good for checks)
predicted_labels_str = le.inverse_transform(predictions)

# **5.1. Create Submission DataFrame**
submission_df = pd.DataFrame({
    ID_COLUMN: test_df[ID_COLUMN],
    'prediction': predictions,
    'prediction_label_str': predicted_labels_str
})

# **5.2. Save predictions to JSONL file**
json_file_name = 'submission.jsonl'
json_file_path = f'{output_dir_name}/{json_file_name}'

with open(json_file_path, 'w') as json_file:
    for _, row in submission_df.iterrows():
        record = {
            "index": row[ID_COLUMN],
            "prediction": int(row['prediction'])
        }
        json_file.write(json.dumps(record) + '\n')

print(f"\n✅ Predictions saved to JSONL: {json_file_path}")

# --- 6. Create and Download Zip File ---

zip_file_name = f'submission_{timestamp}.zip'
zip_file_path_drive = f'{output_dir_name}/{zip_file_name}'
zip_file_path_local = f'./{zip_file_name}'

# Create the zip file in Google Drive
with zipfile.ZipFile(zip_file_path_drive, 'w') as zipf:
    zipf.write(json_file_path, arcname=json_file_name)

print(f"📦 Zip file created in Google Drive: {zip_file_path_drive}")

# Copy to local runtime and trigger download
!cp "{zip_file_path_drive}" "{zip_file_path_local}"
print(f"\n⬇️ Downloading {zip_file_name}...")
files.download(zip_file_name)

print("\n--- Final Predictions and Download Complete ---")


Starting prediction on test set...

✅ Predictions saved to JSONL: /content/drive/MyDrive/outputs/02_Nov_10_25_22_IndicBERT_LSTM_CNN_Tamil_Separate_Val/submission.jsonl
📦 Zip file created in Google Drive: /content/drive/MyDrive/outputs/02_Nov_10_25_22_IndicBERT_LSTM_CNN_Tamil_Separate_Val/submission_02_Nov_10_25_22.zip

⬇️ Downloading submission_02_Nov_10_25_22.zip...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


--- Final Predictions and Download Complete ---


In [16]:
# --- FINAL TEST SET EVALUATION CELL ---
# --- Example Data Loading Cell (Usually run near the top) ---
import pandas as pd
from tqdm.auto import tqdm
# Replace 'path/to/your/test_with_labels.csv' with the actual path
TEST_WITH_LABELS_DF = pd.read_csv('/content/drive/MyDrive/Tamil/tamil_offensive_full_test_with_labels.csv')

# 1. Load the Test DataFrame (Replace 'test_with_labels_df' with your actual DataFrame name)
#    Ensure this DataFrame has both the text and the true label columns.
final_test_df = TEST_WITH_LABELS_DF
final_test_texts = final_test_df[TEXT_COLUMN].tolist()
final_test_labels = final_test_df[LABEL_COLUMN].tolist() # Assuming your true labels are here

# 2. Re-load Best Model (IMPORTANT if you restarted the runtime)
#    If you just finished training, you can skip this.
#    If you restarted Colab, uncomment and set the path to your best checkpoint file.
# best_model_path = "PATH_TO_YOUR_BEST_MODEL.pt" # Example: 'output/best_model_epoch_2_f1_0.7589.pt'
# lstm_cnn_model.load_state_dict(torch.load(best_model_path))
# print(f"Loaded best model from: {best_model_path}")
print("Using current model in memory for final test set evaluation.")

# 3. Create Dataset and DataLoader
# The final_test_labels need to be transformed using the same LabelEncoder (le) used for training
final_test_labels_encoded = le.transform(final_test_labels)
final_test_dataset = IndicBERTDataset(final_test_texts, final_test_labels_encoded)
final_test_loader = DataLoader(final_test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 4. Run Evaluation
# We will use your existing evaluation logic, slightly modified to print 'Test'
lstm_cnn_model.eval()
preds, true_labels = [], []
total_loss = 0
criterion = nn.CrossEntropyLoss()

print("\nStarting final evaluation on test set...")
with torch.no_grad():
    # Use tqdm to show progress during the test prediction
    test_loop = tqdm(final_test_loader, leave=False, desc="Final Test Evaluation")

    for batch in test_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lstm_cnn_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        _, predicted = torch.max(outputs, 1)

        preds.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# 5. Calculate and Print Final Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

avg_loss = total_loss / len(final_test_loader)
accuracy = accuracy_score(true_labels, preds)
# Use 'macro' average for F1, Precision, and Recall as it's often more informative
# for multi-class classification, but you can also use 'weighted' if your validation used it.
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='macro', zero_division=0)

print("-" * 50)
print("✨ FINAL TEST SET PERFORMANCE METRICS ✨")
print(f"{'Loss':<15}: {avg_loss:.4f}")
print(f"{'Accuracy':<15}: {accuracy:.4f}")
print(f"{'Precision (Macro)':<15}: {precision:.4f}")
print(f"{'Recall (Macro)':<15}: {recall:.4f}")
print(f"{'F1-score (Macro)':<15}: {f1:.4f}")
print("-" * 50)

# Optional: Print Classification Report for per-class metrics
from sklearn.metrics import classification_report
print("\nDetailed Classification Report:")
print(classification_report(true_labels, preds, target_names=le.classes_, zero_division=0))

Using current model in memory for final test set evaluation.

Starting final evaluation on test set...


Final Test Evaluation:   0%|          | 0/138 [00:00<?, ?it/s]

--------------------------------------------------
✨ FINAL TEST SET PERFORMANCE METRICS ✨
Loss           : 0.8564
Accuracy       : 0.7375
Precision (Macro): 0.4613
Recall (Macro) : 0.2236
F1-score (Macro): 0.2273
--------------------------------------------------

Detailed Classification Report:
                                      precision    recall  f1-score   support

                       Not_offensive       0.74      1.00      0.85      3190
     Offensive_Targeted_Insult_Group       0.50      0.02      0.04       288
Offensive_Targeted_Insult_Individual       0.00      0.00      0.00       315
     Offensive_Targeted_Insult_Other       0.00      0.00      0.00        71
               Offensive_Untargetede       0.67      0.01      0.01       368
                           not-Tamil       0.86      0.32      0.47       160

                            accuracy                           0.74      4392
                           macro avg       0.46      0.22      0.23      4392