In [11]:
import pandas as pd

DATA_PATH = "data/processed/final_clsa_dataset.parquet"
df = pd.read_parquet(DATA_PATH)

print(df.head())
print(df.shape)
print(df['label'].value_counts())


                                       text_sanskrit  \
0                     भवान् सायङ्काले किं करिष्यति ?   
1                                                      
2  Balance Sheet  मध्ये रिफ़्लेक्षन् दृष्टुम् अपि...   
3  """मनुष्यपुत्रेणावश्यं बहवो यातना भोक्तव्याः प...   
4                                                      

                                        text_english  label  split  
0                   What will you do in the evening?      1  train  
1  Some have praised _Atlantis:_The_Lost_Empire_ ...      0  train  
2  See the reflection in Balance Sheet and  Void ...      1  train  
3  """And he began to teach them, that the Son of...      0  train  
4  I think Cliff Robertson certainly was one of o...      0  train  
(37500, 4)
label
1    12500
0    12500
2    12500
Name: count, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
    df, test_size=0.20, stratify=df["label"], random_state=42
)

val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label"], random_state=42
)

print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)


Train: (30000, 4)
Val: (3750, 4)
Test: (3750, 4)


In [13]:
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
encoder = SentenceTransformer(model_name)

print("Embedding dimension:", encoder.get_sentence_embedding_dimension())

Embedding dimension: 384


In [14]:
import numpy as np
from tqdm.auto import tqdm

def embed(texts):
    return encoder.encode(
        list(texts),
        batch_size=64,
        convert_to_numpy=True,
        show_progress_bar=True
    )

X_train = embed(train_df["text_sanskrit"])
y_train = train_df["label"].values

X_val = embed(val_df["text_sanskrit"])
y_val = val_df["label"].values

X_test = embed(test_df["text_sanskrit"])
y_test = test_df["label"].values

print("Train embeddings:", X_train.shape)
print("Validation embeddings:", X_val.shape)
print("Test embeddings:", X_test.shape)


Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Train embeddings: (30000, 384)
Validation embeddings: (3750, 384)
Test embeddings: (3750, 384)


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class EmbeddingDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class MLP(nn.Module):
    def __init__(...):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 3)
        )

    def forward(self, x):
        return self.layers(x)


SyntaxError: invalid syntax (48726362.py, line 18)

In [24]:
BATCH_SIZE = 64

train_dataset = EmbeddingDataset(X_train, y_train)
val_dataset   = EmbeddingDataset(X_val, y_val)
test_dataset  = EmbeddingDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dim = X_train.shape[1]  # 384 for MiniLM
model = MLPClassifier(input_dim=input_dim, hidden_dim=256, dropout=0.3).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 30


In [26]:
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss / len(train_loader):.4f}")


Epoch 1/30 - Loss: 0.6963
Epoch 2/30 - Loss: 0.6538
Epoch 3/30 - Loss: 0.6455
Epoch 4/30 - Loss: 0.6392
Epoch 5/30 - Loss: 0.6330
Epoch 6/30 - Loss: 0.6269
Epoch 7/30 - Loss: 0.6215
Epoch 8/30 - Loss: 0.6143
Epoch 9/30 - Loss: 0.6092
Epoch 10/30 - Loss: 0.6018
Epoch 11/30 - Loss: 0.5950
Epoch 12/30 - Loss: 0.5910
Epoch 13/30 - Loss: 0.5805
Epoch 14/30 - Loss: 0.5788
Epoch 15/30 - Loss: 0.5721
Epoch 16/30 - Loss: 0.5679
Epoch 17/30 - Loss: 0.5590
Epoch 18/30 - Loss: 0.5561
Epoch 19/30 - Loss: 0.5544
Epoch 20/30 - Loss: 0.5481
Epoch 21/30 - Loss: 0.5463
Epoch 22/30 - Loss: 0.5387
Epoch 23/30 - Loss: 0.5365
Epoch 24/30 - Loss: 0.5333
Epoch 25/30 - Loss: 0.5282
Epoch 26/30 - Loss: 0.5285
Epoch 27/30 - Loss: 0.5250
Epoch 28/30 - Loss: 0.5225
Epoch 29/30 - Loss: 0.5183
Epoch 30/30 - Loss: 0.5146


In [27]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def evaluate(loader):
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            logits = model(X_batch)
            batch_pred = torch.argmax(logits, dim=1).cpu().numpy()

            preds.extend(batch_pred)
            labels.extend(y_batch.numpy())

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return acc, f1, labels, preds

val_acc, val_f1, _, _   = evaluate(val_loader)
test_acc, test_f1, yt, yp = evaluate(test_loader)

print("VALIDATION:")
print("Accuracy:", val_acc)
print("F1 Macro:", val_f1)

print("\nTEST:")
print("Accuracy:", test_acc)
print("F1 Macro:", test_f1)

print("\nClassification Report:")
print(classification_report(yt, yp))


VALIDATION:
Accuracy: 0.6381333333333333
F1 Macro: 0.5787323145577227

TEST:
Accuracy: 0.6432
F1 Macro: 0.586334399898973

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.90      0.67      1250
           1       0.79      0.88      0.83      1250
           2       0.65      0.15      0.25      1250

    accuracy                           0.64      3750
   macro avg       0.66      0.64      0.59      3750
weighted avg       0.66      0.64      0.59      3750

