In [2]:
!pip install sentence-transformers -q

In [16]:
!pip install xgboost -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
import pandas as pd

# Parquet file holding the processed dataset, relative to the working directory.
DATA_PATH = "data/processed/final_clsa_dataset.parquet"

df = pd.read_parquet(DATA_PATH)

# Sanity check in a single call: first rows, overall shape, and per-label counts,
# each on its own line.
print(
    df.head(),
    df.shape,
    df['label'].value_counts(),
    sep="\n",
)


                                       text_sanskrit  \
0                     भवान् सायङ्काले किं करिष्यति ?   
1                                                      
2  Balance Sheet  मध्ये रिफ़्लेक्षन् दृष्टुम् अपि...   
3  """मनुष्यपुत्रेणावश्यं बहवो यातना भोक्तव्याः प...   
4                                                      

                                        text_english  label  split  
0                   What will you do in the evening?      1  train  
1  Some have praised _Atlantis:_The_Lost_Empire_ ...      0  train  
2  See the reflection in Balance Sheet and  Void ...      1  train  
3  """And he began to teach them, that the Son of...      0  train  
4  I think Cliff Robertson certainly was one of o...      0  train  
(37500, 4)
label
1    12500
0    12500
2    12500
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split

# Stage 1: set aside 20% of the rows as a holdout pool, stratified on the label
# so the class proportions are preserved.
# NOTE(review): `df` already carries a `split` column (visible in the preview
# of the loaded frame) that this cell does not use — confirm that re-splitting
# here is intended.
train_df, temp_df = train_test_split(
    df, test_size=0.20, random_state=42, stratify=df["label"]
)

# Stage 2: cut the holdout pool in half to obtain the validation and test
# sets, again stratified on the label.
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, random_state=42, stratify=temp_df["label"]
)

# Report (rows, columns) for each split.
print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)

Train: (30000, 4)
Val: (3750, 4)
Test: (3750, 4)


In [9]:

from sentence_transformers import SentenceTransformer
# Instantiate the LaBSE sentence encoder from its hub identifier.
encoder = SentenceTransformer(
    model_name_or_path="sentence-transformers/LaBSE",
)

# Show the width of the vectors this encoder produces.
print(
    "Embedding dimension:",
    encoder.get_sentence_embedding_dimension(),
)

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Embedding dimension: 768


In [10]:
import numpy as np
from tqdm.auto import tqdm

def embed(texts, batch_size=64, model=None, show_progress_bar=True):
    """Encode a collection of texts into sentence embeddings.

    Parameters
    ----------
    texts : iterable of str
        Sentences to encode. The iterable is materialized into a list before
        being handed to the encoder.
    batch_size : int, default 64
        Number of texts the encoder processes per batch.
    model : object with an ``encode`` method, optional
        Encoder to use. When omitted, the notebook-level ``encoder`` is used.
    show_progress_bar : bool, default True
        Forwarded to ``encode`` to toggle its progress bar.

    Returns
    -------
    The value returned by ``model.encode(..., convert_to_numpy=True)``.
    """
    # Resolve the default at call time so the global `encoder` is only looked
    # up when the caller did not supply their own model.
    active_model = encoder if model is None else model
    return active_model.encode(
        list(texts),
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=show_progress_bar,
    )

# Encode the Sanskrit text of each split, in the order train -> val -> test.
# NOTE(review): the preview of the loaded frame shows rows whose
# `text_sanskrit` appears blank; such rows are passed to the encoder as-is —
# confirm this is intended.
X_train, X_val, X_test = (
    embed(split_df["text_sanskrit"]) for split_df in (train_df, val_df, test_df)
)

# Matching label arrays for the three splits.
y_train, y_val, y_test = (
    split_df["label"].values for split_df in (train_df, val_df, test_df)
)

# Confirm each embedding matrix has one row per example.
print("Train embeddings:", X_train.shape)
print("Validation embeddings:", X_val.shape)
print("Test embeddings:", X_test.shape)


Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Train embeddings: (30000, 768)
Validation embeddings: (3750, 768)
Test embeddings: (3750, 768)


In [21]:
from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC

# Linear SVM baseline trained on the sentence embeddings.
# random_state pins the data shuffling used by the dual coordinate-descent
# solver so repeated runs give the same model; 42 matches the seed used for
# the train/val/test split.
clf1 = LinearSVC(C=1.0, random_state=42)
clf1.fit(X_train, y_train)


# Multinomial logistic-regression baseline trained on the same embeddings.
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases, and the lbfgs solver already fits a
# multinomial model when there are more than two classes.
# `n_jobs` is intentionally not passed: it only parallelizes one-vs-rest
# fits, so it has no effect on a multinomial lbfgs fit.
clf2 = LogisticRegression(
    max_iter=5000,
    C=2.0,
    solver="lbfgs",
)
clf2.fit(X_train, y_train)




In [23]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Banner identifying which fitted model the numbers below belong to.
print("RESULTS - Linear SVC")

# Predict on the validation features first, then on the test features.
val_preds, test_preds = (clf1.predict(features) for features in (X_val, X_test))

# Validation split: accuracy and macro-averaged F1.
print("VALIDATION RESULTS :")
print("Accuracy:", accuracy_score(y_true=y_val, y_pred=val_preds))
print("F1 Macro:", f1_score(y_true=y_val, y_pred=val_preds, average="macro"))

# Test split: the same two metrics.
print("\nTEST RESULTS :")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=test_preds))
print("F1 Macro:", f1_score(y_true=y_test, y_pred=test_preds, average="macro"))

# Per-class precision / recall / F1 on the test split.
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=test_preds))

RESULTS - Linear SVC
VALIDATION RESULTS :
Accuracy: 0.6504
F1 Macro: 0.594120154957464

TEST RESULTS :
Accuracy: 0.6576
F1 Macro: 0.5998922088649358

Classification Report:

              precision    recall  f1-score   support

           0       0.55      0.88      0.68      1250
           1       0.79      0.92      0.85      1250
           2       0.72      0.16      0.27      1250

    accuracy                           0.66      3750
   macro avg       0.69      0.66      0.60      3750
weighted avg       0.69      0.66      0.60      3750



In [24]:
# Banner identifying which fitted model the numbers below belong to.
print("RESULTS - Logistic Regression")

# Predict on the validation features first, then on the test features.
val_preds, test_preds = (clf2.predict(features) for features in (X_val, X_test))

# Validation split: accuracy and macro-averaged F1.
print("VALIDATION RESULTS :")
print("Accuracy:", accuracy_score(y_true=y_val, y_pred=val_preds))
print("F1 Macro:", f1_score(y_true=y_val, y_pred=val_preds, average="macro"))

# Test split: the same two metrics.
print("\nTEST RESULTS :")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=test_preds))
print("F1 Macro:", f1_score(y_true=y_test, y_pred=test_preds, average="macro"))

# Per-class precision / recall / F1 on the test split.
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=test_preds))


RESULTS - Logistic Regression
VALIDATION RESULTS :
Accuracy: 0.6496
F1 Macro: 0.5930054944558859

TEST RESULTS :
Accuracy: 0.6578666666666667
F1 Macro: 0.6007778585098692

Classification Report:

              precision    recall  f1-score   support

           0       0.55      0.89      0.68      1250
           1       0.80      0.92      0.85      1250
           2       0.72      0.16      0.27      1250

    accuracy                           0.66      3750
   macro avg       0.69      0.66      0.60      3750
weighted avg       0.69      0.66      0.60      3750

