In [1]:
import json
from datasets import load_dataset, Dataset
from transformers import pipeline
from tqdm.auto import tqdm
import torch
import pandas as pd
import os

# Paths
os.makedirs("data/processed", exist_ok=True)
OUT_PATH = "data/processed/saamayik_labeled.parquet"

# Load Saamayik (train split contains translation objects)
ds = load_dataset("acomquest/Saamayik")
raw = ds['train']

device_id = 0 if torch.cuda.is_available() else -1
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=device_id
)

def map_sentiment_label(label_str):
    if label_str.lower() == 'negative':
        return 0
    elif label_str.lower() == 'neutral':
        return 1
    elif label_str.lower() == 'positive':
        return 2
    else:
        # Some models return 'LABEL_0' style; normalize common cases
        s = str(label_str).lower()
        if 'neg' in s:
            return 0
        if 'neu' in s:
            return 1
        if 'pos' in s:
            return 2
        raise ValueError(f"Unknown sentiment label: {label_str}")

texts_en = []
texts_sa = []
labels = []

print(f"Processing Saamayik dataset: {len(raw)} examples")
for ex in tqdm(raw, desc="Labeling Saamayik"):
    translation = ex.get('translation', None)
    if translation is None:
        continue
    # translation may be a dict or JSON string
    if isinstance(translation, str):
        try:
            trans = json.loads(translation)
        except Exception:
            # fallback: treat the string as raw english text
            trans = {'en': translation, 'sa': ''}
    elif isinstance(translation, dict):
        trans = translation
    else:
        continue

    en_text = (trans.get('en') or '').strip()
    sa_text = (trans.get('sa') or '').strip()
    if not en_text or not sa_text:
        continue

    # classify english text (batch single)
    try:
        res = sentiment_classifier(en_text)[0]
        label = map_sentiment_label(res.get('label'))
    except Exception:
        # if classifier fails on a long example, skip
        continue

    texts_en.append(en_text)
    texts_sa.append(sa_text)
    labels.append(label)

df = pd.DataFrame({
    "text_sanskrit": texts_sa,
    "text_english": texts_en,
    "label": labels
})
print("Constructed Saamayik labeled DataFrame, shape:", df.shape)
df.to_parquet(OUT_PATH, index=False)
print("Saved to", OUT_PATH)

saamayik_labeled = Dataset.from_pandas(df)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Processing Saamayik dataset: 43493 examples


Labeling Saamayik:   0%|          | 0/43493 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Constructed Saamayik labeled DataFrame, shape: (43493, 3)
Saved to data/processed/saamayik_labeled.parquet


In [3]:
!pip install huggingface_hub -q
!pip install IndicTransToolkit

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting IndicTransToolkit
  Downloading indictranstoolkit-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.1 kB)
Collecting cython (from IndicTransToolkit)
  Downloading cython-3.2.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (6.7 kB)
Collecting sacremoses (from IndicTransToolkit)
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting sacrebleu (from IndicTransToolkit)
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting indic-nlp-library-itt (from IndicTransToolkit)
  Downloading indic_nlp_library_itt-0.1.1-py3-none-any.whl.metadata (2.2 kB)
Collecting morfessor (from indic-nlp-library-itt->IndicTransToolkit)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx-argparse (from indic-nlp-library-itt->IndicTransToolkit)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-li

In [4]:
from huggingface_hub import login
login(token="hf_T****")  #use your token 

In [6]:
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from tqdm.auto import tqdm
import os

os.makedirs("data/processed", exist_ok=True)
OUT_PATH_IMDB = "data/processed/imdb_augmented.parquet"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

imdb = load_dataset("imdb")["train"]

# Filter neg/pos
neg = [ex["text"] for ex in imdb if ex["label"] == 0]
pos = [ex["text"] for ex in imdb if ex["label"] == 1]

random.shuffle(neg)
random.shuffle(pos)

neg_en = neg[:15000]
pos_en = pos[:15000]

print("Loaded IMDb:")
print("Negative samples:", len(neg_en))
print("Positive samples:", len(pos_en))

# Load IndicTrans2 EN->SAN model
model_name = "ai4bharat/indictrans2-en-indic-dist-200M"
tokenizer_mt = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model_mt = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
).to(DEVICE)

ip = IndicProcessor(inference=True)
src_lang, tgt_lang = "eng_Latn", "san_Deva"

def translate_batch(texts):
    prepped = ip.preprocess_batch(texts, src_lang=src_lang, tgt_lang=tgt_lang)
    inputs = tokenizer_mt(
        prepped,
        truncation=True,
        padding="longest",
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        out = model_mt.generate(
            **inputs,
            max_length=256,
            num_beams=5,
            use_cache=True
        )
    decoded = tokenizer_mt.batch_decode(
        out,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    return ip.postprocess_batch(decoded, lang=tgt_lang)

def batched_translate(texts, batch_size=16, desc="Translating"):
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch = texts[i:i + batch_size]
        try:
            sa = translate_batch(batch)
        except Exception:
            # If a batch fails, handle individually
            sa = []
            for t in batch:
                try:
                    sa.append(translate_batch([t])[0])
                except Exception:
                    sa.append("")
        results.extend(sa)
    return results


print("Translating NEGATIVE examples...")
neg_sa = batched_translate(neg_en, desc="Neg → Sanskrit")

print("Translating POSITIVE examples...")
pos_sa = batched_translate(pos_en, desc="Pos → Sanskrit")

# Build final augmented dataset
texts_sanskrit = neg_sa + pos_sa
texts_english = neg_en + pos_en
labels = [0] * len(neg_sa) + [2] * len(pos_sa)

df_imdb = pd.DataFrame({
    "text_sanskrit": texts_sanskrit,
    "text_english": texts_english,
    "label": labels
})

print("IMDb augmented shape:", df_imdb.shape)

df_imdb.to_parquet(OUT_PATH_IMDB, index=False)
print("Saved to", OUT_PATH_IMDB)


Loaded IMDb:
Negative samples: 12500
Positive samples: 12500
Translating NEGATIVE examples...


Neg → Sanskrit:   0%|          | 0/782 [00:00<?, ?it/s]

Translating POSITIVE examples...


Pos → Sanskrit:   0%|          | 0/782 [00:00<?, ?it/s]

IMDb augmented shape: (25000, 3)
Saved to data/processed/imdb_augmented.parquet


In [9]:
from sklearn.model_selection import train_test_split

sa_path = "data/processed/saamayik_labeled.parquet"
imdb_path = "data/processed/imdb_augmented.parquet"
out_path = "data/processed/final_clsa_dataset.parquet"

df_sa = pd.read_parquet(sa_path)
df_imdb = pd.read_parquet(imdb_path)

df = pd.concat([df_sa, df_imdb], ignore_index=True)
df = df.dropna(subset=["text_sanskrit", "text_english", "label"])
df["label"] = df["label"].astype(int)

print("Combined shape:", df.shape)
print(df["label"].value_counts())

# Split per class
df_neg = df[df.label == 0]
df_neu = df[df.label == 1]
df_pos = df[df.label == 2]

print("\nBefore balancing:")
print("neg:", len(df_neg), "pos:", len(df_pos), "neu:", len(df_neu))

TARGET = 12500

df_neg_bal = df_neg.sample(n=TARGET, random_state=42) if len(df_neg) >= TARGET else df_neg
df_pos_bal = df_pos.sample(n=TARGET, random_state=42) if len(df_pos) >= TARGET else df_pos
df_neu_bal = df_neu.sample(n=TARGET, random_state=42) if len(df_neu) >= TARGET else df_neu

df_balanced = pd.concat([df_neg_bal, df_pos_bal, df_neu_bal], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nAfter balancing:")
print(df_balanced["label"].value_counts())

# Train/Val split
train_df, val_df = train_test_split(
    df_balanced,
    test_size=0.2,
    stratify=df_balanced["label"],
    random_state=42
)

final_df = pd.concat([
    train_df.assign(split="train"),
    val_df.assign(split="validation")
])

final_df.to_parquet(out_path, index=False)
print("\nSaved final dataset to:", out_path)
print("Final dataset shape:", final_df.shape)


Combined shape: (68493, 3)
label
1    36246
2    17342
0    14905
Name: count, dtype: int64

Before balancing:
neg: 14905 pos: 17342 neu: 36246

After balancing:
label
2    12500
0    12500
1    12500
Name: count, dtype: int64

Saved final dataset to: data/processed/final_clsa_dataset.parquet
Final dataset shape: (37500, 4)


In [13]:
from sklearn.model_selection import train_test_split
from datasets import DatasetDict

FINAL_DATA_PATH = "data/processed/final_clsa_dataset.parquet"
MODEL_NAME = "xlm-roberta-base"

df = pd.read_parquet(FINAL_DATA_PATH)
print("Balanced dataset:", df.shape)
print(df['label'].value_counts())


train_df, temp_df = train_test_split(
    df,
    test_size=0.20,         
    stratify=df["label"],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,        
    stratify=temp_df["label"],
    random_state=42
)

print("Train:", train_df.shape)
print("Validation:", val_df.shape)
print("Test:", test_df.shape)

train_df["split"] = "train"
val_df["split"] = "validation"
test_df["split"] = "test"

df_final = pd.concat([train_df, val_df, test_df], ignore_index=True)

hf_dataset = Dataset.from_pandas(df_final)

dataset_dict = DatasetDict({
    "train": hf_dataset.filter(lambda x: x["split"] == "train"),
    "validation": hf_dataset.filter(lambda x: x["split"] == "validation"),
    "test": hf_dataset.filter(lambda x: x["split"] == "test"),
})

print(dataset_dict)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_fn(batch):
    return tokenizer(
        batch["text_sanskrit"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized_dataset = dataset_dict.map(
    preprocess_fn,
    batched=True,
    remove_columns=["text_sanskrit", "text_english", "split"]
)

tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

tokenized_dataset.set_format("torch")


Balanced dataset: (37500, 4)
label
1    12500
0    12500
2    12500
Name: count, dtype: int64
Train: (30000, 4)
Validation: (3750, 4)
Test: (3750, 4)


Filter:   0%|          | 0/37500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text_sanskrit', 'text_english', 'label', 'split'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['text_sanskrit', 'text_english', 'label', 'split'],
        num_rows: 3750
    })
    test: Dataset({
        features: ['text_sanskrit', 'text_english', 'label', 'split'],
        num_rows: 3750
    })
})


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

In [19]:
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)
import evaluate
import os

MODEL_NAME = "xlm-roberta-base"
OUTPUT_DIR = "./clsa_xlmr_optimized"
os.makedirs(OUTPUT_DIR, exist_ok=True)

USE_FP16 = torch.cuda.is_available()

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_fn(batch):
    combined = [
        sa + " </s> " + en
        for sa, en in zip(batch["text_sanskrit"], batch["text_english"])
    ]
    return tokenizer(
        combined,
        truncation=True,
        padding="max_length",
        max_length=384
    )

tokenized_dataset = dataset_dict.map(
    preprocess_fn,
    batched=True,
    remove_columns=["text_sanskrit", "text_english", "split"]
)

tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3
)
model.gradient_checkpointing_enable()

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"]
    return {"accuracy": acc, "f1_macro": f1}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=5e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  
    num_train_epochs=7,
    weight_decay=0.01,
    fp16=USE_FP16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    save_total_limit=2,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=200,
    report_to="none",
    seed=42
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("Starting optimized CLSA training...")
train_output = trainer.train()
print("Training finished.")
print(train_output.metrics)

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Model saved to:", OUTPUT_DIR)

val_results = trainer.evaluate(tokenized_dataset["validation"])
print("Validation results:", val_results)

test_results = trainer.evaluate(tokenized_dataset["test"])
print("Test results:", test_results)


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting optimized CLSA training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4184,0.335142,0.871467,0.871483
2,0.2684,0.26698,0.893067,0.893248
3,0.2328,0.246691,0.909867,0.910048
4,0.1891,0.260179,0.914133,0.9143
5,0.1681,0.273548,0.9128,0.913007
6,0.1387,0.282002,0.913867,0.914091


Training finished.
{'train_runtime': 2690.5024, 'train_samples_per_second': 78.052, 'train_steps_per_second': 2.44, 'total_flos': 3.552031139328e+16, 'train_loss': 0.27497159261269694, 'epoch': 6.0}
Model saved to: ./clsa_xlmr_optimized


Validation results: {'eval_loss': 0.26017871499061584, 'eval_accuracy': 0.9141333333333334, 'eval_f1_macro': 0.9142996553051465, 'eval_runtime': 10.7661, 'eval_samples_per_second': 348.315, 'eval_steps_per_second': 43.563, 'epoch': 6.0}
Test results: {'eval_loss': 0.25333917140960693, 'eval_accuracy': 0.9106666666666666, 'eval_f1_macro': 0.9107841309931008, 'eval_runtime': 11.5217, 'eval_samples_per_second': 325.473, 'eval_steps_per_second': 40.706, 'epoch': 6.0}


In [20]:
test_results

{'eval_loss': 0.25333917140960693,
 'eval_accuracy': 0.9106666666666666,
 'eval_f1_macro': 0.9107841309931008,
 'eval_runtime': 11.5217,
 'eval_samples_per_second': 325.473,
 'eval_steps_per_second': 40.706,
 'epoch': 6.0}