In [None]:
!pip install nlpaug nltk
!python -m nltk.downloader wordnet omw-1.4

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as naw

In [None]:
# Read the cleaned train and test splits (train first, then test).
df, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train_clean.csv', '/content/test_clean.csv')
)

In [None]:
# Label columns to tally in the raw training frame.
label_cols = ['anger', 'fear', 'joy', 'sadness', 'surprise']

# Column sums per label, largest first.
label_counts = df[label_cols].sum().sort_values(ascending=False)

# Each label's share of the summed label counts, as a percentage with 2 decimals.
label_percent = label_counts.div(label_counts.sum()).mul(100).round(2)

# Assemble a flat summary table and show it.
label_table = pd.DataFrame(
    {
        'Label': label_counts.index,
        'Count': label_counts.values,
        'Percentage (%)': label_percent.values,
    }
)

print(label_table)




*   1x for 500 in sadness, surprise
* 1x for 800 in joy



* anger 500 - 2000(3x)


In [None]:
def load_model(name):
    """Load the Marian tokenizer and translation model stored under ``name``.

    Args:
        name: Checkpoint identifier passed to both ``from_pretrained`` calls.

    Returns:
        tuple: ``(tokenizer, model)``; the tokenizer is loaded first.
    """
    return (
        MarianTokenizer.from_pretrained(name),
        MarianMTModel.from_pretrained(name),
    )

# English -> Hindi checkpoint (first leg of the round trip).
en2hi_tok, en2hi_model = load_model(name="Helsinki-NLP/opus-mt-en-hi")
# Hindi -> English checkpoint (return leg).
hi2en_tok, hi2en_model = load_model(name="Helsinki-NLP/opus-mt-hi-en")

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def translate(texts, tokenizer, model, batch_size=32, max_length=256, **generate_kwargs):
    """Translate ``texts`` in mini-batches with a seq2seq model.

    Args:
        texts: Strings to translate. Any iterable is accepted and is
            materialised to a list so it can be sliced into batches; a single
            bare string is treated as one text rather than a sequence of
            characters.
        tokenizer: Tokenizer paired with ``model``. It is called on every
            batch and used to decode the generated ids.
        model: Model exposing ``.to(device)`` and ``.generate(...)``.
        batch_size: Number of texts sent through the model per step.
        max_length: Upper bound on the generated length, forwarded to
            ``model.generate`` (defaults to the former hard-coded 256).
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.generate`` (e.g. sampling options when varied outputs are
            wanted).

    Returns:
        list[str]: One decoded translation per input text, in input order.
    """
    # Normalise the input so slicing below always yields a list of strings.
    texts = [texts] if isinstance(texts, str) else list(texts)

    all_out = []
    # Uses the notebook-level `device`; moving an already-placed model is cheap.
    model = model.to(device)
    for start in tqdm(range(0, len(texts), batch_size), desc="Translating", leave=False):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length, **generate_kwargs)
        all_out.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return all_out

def back_translate(texts, batch_size=64, **generate_kwargs):
    """Paraphrase ``texts`` by translating English -> Hindi -> English.

    Args:
        texts: Source strings.
        batch_size: Batch size used for both translation legs.
        **generate_kwargs: Extra keyword arguments forwarded to ``translate``
            (and from there to ``model.generate``) for both legs.

    Returns:
        list[str]: The round-tripped texts, in input order.
    """
    mid = translate(texts, en2hi_tok, en2hi_model, batch_size, **generate_kwargs)
    return translate(mid, hi2en_tok, hi2en_model, batch_size, **generate_kwargs)

In [None]:
import pandas as pd
import numpy as np

def augment(df):
    """Oversample selected labels with Hindi round-trip back-translation.

    For every label in the plan, up to ``pick`` rows carrying that label are
    sampled (fixed ``random_state=42``) and back-translated up to ``times``
    times. A row selected for one label is not selected again for a later
    label. Rows whose ``text`` is missing are never selected, since a
    non-string entry is expected to break tokenisation.

    If a round reproduces the previous round's texts verbatim, the remaining
    rounds for that label are skipped: repeating them would only append exact
    duplicate rows while paying the full translation cost again.

    Args:
        df: Frame with a ``text`` column and one indicator column per label
            named in the plan. It is not modified.

    Returns:
        pandas.DataFrame: ``df`` followed by the augmented rows, with a fresh
        0..n-1 index. Augmented rows copy the source row's columns and get a
        synthetic ``id`` of the form ``{label}_aug_{round}_{position}``.
    """
    plan = {
        "sadness": {"pick": 500, "times": 1},
        "surprise": {"pick": 500, "times": 1},
        "joy": {"pick": 800, "times": 1},
        "anger": {"pick": 500, "times": 5},
    }

    augmented_rows = []
    used_indices = set()
    # Computed once: rows that actually have a text to translate.
    has_text = df["text"].notna()

    for label, cfg in plan.items():
        available = df[(df[label] == 1) & has_text & (~df.index.isin(used_indices))]
        if len(available) < cfg["pick"]:
            print(f"Not enough {label} samples ({len(available)}) - picking all available.")
            subset = available
        else:
            subset = available.sample(cfg["pick"], random_state=42)

        used_indices.update(subset.index)
        texts = subset["text"].tolist()
        print(f"{label}: picked {len(texts)} → {cfg['times']}× Hindi back-translation")

        if not texts:
            # Nothing to translate for this label.
            continue

        previous_texts = None
        for i in range(cfg["times"]):
            print(f"Round {i+1}/{cfg['times']}")
            aug_texts = back_translate(texts, batch_size=64)

            if aug_texts == previous_texts:
                # Same inputs gave the same outputs again; further rounds
                # would add nothing but duplicates.
                print(f"Round {i+1} repeated the previous round verbatim - skipping remaining rounds for {label}.")
                break
            previous_texts = aug_texts

            new_df = subset.copy()
            new_df["text"] = aug_texts
            new_df["id"] = [f"{label}_aug_{i}_{idx}" for idx in range(len(new_df))]
            augmented_rows.append(new_df)

    # Originals first, augmented rows after, re-indexed from zero.
    return pd.concat([df, *augmented_rows], ignore_index=True)


In [None]:
# Back-translation pass over the raw training frame (runs the translation models).
df_augment = augment(df=df)


sadness: picked 500 → 1× Hindi back-translation
Round 1/1




surprise: picked 500 → 1× Hindi back-translation
Round 1/1




joy: picked 800 → 1× Hindi back-translation
Round 1/1




Not enough anger samples (445) — picking all available.
anger: picked 445 → 5× Hindi back-translation
Round 1/5




Round 2/5




Round 3/5




Round 4/5


Translating:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
def augment_syn(df):
    """Oversample selected labels with BERT contextual word substitution.

    For every label in the plan, up to ``pick`` rows carrying that label are
    sampled (fixed ``random_state=42``) and passed through nlpaug's
    ``ContextualWordEmbsAug`` ``times`` times. A row selected for one label is
    not selected again for a later label, and rows whose ``text`` is missing
    are never selected.

    The augmenter runs on the GPU when PyTorch reports one and on the CPU
    otherwise, instead of assuming CUDA is always present.

    Args:
        df: Frame with a ``text`` column and one indicator column per label
            named in the plan. It is not modified.

    Returns:
        pandas.DataFrame: ``df`` followed by the augmented rows, with a fresh
        0..n-1 index. Augmented rows copy the source row's columns and get a
        synthetic ``id`` of the form ``{label}_bert_syn_{round}_{position}``.
    """
    plan = {
        "sadness": {"pick": 800, "times": 1},
        "surprise": {"pick": 800, "times": 1},
        "joy": {"pick": 1200, "times":1},
        "anger": {"pick": 500, "times": 5},
    }

    augmented_rows = []
    used_indices = set()
    # Computed once: rows that actually have a text to augment.
    has_text = df["text"].notna()

    aug = naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased',
        action='substitute',
        # Do not hard-code CUDA: CPU-only runtimes would fail at construction.
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )

    for label, cfg in plan.items():
        available = df[(df[label] == 1) & has_text & (~df.index.isin(used_indices))]
        if len(available) < cfg["pick"]:
            print(f"Not enough {label} samples ({len(available)}) - picking all available.")
            subset = available
        else:
            subset = available.sample(cfg["pick"], random_state=42)

        used_indices.update(subset.index)
        texts = subset["text"].tolist()
        print(f"{label}: picked {len(texts)} → {cfg['times']}× contextual synonym augmentation")

        if not texts:
            # Nothing to augment for this label.
            continue

        for i in range(cfg["times"]):
            print(f"Round {i+1}/{cfg['times']}")
            aug_texts = aug.augment(texts)

            new_df = subset.copy()
            new_df["text"] = aug_texts
            new_df["id"] = [f"{label}_bert_syn_{i}_{idx}" for idx in range(len(new_df))]
            augmented_rows.append(new_df)

    # Originals first, augmented rows after, re-indexed from zero.
    return pd.concat([df, *augmented_rows], ignore_index=True)

In [None]:
# Second pass: contextual substitution on top of the back-translated frame.
df_balanced = augment_syn(df=df_augment)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

The following layers were not sharded: bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.intermediate.dense.bias, cls.predictions.transform.LayerNorm.bias, bert.encoder.layer.*.attention.self.key.weight, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.output.dense.weight, bert.embeddings.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, cls.predictions.transform.dense.weight, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.encoder.layer.*.attention.self.value.weight, cls.predictions.transform.LayerNorm.weight, cls.predictions.decoder.bias, bert.embeddings.word_embeddings.weight, bert.encoder.layer.*.attention.self.value.bias, cls.predictions.decoder.weight, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.attention.output.dense.weight, cls.predictions.transform.dense.bia

sadness: picked 500 → 1× contextual synonym augmentation
Round 1/1
surprise: picked 500 → 1× contextual synonym augmentation
Round 1/1
joy: picked 800 → 1× contextual synonym augmentation
Round 1/1
anger: picked 500 → 5× contextual synonym augmentation
Round 1/5
Round 2/5
Round 3/5
Round 4/5
Round 5/5


In [None]:
# (rows, columns) of the combined frame after both augmentation passes.
df_balanced.shape

(13311, 8)

In [None]:
# Summary of the text column: non-null count, distinct texts, most frequent text.
df_balanced["text"].describe()

Unnamed: 0,text
count,13309
unique,11529
top,"Oh, I don't know... against the Bill of Power?"
freq,5


In [None]:
# Remove rows with missing text before de-duplicating: drop_duplicates treats
# all missing texts as one value and would keep the first such row, leaving a
# text-less example in the exported dataset.
# keep='first' retains the earliest occurrence of each remaining text.
df_balanced = (
    df_balanced
    .dropna(subset=['text'])
    .drop_duplicates(subset='text', keep='first')
)

In [None]:
# (rows, columns) again, now that duplicate texts have been dropped.
df_balanced.shape

(11530, 8)

In [None]:
# Persist the de-duplicated, augmented frame without the index column.
df_balanced.to_csv(path_or_buf="augmented_data.csv", index=False)

In [None]:
# Label columns to tally in the augmented frame.
label_cols = ['anger', 'fear', 'joy', 'sadness', 'surprise']

# Column sums per label, largest first.
label_counts = df_balanced[label_cols].sum().sort_values(ascending=False)

# Each label's share of the summed label counts, as a percentage with 2 decimals.
label_percent = label_counts.div(label_counts.sum()).mul(100).round(2)

# Assemble a flat summary table and show it.
label_table = pd.DataFrame(
    {
        'Label': label_counts.index,
        'Count': label_counts.values,
        'Percentage (%)': label_percent.values,
    }
)

print(label_table)


      Label  Count  Percentage (%)
0      fear   6488           30.86
1     anger   4034           19.18
2   sadness   3755           17.86
3  surprise   3648           17.35
4       joy   3102           14.75


In [None]:
# Sanity check: list any text that still occurs more than once.
duplicates = (
    df_balanced['text']
    .value_counts()
    .loc[lambda counts: counts > 1]
    .to_frame('count')
)
print(duplicates)


Empty DataFrame
Columns: [count]
Index: []
