In [None]:
!pip install textattack transformers sentencepiece pandas tqdm

Collecting textattack
  Downloading textattack-0.3.10-py3-none-any.whl.metadata (38 kB)
Collecting bert-score>=0.3.5 (from textattack)
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting flair (from textattack)
  Downloading flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting language-tool-python (from textattack)
  Downloading language_tool_python-2.9.3-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lemminflect (from textattack)
  Downloading lemminflect-0.2.3-py3-none-any.whl.metadata (7.0 kB)
Collecting lru-dict (from textattack)
  Downloading lru_dict-1.3.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting terminaltables (from textattack)
  Downloading terminaltables-3.1.10-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting word2number (from textattack)
 

In [None]:
# Mount Google Drive at /content/drive. The input dataset, the checkpoint file
# and the final output used by this notebook are all paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from textattack.augmentation import CLAREAugmenter
from tqdm import tqdm
import random
import os

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# === PATHS ===
original_path = "/content/drive/MyDrive/merged_question_intent_dataset.csv"  # raw input dataset
checkpoint_path = "/content/drive/MyDrive/augmented_backup.csv"  # augmented rows generated so far
final_output_path = "/content/drive/MyDrive/random_augmented_balanced_dataset.csv"  # original + augmented

# === CONFIG ===
# Each class gets a random target size drawn uniformly from this inclusive range.
TARGET_MIN = 1300
TARGET_MAX = 1800

# === LOAD DATA ===
original_df = pd.read_csv(original_path)

# Load the checkpoint if one exists; otherwise start with an empty frame that
# has the same two columns the checkpoint is written with.
try:
    augmented_df = pd.read_csv(checkpoint_path)
    print(f"✅ Resuming with {len(augmented_df)} augmented rows already present.")
except FileNotFoundError:
    augmented_df = pd.DataFrame(columns=["text", "intent"])
    print("⚠️ No backup found. Starting fresh.")
except pd.errors.EmptyDataError:
    # A zero-byte checkpoint (e.g. a write that was cut off) has no header to
    # parse; treat it like a missing checkpoint instead of crashing the resume.
    augmented_df = pd.DataFrame(columns=["text", "intent"])
    print("⚠️ Backup file is empty. Starting fresh.")

# === COUNT CURRENT SAMPLES PER CLASS ===
original_counts = original_df['intent'].value_counts().to_dict()
aug_counts = augmented_df['intent'].value_counts().to_dict()

# Pick target counts.
# The seed is fixed and the draws follow the key order of `original_counts`, so
# re-running this cell on the same raw dataset reproduces the same targets —
# which is what lets a resumed run compute a consistent "remaining" figure.
random.seed(42)  # For reproducibility
target_counts = {cls: random.randint(TARGET_MIN, TARGET_MAX) for cls in original_counts.keys()}

# Compute what's still needed: target minus original rows minus rows already
# present in the checkpoint. Classes at or above their target are left out.
classes_to_augment = {}
for cls, orig_count in original_counts.items():
    already_aug = aug_counts.get(cls, 0)
    remaining = target_counts[cls] - orig_count - already_aug
    if remaining > 0:
        classes_to_augment[cls] = remaining

if not classes_to_augment:
    print("🎉 All classes are already fully augmented. Nothing to do.")
else:
    print("\n📊 Remaining classes needing augmentation:")
    for cls, count in classes_to_augment.items():
        print(f" → {cls}: {count} more")


✅ Resuming with 706 augmented rows already present.

📊 Remaining classes needing augmentation:
 → symptoms: 41 more
 → disease manifestations: 264 more
 → cause: 743 more
 → definition: 592 more
 → usage: 577 more
 → method: 594 more
 → applicable disease: 588 more
 → treatment method: 903 more
 → ask more: 594 more


In [None]:
# === SETUP AUGMENTER ===
# One augmented variant is requested per input sentence.
augmenter = CLAREAugmenter(model="roberta-base", transformations_per_example=1)
grouped = original_df.groupby("intent")
# Start from the rows restored from the checkpoint so that every save rewrites
# the complete set of augmented rows. Columns are selected by name so a
# checkpoint with reordered or extra columns still yields (text, intent) pairs.
augmented_rows = augmented_df[["text", "intent"]].values.tolist()

# Number of newly generated rows between periodic checkpoint writes. The
# recorded run of this cell progressed at roughly 200 s per row, so a small
# interval keeps the amount of work lost on a disconnect low.
CHECKPOINT_EVERY = 5


def save_checkpoint(rows):
    """Overwrite the checkpoint CSV with every augmented row collected so far.

    Best-effort: an I/O failure is reported and swallowed so that a transient
    Drive error does not abort a long-running augmentation.
    """
    try:
        pd.DataFrame(rows, columns=["text", "intent"]).to_csv(checkpoint_path, index=False)
    except OSError as e:
        print(f"⚠️ Could not write checkpoint: {e}")


# === AUGMENT ONLY WHAT'S MISSING ===
try:
    for intent, needed in classes_to_augment.items():
        print(f"\n🔧 Augmenting '{intent}' → Need: {needed}")
        texts = grouped.get_group(intent)['text'].tolist()
        generated = 0

        with tqdm(total=needed, desc=f"Generating '{intent}'", ncols=100) as pbar:
            # Cycle over the class's source texts as many times as it takes.
            while generated < needed:
                generated_before_pass = generated

                for text in texts:
                    if generated >= needed:
                        break
                    try:
                        aug_texts = augmenter.augment(text)
                    except Exception as e:
                        print(f"⚠️ Error on: {e}")
                        continue
                    if not aug_texts:
                        continue

                    augmented_rows.append((aug_texts[0], intent))
                    generated += 1
                    pbar.update(1)

                    # Periodic save while a class is in progress.
                    if generated % CHECKPOINT_EVERY == 0:
                        save_checkpoint(augmented_rows)

                # A full pass that produced nothing (no source texts, or every
                # call failed / returned empty) would otherwise loop forever.
                if generated == generated_before_pass:
                    print(f"⚠️ No augmentations produced for '{intent}' in a full pass; "
                          f"stopping at {generated}/{needed}.")
                    break

        # Persist the tail of this class; without this, rows generated after
        # the last periodic save (or a class smaller than the interval) are lost.
        save_checkpoint(augmented_rows)
finally:
    # Also save when the cell is interrupted or fails part-way through a class.
    save_checkpoint(augmented_rows)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


🔧 Augmenting 'symptoms' → Need: 41


Generating 'symptoms':   0%|                                                 | 0/41 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/72.9M [00:00<?, ?B/s]

2025-05-21 14:29:53,706 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, VERB, PUNCT, ADP, DET, PROPN, PRON, ADJ, ADV, CCONJ, PART, NUM, AUX, INTJ, SYM, X, <START>, <STOP>


Generating 'symptoms':  66%|████████████████████████▎            | 27/41 [1:58:28<46:17, 198.37s/it]

In [None]:
# === SAVE FINAL MERGED DATA ===
# Combine the original rows with all augmented rows, shuffle, and write the result.
aug_df = pd.DataFrame(augmented_rows, columns=["text", "intent"])
# random_state fixes the shuffle so re-running this cell on the same rows
# produces the same file; pandas sampling is not affected by random.seed().
final_df = (
    pd.concat([original_df, aug_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
final_df.to_csv(final_output_path, index=False)

print(f"\n✅ Done! Final dataset saved to:\n📁 {final_output_path}")
print(f"📊 Total records: {len(final_df)}")
