In [None]:
! pip install transformers datasets
! pip install --upgrade transformers

import os
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from transformers import MarianMTModel, MarianTokenizer
from openai import OpenAI
import time
import logging
import json
import argparse
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Union, Tuple
import re

from datasets import Dataset as HFDataset


# Setup logging
# Root logger is configured at INFO level; each record is prefixed with
# timestamp, logger name and level. `logger` is the logger for this module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download necessary NLTK data
# These calls fetch the 'punkt', 'wordnet' and 'omw-1.4' packages at import
# time, so running this cell needs network access the first time.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
###############################################
# DATA STRUCTURES
###############################################

@dataclass
class TextSample:
    """A single text sample with its class label"""
    # The example's text as loaded from the source data.
    text: str
    # Class label; kept in whatever form the source data uses (int or str).
    label: Union[int, str]
    # Optional transformed variant of `text`; None when no variant exists.
    transformed: Optional[str] = None





@dataclass
class TextDataset:
    """A collection of text samples with metadata.

    Attributes are documented inline below. ``headings`` has a default so
    that every constructor call in this class (``load_from_json``,
    ``balance_classes``) works without supplying it explicitly.
    """
    # The samples held by this dataset (TextSample instances).
    samples: List["TextSample"]
    # Known class labels / display names.
    class_names: List[str]
    # Human-readable dataset name, used when deriving names of new datasets.
    name: str
    # Comma-separated column header line, e.g. "label, text".
    headings: str = "label, text"

    def get_class_distribution(self):
        """Return a dict mapping each label to its number of samples."""
        distribution = {}
        for sample in self.samples:
            distribution[sample.label] = distribution.get(sample.label, 0) + 1
        return distribution

    def get_samples_by_class(self):
        """Return a dict mapping each label to the list of its samples."""
        samples_by_class = {}
        for sample in self.samples:
            samples_by_class.setdefault(sample.label, []).append(sample)
        return samples_by_class

    def to_dataframe(self):
        """Convert to a pandas DataFrame with ``text`` and ``label`` columns.

        The ``transformed`` field is not exported. The columns are named
        explicitly so an empty dataset still yields a frame with both columns.
        """
        return pd.DataFrame(
            [{"text": s.text, "label": s.label} for s in self.samples],
            columns=["text", "label"],
        )

    @staticmethod
    def from_dataframe(df, text_col="text", label_col="label", name="custom_dataset", class_names=None):
        """Create a dataset from a pandas DataFrame.

        Args:
            df: Source frame containing ``text_col`` and ``label_col``.
            text_col: Name of the column holding the text.
            label_col: Name of the column holding the label.
            name: Name given to the resulting dataset.
            class_names: Explicit class names; when None they are the sorted
                unique values of ``label_col``.

        The first column whose name contains "transform" (if any) populates
        ``TextSample.transformed``; missing or empty values become None.
        """
        # str(col) keeps the search working for non-string column labels.
        trans_col_name = next((col for col in df.columns if "transform" in str(col)), None)

        if trans_col_name is None:
            samples = [
                TextSample(text=text, label=label)
                for text, label in zip(df[text_col], df[label_col])
            ]
        else:
            samples = [
                TextSample(
                    text=text,
                    label=label,
                    transformed=trans if pd.notna(trans) and trans != "" else None,
                )
                for text, label, trans in zip(df[text_col], df[label_col], df[trans_col_name])
            ]

        headings = f"{label_col}, {text_col}"
        if class_names is None:
            class_names = sorted(df[label_col].unique().tolist())
        return TextDataset(samples=samples, class_names=class_names, name=name, headings=headings)

    def save_to_json(self, filepath):
        """Save the dataset (samples, class names, name, headings) to JSON."""
        payload = {
            "samples": [
                {"text": s.text, "label": s.label, "transformed": s.transformed}
                for s in self.samples
            ],
            "class_names": self.class_names,
            "name": self.name,
            "headings": self.headings,
        }
        with open(filepath, 'w', encoding="utf-8") as f:
            json.dump(payload, f)

    @classmethod
    def load_from_json(cls, filepath):
        """Load a dataset written by ``save_to_json``.

        Files that lack the ``transformed`` / ``headings`` keys (written by
        earlier versions of ``save_to_json``) are still accepted.
        """
        with open(filepath, 'r', encoding="utf-8") as f:
            data = json.load(f)
        samples = [
            TextSample(text=s["text"], label=s["label"], transformed=s.get("transformed"))
            for s in data["samples"]
        ]
        return cls(
            samples=samples,
            class_names=data["class_names"],
            name=data["name"],
            headings=data.get("headings", "label, text"),
        )

    def balance_classes(self, strategy="oversample", ratio=1.0):
        """Balance dataset classes using the specified strategy

        Args:
            strategy: "oversample" (duplicate minority class samples) or
                     "undersample" (remove majority class samples)
            ratio: For oversampling - what ratio of the majority class to aim for
                   For undersampling - what ratio of the minority class to aim for

        Returns:
            A new dataset with balanced classes (same class names and headings).

        Raises:
            ValueError: if ``strategy`` is not one of the two supported values.
        """
        if strategy not in ("oversample", "undersample"):
            raise ValueError(f"Unknown balancing strategy: {strategy}")

        samples_by_class = self.get_samples_by_class()
        new_samples = []

        # An empty dataset has no class sizes to compare; max()/min() below
        # would raise, so it is returned as an (empty) balanced dataset.
        if samples_by_class:
            class_sizes = [len(group) for group in samples_by_class.values()]

            if strategy == "oversample":
                target_size = int(max(class_sizes) * ratio)
                for group in samples_by_class.values():
                    new_samples.extend(group)
                    shortfall = target_size - len(group)
                    if shortfall > 0:
                        # Draw with replacement: the shortfall may exceed the class size.
                        new_samples.extend(random.choices(group, k=shortfall))
            else:
                target_size = int(min(class_sizes) * ratio)
                for group in samples_by_class.values():
                    if len(group) > target_size:
                        new_samples.extend(random.sample(group, target_size))
                    else:
                        new_samples.extend(group)

        return TextDataset(
            samples=new_samples,
            class_names=self.class_names,
            name=f"{self.name}_{strategy}d",
            headings=self.headings,
        )

    def get_column_headline_names(self) -> str:
        """
        Return your dataset's column names as a comma-separated string.
        E.g. if you loaded from columns `isFake` and `text`, you'll get:
            "isFake, text"
        """
        return self.headings

    def split(self, test_size=0.2, val_size=0, random_state=42, stratify=True):
        """Split dataset into train/val/test sets

        Args:
            test_size: Fraction of all samples placed in the test set.
            val_size: Fraction of all samples placed in the validation set;
                0 (the default) yields an empty validation dataset.
            random_state: Seed passed to ``train_test_split``.
            stratify: Whether to stratify both splits on the label.

        Returns:
            Tuple of (train_dataset, val_dataset, test_dataset)

        Note:
            The splits are built from ``to_dataframe()``, so the
            ``transformed`` field of the samples is not carried over.
        """
        df = self.to_dataframe()

        # Split into train+val and test
        train_val_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
            stratify=df['label'] if stratify else None,
        )

        if val_size:
            # val_size is expressed relative to the full dataset, so rescale
            # it to the train+val portion that remains after the test split.
            train_df, val_df = train_test_split(
                train_val_df,
                test_size=val_size / (1 - test_size),
                random_state=random_state,
                stratify=train_val_df['label'] if stratify else None,
            )
        else:
            # train_test_split rejects a zero-sized split, so build the
            # empty validation frame directly.
            train_df, val_df = train_val_df, train_val_df.iloc[0:0]

        def _part(frame, suffix):
            part = TextDataset.from_dataframe(
                frame,
                name=f"{self.name}_{suffix}",
                class_names=self.class_names,
            )
            # Keep the original column header rather than the generic
            # "label, text" that from_dataframe derives from to_dataframe().
            part.headings = self.headings
            return part

        return _part(train_df, "train"), _part(val_df, "val"), _part(test_df, "test")


class LLMPromptGenerator:
    """Common interface for objects that build LLM prompts for synthetic data generation."""

    def generate_prompt(self, description: str, dataset: Dataset, target_class=None, n = 5, num_examples=5) -> str:
        """Build and return the prompt text.

        The base implementation is a placeholder that always raises;
        concrete generators are expected to override it.
        """
        raise NotImplementedError()

    def get_name(self) -> str:
        """Return the name of the concrete generator class."""
        return type(self).__name__

class GenericPromptGenerator(LLMPromptGenerator):
    """Generate generic prompts for synthetic data generation"""
    def generate_prompt(self,
                        description: str,
                        dataset: Dataset,
                        target_class: Optional[int] = None,
                        num_row: int = 5,
                        num_examples: int = 5) -> str:
        """
        Build a few-shot prompt that interleaves examples of every class.

        description: a free‐form description of the task (first prompt line)
        dataset: dataset providing get_samples_by_class() and
                 get_column_headline_names()
        target_class: accepted for interface compatibility; this generator
                      always emits examples of every class
        num_row: examples drawn per class in each block (capped at the
                 number of samples the class actually has)
        num_examples: how many blocks to repeat

        Each example line is "<label>_label, <first 150 words of the text>";
        the column header is repeated after every block to cue the model.
        """
        samples_by_class = dataset.get_samples_by_class()
        header = dataset.get_column_headline_names()

        lines = [description, header]
        for _ in range(num_examples):
            for cls_name, examples in samples_by_class.items():
                # random.sample raises if asked for more items than exist,
                # so small classes contribute everything they have.
                picked = random.sample(examples, min(num_row, len(examples)))
                lines.extend(
                    f"{cls_name}_label, {' '.join(ex.text.split()[:150])}"
                    for ex in picked
                )
            # repeat header to cue the model
            lines.append(header)

        return "\n".join(lines) + "\n"

class SeparateClassPromptGenerator(LLMPromptGenerator):
    """Class wise grouping prompts for synthetic data generation"""
    def generate_prompt(self,
                        description: str,
                        dataset: Dataset,
                        target_class: Optional[int] = None,
                        num_row: int = 5,
                        num_examples: int = 5) -> str:
        """
        Build a few-shot prompt whose blocks are grouped class by class.

        description: a free‐form description of the task (first prompt line)
        dataset: dataset providing get_samples_by_class() and
                 get_column_headline_names()
        target_class: accepted for interface compatibility; this generator
                      always emits examples of every class
        num_row: examples drawn in each block (capped at the number of
                 samples the class actually has)
        num_examples: how many blocks to emit per class

        All blocks of one class are emitted before the next class starts;
        the column header is repeated after every block.
        """
        samples_by_class = dataset.get_samples_by_class()
        header = dataset.get_column_headline_names()

        lines = [description, header]
        for cls_name, examples in samples_by_class.items():
            # random.sample raises if asked for more items than exist.
            block_size = min(num_row, len(examples))
            for _ in range(num_examples):
                lines.extend(
                    f"{cls_name}_label, {' '.join(ex.text.split()[:150])}"
                    for ex in random.sample(examples, block_size)
                )
                lines.append(header)

        return "\n".join(lines) + "\n"



class TechniquePromptGenerator(LLMPromptGenerator):
    """Instruction reverse engineering prompts instructing LLMs to use specific techniques"""
    def generate_prompt(self,
                        description: str,
                        dataset: Dataset,
                        target_class: Optional[int] = None,
                        num_row: int = 5,
                        num_examples: int = 5) -> str:
        """
        Build a "Modified -> Original" few-shot prompt for one class.

        description: a free‐form description of the task (first prompt line)
        dataset: dataset providing get_samples_by_class(); samples of the
                 target class need a non-empty `transformed` text
        target_class: label whose samples are used (required in practice;
                      an unknown label raises KeyError)
        num_row: accepted for interface compatibility, unused here
        num_examples: number of Modified/Original demonstration pairs

        The prompt ends with one further "Modified:" text and an open
        "Original:" slot for the model to fill in.

        Raises:
            KeyError: if the dataset has no samples labelled `target_class`.
            ValueError: if none of those samples has a transformed text.
        """
        examples = dataset.get_samples_by_class()[target_class]

        # Samples without a transformed text cannot form a Modified/Original pair.
        usable = [ex for ex in examples if ex.transformed]
        if not usable:
            raise ValueError(
                f"No samples of class {target_class!r} have a transformed text"
            )

        shot_count = max(num_examples, 0)
        if len(usable) > shot_count:
            # Draw without replacement so the query is never one of the
            # demonstrations (which would hand the model the answer).
            *shots, query = random.sample(usable, shot_count + 1)
        else:
            # Too few samples for distinct picks: fall back to drawing with replacement.
            shots = random.choices(usable, k=shot_count)
            query = random.choice(usable)

        parts = [f"{description}\n"]
        for ex in shots:
            parts.append(
                f"Modified: {' '.join(ex.transformed.split()[:150])}\n Original: {' '.join(ex.text.split()[:150])}\n"
            )
        parts.append(
            f"Based on the examples above, identify the original based on this modification. The original is a text classifcation example of '{target_class}'. Return only the original text and do NOT copy the modified version:\n Modified: {' '.join(query.transformed.split()[:150])} \n Original: "
        )

        return "".join(parts)


class MinorityClassPromptGenerator(LLMPromptGenerator):
    """Minority class only, instructing LLMs to use specific techniques"""

    def generate_prompt(self,
                        description: str,
                        dataset: Dataset,
                        target_class: Optional[int] = None,
                        num_row: int = 1,
                        num_examples: int = 1) -> str:
        """
        Build a prompt containing examples of a single class.

        description: a free‐form description of the task (first prompt line)
        dataset: dataset providing get_samples_by_class()
        target_class: label whose samples are shown (an unknown label,
                      including the default None, raises KeyError)
        num_row: accepted for interface compatibility, unused here
        num_examples: number of example lines; each is drawn independently,
                      so the same sample may appear more than once

        Each example line is "<target_class>_label, <first 150 words>".
        """
        examples = dataset.get_samples_by_class()[target_class]

        lines = [description]
        for _ in range(num_examples):
            ex = random.choice(examples)
            lines.append(f"{target_class}_label, {' '.join(ex.text.split()[:150])}")

        return "\n".join(lines) + "\n"


class InstructionChoicePromptGenerator(LLMPromptGenerator):
    """Instruction choice for specific techniques"""

    # Menu of augmentation strategies appended to every prompt. The text
    # (including its indentation) is sent to the model verbatim.
    _STRATEGY_MENU = """
        [BackTranslation]: Rewrite the sentence as if it had been translated into another language and back into English, ensuring meaning is preserved but phrasing is changed

        [WordDeletion]: Rewrite the sentence by removing at least two non-essential words while keeping its overall meaning

        [WordSwap]: Rewrite the sentence by swapping at least two essential words while preserving overall meaning

        [SynonymReplacement]: Rewrite the sentence by replacing at least two essential words with synonyms

        Now, use any of the strategies above to generate an augmented version of:\n
        """

    def generate_prompt(self,
                        description: str,
                        dataset: Dataset,
                        target_class: Optional[int] = None,
                        num_row: int = 1,
                        num_examples: int = 1) -> str:
        """
        Build a prompt asking the model to augment one sample of a class.

        description: a free‐form description of the task (first prompt line)
        dataset: dataset providing get_samples_by_class()
        target_class: label from which one sample is drawn at random (an
                      unknown label, including the default None, raises
                      KeyError)
        num_row: accepted for interface compatibility, unused here
        num_examples: accepted for interface compatibility, unused here

        The prompt lists the available strategies, then the sample's first
        150 words as "Original:", and ends with an open "Modified:" slot.
        """
        examples = dataset.get_samples_by_class()[target_class]
        ex = random.choice(examples)

        return (
            f"{description}\n"
            + self._STRATEGY_MENU
            + f"Original: {' '.join(ex.text.split()[:150])}\n Modified:"
        )



class LLMSyntheticDataGenerator:
    """Generate synthetic data using LLMs"""

    # parse_type values understood by parse_llm_response.
    _LABELLED_PARSE_TYPES = ("minority", "separate class", "generic")
    _ALL_PARSE_TYPES = _LABELLED_PARSE_TYPES + ("single",)

    def __init__(self, prompt_generator: LLMPromptGenerator, api_key=None, model="gpt-3.5-turbo"):
        """
        Args:
            prompt_generator: Object whose generate_prompt() builds each prompt.
            api_key: OpenAI API key. When None, the OpenAI client falls back
                to the OPENAI_API_KEY environment variable.
            model: Chat model name sent with every request.
        """
        self.prompt_generator = prompt_generator
        self.model = model

        # Configure API: the key is supplied by the caller or the environment
        # rather than being hard-coded in the source.
        self.client = OpenAI(api_key=api_key)

    def parse_llm_response(self, response_text, target_class_name, parse_type = "generic") -> List[TextSample]:
        """Extract generated samples from the raw LLM response text.

        Args:
            response_text: Text returned by the model (None is treated as "").
            target_class_name: Label assigned to every parsed sample.
            parse_type: "generic", "minority" or "separate class" keep the
                remainder of each line that starts with
                "<target_class_name>_label,"; "single" treats every non-blank
                line as one sample after removing a leading "[Tag]: " or
                "Word: " prefix.

        Returns:
            List of TextSample, possibly empty.

        Raises:
            ValueError: if parse_type is not one of the values above.
        """
        response_text = response_text or ""

        if parse_type in self._LABELLED_PARSE_TYPES:
            # str() because labels may be ints, and the prompt generators
            # render them with f-strings as "<label>_label".
            pattern = rf'^{re.escape(str(target_class_name))}_label,\s*(.+)'
            texts = [m.strip() for m in re.findall(pattern, response_text, re.MULTILINE)]
        elif parse_type == "single":
            texts = []
            for line in response_text.splitlines():
                cleaned = re.sub(r'^(?:\[[^\]]+\]|\w+):\s*', '', line.strip())
                if cleaned:
                    texts.append(cleaned)
        else:
            raise ValueError(f"Unknown parse_type: {parse_type}")

        return [TextSample(text=text, label=target_class_name) for text in texts]

    def _request_completion(self, prompt, max_retries):
        """Send one prompt; return the response text, or None if every attempt failed."""
        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant that generates synthetic text data for classification tasks."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=3000,
                    temperature=0.7
                )
                return response.choices[0].message.content
            except Exception as e:
                # Best effort: any API failure is logged and retried.
                logger.error(f"Error generating samples: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2)  # Wait before retrying
        return None

    def generate(self, dataset: Dataset, description = "", target_class=None, num_examples_per_prompt=5, max_retries=3, num_calls = 2, num_row = 2, parse_type = "generic") -> "Tuple[TextDataset, str]":
        """Generate synthetic data using LLM

        Args:
            dataset: Source dataset; its samples seed the prompts.
            description: Task description placed at the top of each prompt.
            target_class: Label passed to the prompt generator and assigned
                to every generated sample.
            num_examples_per_prompt: Forwarded to generate_prompt as num_examples.
            max_retries: Attempts per call before that call is given up.
            num_calls: Number of prompts sent to the model.
            num_row: Forwarded to generate_prompt.
            parse_type: How responses are parsed (see parse_llm_response).

        Returns:
            Tuple of (dataset containing the original plus generated samples,
            lower-cased name of the prompt generator).

        Raises:
            ValueError: if parse_type is unknown (checked before any API call).
        """
        # Fail fast instead of spending API calls on responses that cannot be parsed.
        if parse_type not in self._ALL_PARSE_TYPES:
            raise ValueError(f"Unknown parse_type: {parse_type}")

        new_samples = []

        for call in range(num_calls):
            print(f'Call {call}')

            prompt = self.prompt_generator.generate_prompt(
                description,
                dataset,
                target_class= target_class,
                num_row = num_row,
                num_examples=num_examples_per_prompt
            )

            response_text = self._request_completion(prompt, max_retries)
            if response_text is None:
                continue

            # Parsing happens outside the retry loop so a parsing problem is
            # not mistaken for an API failure and retried.
            generated_samples = self.parse_llm_response(response_text, target_class, parse_type)
            print(len(generated_samples))
            print('response', generated_samples)
            print(prompt)

            new_samples.extend(generated_samples)

        # Create new dataset with original + synthetic samples
        all_samples = dataset.samples + new_samples
        prompt_generator_name = self.prompt_generator.get_name().lower()
        new_dataset = TextDataset(samples=all_samples, class_names=dataset.class_names, name=f"{dataset.name}_{prompt_generator_name}", headings = dataset.headings)
        return new_dataset, prompt_generator_name

In [None]:
def load_dataset_from_csv(df, text_col="text", label_col="label", class_names=None, name=None):
    """Load a TextDataset from a CSV file or an already-loaded DataFrame.

    Args:
        df: Either a pandas DataFrame or a path (str / os.PathLike) to a CSV
            file, which is read with ``pd.read_csv``.
        text_col: Name of the column holding the text.
        label_col: Name of the column holding the label.
        class_names: Explicit class names; when None they are the sorted
            unique label values converted to strings.
        name: Dataset name. When None it defaults to the CSV file's base
            name (without extension) for a path, else "custom_dataset".

    Returns:
        The TextDataset built by ``TextDataset.from_dataframe``.

    Raises:
        ValueError: if ``text_col`` or ``label_col`` is not a column.
    """
    if isinstance(df, (str, os.PathLike)):
        csv_path = os.fspath(df)
        df = pd.read_csv(csv_path)
        if name is None:
            name = os.path.splitext(os.path.basename(csv_path))[0]
    if name is None:
        name = "custom_dataset"

    # Ensure text and label columns exist
    if text_col not in df.columns:
        raise ValueError(f"Text column '{text_col}' not found in CSV.")
    if label_col not in df.columns:
        raise ValueError(f"Label column '{label_col}' not found in CSV.")

    # Extract class names if not provided
    if class_names is None:
        # Make sure class names are strings
        class_names = [str(c) for c in sorted(df[label_col].unique())]

    # Use the from_dataframe method
    return TextDataset.from_dataframe(df, text_col=text_col, label_col=label_col,
                                      name=name, class_names=class_names)





In [None]:
# load_dataset_from_csv inspects `df.columns`, so hand it a DataFrame
# rather than the bare file name.
dataset = load_dataset_from_csv(pd.read_csv("imdb.csv"), text_col="review", label_col="sentiment", name="imdb")


In [None]:
# Prompt builder that emits examples of every class in each block.
prompt_gen = GenericPromptGenerator()

In [None]:
# Wrap the prompt builder in the OpenAI-backed generator, using the gpt-3.5-turbo chat model.
llm_gen = LLMSyntheticDataGenerator(prompt_gen, model="gpt-3.5-turbo")

In [None]:
# generate() returns (augmented dataset, lower-cased prompt generator name);
# note that the second value, bound to `responses` here, is that name and
# not the raw LLM responses.
augmented_dataset, responses = llm_gen.generate(dataset, description= "generate synthetic data for sentiment classifcation with the following movie reviews", target_class="negative", num_examples_per_prompt=3, num_calls = 2, num_row = 2)


prommpt generate synthetic data for sentiment classifcation with the following movie reviews
review,sentiment
A.
positive, What I liked best about this feature-length animated film from 1941 is the great feel it gives for the early 1940s. It's the songs, the clothing, automobiles, buildings lingo of the day, etc. You feel like you've stepped back into time.<br /><br />From reading some of the reviews here, I see this was a hard-luck film, being released a couple of days before the Pearl Harbor attack. Wow, no one would be interested in going to the movies for a
A.
positive, "Homeward Bound: The Incredible Journey" is one of those wonderful old movies about house pets. Deserves a place among the great movies of its genre and even the cinema world in general, together with other animal movies like "Old Yeller", "Napoleon", "Fluke" and "Air Bud". This means it is more than just a movie about pets.<br /><br />Can this possibly be just a "remake"? It is too good to be a "remake"! I know thi

In [None]:

def _split_train_test(df, label_col, target_class, test_size, random_state, transformed_split):
    """Return (train_df, test_df) for the augmentation experiment.

    With ``transformed_split`` False this is a plain stratified split. With
    it True, rows of ``target_class`` go to train when their
    'transformed_text' value is present and to test when it is missing; all
    other classes are split stratified. Both frames are then shuffled.
    """
    if not transformed_split:
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            stratify=df[label_col],
            random_state=random_state
        )
        return train_df, test_df

    # 1. Separate the target class from the rest
    minority_df = df[df[label_col] == target_class]
    other_df = df[df[label_col] != target_class]

    # 2. Split the target class based on transformed_text presence
    has_transform = minority_df['transformed_text'].notna()
    minority_train = minority_df[has_transform]
    minority_test = minority_df[~has_transform]

    # 3. Stratified split on other classes
    other_train, other_test = train_test_split(
        other_df,
        test_size=test_size,
        stratify=other_df[label_col],
        random_state=random_state
    )

    # 4. Combine and shuffle
    train_df = pd.concat([minority_train, other_train], ignore_index=True)
    test_df = pd.concat([minority_test, other_test], ignore_index=True)
    train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print("TechniquePromptGenerator.")
    return train_df, test_df


def _classification_metrics(pred):
    """Compute accuracy, macro F1 and per-class precision/recall/F1 for the Trainer."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    report = classification_report(labels, preds, output_dict=True, zero_division=0)

    # Build the metrics dictionary
    metrics = {
        "accuracy": report["accuracy"],
        "f1_macro": report["macro avg"]["f1-score"]
    }

    for class_label, scores in report.items():
        if class_label not in ["accuracy", "macro avg", "weighted avg"]:
            metrics[f"precision_{class_label}"] = scores["precision"]
            metrics[f"recall_{class_label}"] = scores["recall"]
            metrics[f"f1_{class_label}"] = scores["f1-score"]

    print("\nClassification Report:\n")
    print(classification_report(labels, preds, zero_division=0))

    return metrics


def split_and_test(filepath, label_col='label', text_col = 'text', test_size=0.2, random_state=42, prompt_gen = None, description = "",  num_examples_per_prompt=3, num_calls = 2, target_class="negative", num_row = 3, parse_type = "generic"):
    """
    Run the augmentation experiment end to end on one CSV file.

    Steps: read the CSV, split it into train/test, add LLM-generated samples
    of ``target_class`` to the training split, write the augmented training
    data to ``augmented_train.csv_<csv file name>`` in the working directory,
    then fine-tune ``bert-base-uncased`` on it, evaluating on the test split
    after every epoch.

    Args:
        filepath: Path of the input CSV file.
        label_col (str): The name of the column containing class labels.
        text_col (str): The name of the column containing the text.
        test_size (float): Fraction of data to be used as the test set.
        random_state (int, optional): Random seed for the split and shuffles.
        prompt_gen: Prompt generator instance (required). With a
            TechniquePromptGenerator the CSV must have a 'transformed_text'
            column, which decides the train/test membership of
            ``target_class`` rows.
        description (str): Task description placed at the top of each prompt.
        num_examples_per_prompt (int): Forwarded to the LLM generator.
        num_calls (int): Number of LLM requests.
        target_class: Label for which synthetic samples are generated.
        num_row (int): Forwarded to the prompt generator.
        parse_type (str): How LLM responses are parsed.

    Returns:
        updated_train_df (pd.DataFrame): Shuffled augmented training set with
            columns ``text`` and ``label`` (labels as integer ids).
        test_df (pd.DataFrame): Test split with an integer ``label`` column.

    Raises:
        ValueError: if ``prompt_gen`` is None.
    """
    if prompt_gen is None:
        raise ValueError("prompt_gen must be a prompt generator instance")

    df = pd.read_csv(filepath)
    train_df, test_df = _split_train_test(
        df, label_col, target_class, test_size, random_state,
        transformed_split=isinstance(prompt_gen, TechniquePromptGenerator),
    )

    dataset = load_dataset_from_csv(train_df, text_col=text_col, label_col=label_col)
    llm_gen = LLMSyntheticDataGenerator(prompt_gen, model="gpt-3.5-turbo")

    augmented_dataset, _ = llm_gen.generate(dataset, description= description, target_class=target_class, num_examples_per_prompt= num_examples_per_prompt, num_calls = num_calls, num_row = num_row, parse_type = parse_type)
    updated_train_df = augmented_dataset.to_dataframe() # columns are just text and label
    # Only the file name is appended: a path with directories would otherwise
    # point into a directory that does not exist.
    updated_train_df.to_csv(f'augmented_train.csv_{os.path.basename(filepath)}', index=False)
    updated_train_df = updated_train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Map labels to integer ids
    label_list = sorted(df[label_col].unique())
    label2id = {label: idx for idx, label in enumerate(label_list)}

    # Apply the mapping to both train and test
    updated_train_df["label"] = updated_train_df["label"].map(label2id)
    # Work on a copy: test_df may be a slice of df.
    test_df = test_df.copy()
    test_df["label"] = test_df[label_col].map(label2id)

    # Tokenize and convert to HuggingFace Dataset
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def tokenize_fn(examples):
        return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=128)

    # Give the evaluation set the same two-column layout as the training
    # set, so no column rename is needed whatever text_col is called.
    eval_df = pd.DataFrame({
        "text": test_df[text_col].tolist(),
        "label": test_df["label"].tolist(),
    })

    hf_train_dataset = HFDataset.from_pandas(updated_train_df).map(tokenize_fn, batched=True)
    hf_test_dataset = HFDataset.from_pandas(eval_df).map(tokenize_fn, batched=True)

    hf_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    hf_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    # Model and Trainer. Label names are stored as strings in the model
    # config, so non-string labels (e.g. numpy ints) do not end up in it.
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(label2id),
        label2id={str(label): idx for label, idx in label2id.items()},
        id2label={idx: str(label) for label, idx in label2id.items()}
    )

    training_args = TrainingArguments(
        output_dir="output_dir",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=2,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=250,
        disable_tqdm=False,
        learning_rate=2e-5,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train_dataset,
        eval_dataset=hf_test_dataset,
        compute_metrics=_classification_metrics,
    )

    trainer.train()

    return updated_train_df, test_df

In [None]:
#split_and_test("hate_speech.csv", label_col='detection', text_col = 'speech', test_size=0.2, random_state=42, prompt_gen = GenericPromptGenerator(), description = "generate synthetic data for hate speech detection, here are example classifcations:",  num_examples_per_prompt=3, num_calls = 1, target_class="Hate", num_row = 2)

Map:   0%|          | 0/962 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# hate.csv, GenericPromptGenerator, target class "Hate".
updated_train_df, test_df = split_and_test(
    "hate.csv",
    label_col="detection",
    text_col="speech",
    test_size=0.2,
    random_state=42,
    prompt_gen=GenericPromptGenerator(),
    description="Generate synthetic data for Hate_label based on the examples, keeping the same format.",
    num_examples_per_prompt=2,
    num_calls=160,
    target_class="Hate",
    num_row=1,
)

Call 0
2
response [TextSample(text='Those people are just a bunch of criminals and should be sent back to where they came from .', label='Hate', transformed=None), TextSample(text='These immigrants are just a drain on our resources and should be deported immediately .', label='Hate', transformed=None)]
Generate synthetic data for Hate_label based on the examples, keeping the same format.
detection, speech
no hate_label, I think having free healthcare and free education should be a given if you 're a citizen of the country you 're in and you 're not an undesirable .
Hate_label, No wonder this races achieve nothing .
detection, speech
no hate_label, I think they are just desperately trying to be more popish than the Pope himself and avoid accusations of racism before they are even expressed by demanding increased immigration .
Hate_label, Why would you want to see this pollution spread further into our gene pool ?
detection, speech

Call 1
2
response [TextSample(text="It's about time we 

Map:   0%|          | 0/1231 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.413517,0.8,0.682505,0.425926,0.575,0.489362,0.908602,0.845,0.875648
2,No log,0.369678,0.845833,0.679086,0.555556,0.375,0.447761,0.882629,0.94,0.910412



Classification Report:

              precision    recall  f1-score   support

           0       0.43      0.57      0.49        40
           1       0.91      0.84      0.88       200

    accuracy                           0.80       240
   macro avg       0.67      0.71      0.68       240
weighted avg       0.83      0.80      0.81       240


Classification Report:

              precision    recall  f1-score   support

           0       0.56      0.38      0.45        40
           1       0.88      0.94      0.91       200

    accuracy                           0.85       240
   macro avg       0.72      0.66      0.68       240
weighted avg       0.83      0.85      0.83       240



In [None]:
# emotion.csv, GenericPromptGenerator, target class "surprise".
updated_train_df, test_df = split_and_test(
    "emotion.csv",
    label_col="label",
    text_col="text",
    test_size=0.2,
    random_state=42,
    prompt_gen=GenericPromptGenerator(),
    description="Generate two synthetic data for surprise_label based on the examples, keeping the same format.",
    num_examples_per_prompt=2,
    num_calls=80,
    target_class="surprise",
    num_row=2,
)

Call 0
2
response [TextSample(text="i couldn't believe my eyes when I saw the surprise party my friends threw for me", label='surprise', transformed=None), TextSample(text='feeling shocked and amazed at the news of my promotion at work', label='surprise', transformed=None)]
Generate two synthetic data for surprise_label based on the examples, keeping the same format.
label, text
anger_label, i know we create our own destiny but do you ever feel resentful for the way your life turned out
anger_label, i didnt know whether or not to feel flattered or some sort of disgusted
fear_label, i have moments where i just feel so overwhelmed that my eyes well up with tears
fear_label, i didnt feel threatened at all by the people like i would have for the first minutes walking in indonesia
love_label, i always thought loving someone is the greatest feeling but i realized that loving a friend is even better
love_label, i am not feeling horny im just letting baba see the emote
sadness_label, i feel de

Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1,Precision 2,Recall 2,F1 2,Precision 3,Recall 3,F1 3,Precision 4,Recall 4,F1 4,Precision 5,Recall 5,F1 5
1,No log,1.67049,0.3125,0.305888,0.857143,0.2,0.324324,0.264706,0.15,0.191489,0.454545,0.5,0.47619,0.338235,0.383333,0.359375,0.361111,0.216667,0.270833,0.127451,0.65,0.213115
2,No log,1.466644,0.453125,0.44741,0.586957,0.45,0.509434,0.37037,0.666667,0.47619,0.642857,0.45,0.529412,0.444444,0.4,0.421053,0.451613,0.233333,0.307692,0.333333,0.65,0.440678



Classification Report:

              precision    recall  f1-score   support

           0       0.86      0.20      0.32        60
           1       0.26      0.15      0.19        60
           2       0.45      0.50      0.48        60
           3       0.34      0.38      0.36        60
           4       0.36      0.22      0.27        60
           5       0.13      0.65      0.21        20

    accuracy                           0.31       320
   macro avg       0.40      0.35      0.31       320
weighted avg       0.43      0.31      0.32       320


Classification Report:

              precision    recall  f1-score   support

           0       0.59      0.45      0.51        60
           1       0.37      0.67      0.48        60
           2       0.64      0.45      0.53        60
           3       0.44      0.40      0.42        60
           4       0.45      0.23      0.31        60
           5       0.33      0.65      0.44        20

    accuracy               

In [None]:
# sarcasm.csv, GenericPromptGenerator, target class "sarcastic".
updated_train_df, test_df = split_and_test(
    "sarcasm.csv",
    label_col="label",
    text_col="headline",
    test_size=0.2,
    random_state=42,
    prompt_gen=GenericPromptGenerator(),
    description="Generate exactly two synthetic data for sarcastic_label based on the examples, keeping the same format.",
    num_examples_per_prompt=2,
    num_calls=80,
    target_class="sarcastic",
    num_row=2,
)

Call 0
2
response [TextSample(text='local man breaks record for longest time spent staring at microwave', label='sarcastic', transformed=None), TextSample(text='groundbreaking study reveals water is wet', label='sarcastic', transformed=None)]
Generate exactly two synthetic data for sarcastic_label based on the examples, keeping the same format.
label, headline
sarcastic_label, area man purchases the devil's advocate on dvd for some reason
sarcastic_label, study finds flushing toilets wastes billions of gallons of piss and shit annually
not sarcastic_label, how (not) to repeat history
not sarcastic_label, decoding america's immigration sentiment
label, headline
sarcastic_label, road sign over-explains highway's dangers
sarcastic_label, mars maven begins mission to take thousands of high-resolution desktop backgrounds
not sarcastic_label, this dance inspired by 'moonlight' is almost as gorgeous as the real thing
not sarcastic_label, america, the next hobby lobby case is heading for the s

Map:   0%|          | 0/1110 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.341824,0.854167,0.745076,0.918782,0.905,0.911839,0.55814,0.6,0.578313
2,No log,0.34515,0.858333,0.749969,0.919192,0.91,0.914573,0.571429,0.6,0.585366



Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       200
           1       0.56      0.60      0.58        40

    accuracy                           0.85       240
   macro avg       0.74      0.75      0.75       240
weighted avg       0.86      0.85      0.86       240


Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       200
           1       0.57      0.60      0.59        40

    accuracy                           0.86       240
   macro avg       0.75      0.76      0.75       240
weighted avg       0.86      0.86      0.86       240



In [None]:
# emotion.csv, MinorityClassPromptGenerator, target class "surprise".
# NOTE(review): the description spells the label "surpise_label"; it is kept
# verbatim here because it is runtime prompt text — confirm whether the typo
# is intentional before changing it.
updated_train_df, test_df = split_and_test(
    "emotion.csv",
    label_col="label",
    text_col="text",
    test_size=0.2,
    random_state=42,
    prompt_gen=MinorityClassPromptGenerator(),
    description="Generate synthetic data for surpise_label based on the examples, keeping the same format",
    num_examples_per_prompt=1,
    num_calls=80,
    target_class="surprise",
    num_row=1,
    parse_type="minority",
)

Call 0
response [TextSample(text='i was completely caught off guard when my friends threw me a surprise party for my birthday and it was the best feeling ever', label='surprise', transformed=None)]
Generate synthetic data for surpise_label based on the examples, keeping the same format
surprise_label, i guess it doesn t help that i got sick on black friday and was forced against my will to maintain my promise to stay in but being back in the city feels amazing

Call 1
response [TextSample(text='i cannot believe this is happening', label='surprise', transformed=None), TextSample(text='this is really unexpected', label='surprise', transformed=None), TextSample(text='I never saw this coming', label='surprise', transformed=None), TextSample(text="wow, I'm totally shocked", label='surprise', transformed=None), TextSample(text='this is a big surprise for me', label='surprise', transformed=None)]
Generate synthetic data for surpise_label based on the examples, keeping the same format
surprise

Map:   0%|          | 0/1497 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1,Precision 2,Recall 2,F1 2,Precision 3,Recall 3,F1 3,Precision 4,Recall 4,F1 4,Precision 5,Recall 5,F1 5
1,No log,1.688626,0.240625,0.185409,0.142857,0.016667,0.029851,0.0,0.0,0.0,0.355556,0.266667,0.304762,0.0,0.0,0.0,0.205882,0.816667,0.328859,0.37931,0.55,0.44898
2,No log,1.56666,0.4125,0.4042,0.666667,0.166667,0.266667,0.457143,0.266667,0.336842,0.522727,0.383333,0.442308,0.442308,0.383333,0.410714,0.335878,0.733333,0.460733,0.372093,0.8,0.507937



Classification Report:

              precision    recall  f1-score   support

           0       0.14      0.02      0.03        60
           1       0.00      0.00      0.00        60
           2       0.36      0.27      0.30        60
           3       0.00      0.00      0.00        60
           4       0.21      0.82      0.33        60
           5       0.38      0.55      0.45        20

    accuracy                           0.24       320
   macro avg       0.18      0.28      0.19       320
weighted avg       0.16      0.24      0.15       320


Classification Report:

              precision    recall  f1-score   support

           0       0.67      0.17      0.27        60
           1       0.46      0.27      0.34        60
           2       0.52      0.38      0.44        60
           3       0.44      0.38      0.41        60
           4       0.34      0.73      0.46        60
           5       0.37      0.80      0.51        20

    accuracy               

In [None]:
# hate.csv, MinorityClassPromptGenerator, target class "Hate".
updated_train_df, test_df = split_and_test(
    "hate.csv",
    label_col="detection",
    text_col="speech",
    test_size=0.2,
    random_state=42,
    prompt_gen=MinorityClassPromptGenerator(),
    description="Generate synthetic data for Hate_label based on the examples, keeping the same format",
    num_examples_per_prompt=1,
    num_calls=160,
    target_class="Hate",
    num_row=1,
    parse_type="minority",
)

Call 0
response [TextSample(text="immigrants are ruining our country. Send them back to where they came from! They don't belong here. #NoImmigrantsAllowed", label='Hate', transformed=None)]
Generate synthetic data for Hate_label based on the examples, keeping the same format
Hate_label, immigrants are desperate to stay here. who can blame them ? our government treats them as lords and we as muck. welcome to SFI Pluirin !

Call 1
response [TextSample(text='Well, perhaps they are gradually moving towards dismantling Western Europe.', label='Hate', transformed=None)]
Generate synthetic data for Hate_label based on the examples, keeping the same format
Hate_label, Well, maybe they 're slowly working their way to destroying Eastern Europe .

Call 2
response [TextSample(text='They instead choose based on convenience, minimal risk, and selfish self-interest.', label='Hate', transformed=None), TextSample(text="They opt for what's easy, safe, and self-serving instead.", label='Hate', transforme

Map:   0%|          | 0/1173 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.393023,0.808333,0.655,0.425,0.425,0.425,0.885,0.885,0.885
2,No log,0.362002,0.854167,0.729303,0.567568,0.525,0.545455,0.906404,0.92,0.913151



Classification Report:

              precision    recall  f1-score   support

           0       0.42      0.42      0.42        40
           1       0.89      0.89      0.89       200

    accuracy                           0.81       240
   macro avg       0.66      0.66      0.66       240
weighted avg       0.81      0.81      0.81       240


Classification Report:

              precision    recall  f1-score   support

           0       0.57      0.53      0.55        40
           1       0.91      0.92      0.91       200

    accuracy                           0.85       240
   macro avg       0.74      0.72      0.73       240
weighted avg       0.85      0.85      0.85       240



In [None]:
# hate.csv, SeparateClassPromptGenerator, target class "Hate".
updated_train_df, test_df = split_and_test(
    "hate.csv",
    label_col="detection",
    text_col="speech",
    test_size=0.2,
    random_state=42,
    prompt_gen=SeparateClassPromptGenerator(),
    description="Generate synthetic data for Hate_label based on the examples, keeping the same format",
    num_examples_per_prompt=1,
    num_calls=160,
    target_class="Hate",
    num_row=2,
    parse_type="separate class",
)

Call 0


ERROR:__main__:Error generating samples: object of type 'NoneType' has no len()


KeyboardInterrupt: 

In [None]:
# emotion.csv, SeparateClassPromptGenerator, target class "surprise".
updated_train_df, test_df = split_and_test(
    "emotion.csv",
    label_col="label",
    text_col="text",
    test_size=0.2,
    random_state=42,
    prompt_gen=SeparateClassPromptGenerator(),
    description="Generate one synthetic data for surprise_label based on the examples, keeping the same format",
    num_examples_per_prompt=1,
    num_calls=80,
    target_class="surprise",
    num_row=1,
    parse_type="separate class",
)

Call 0
response [TextSample(text='as i walked into the room, I was shocked to see all my friends gathered for a surprise birthday party.', label='surprise', transformed=None)]
Generate one synthetic data for surprise_label based on the examples, keeping the same format
label, text
anger_label, i feel wronged but the judges people make at times however i also found out that actually in life we just need to be responsible to our own actions and and the people around us
label, text
fear_label, i have been given appointments with oncologists and radiologists per protocol following breast cancer surgery i have to admit that i feel strange
label, text
love_label, i have never known a love like the love i feel for you sweet emma and benjamin
label, text
sadness_label, i feel beaten by it
label, text
joy_label, i feel really lucky that i m making a living doing this and i think it s important to pass the word about tap dance
label, text
surprise_label, i stared up at him amazed by the feeling 

Map:   0%|          | 0/1360 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1,Precision 2,Recall 2,F1 2,Precision 3,Recall 3,F1 3,Precision 4,Recall 4,F1 4,Precision 5,Recall 5,F1 5
1,No log,1.574937,0.415625,0.36491,0.769231,0.333333,0.465116,0.4,0.766667,0.525714,0.6,0.2,0.3,0.352381,0.616667,0.448485,0.333333,0.266667,0.296296,0.333333,0.1,0.153846
2,No log,1.372868,0.50625,0.470465,0.565217,0.433333,0.490566,0.469027,0.883333,0.612717,0.533333,0.666667,0.592593,0.454545,0.25,0.322581,0.5,0.383333,0.433962,0.714286,0.25,0.37037



Classification Report:

              precision    recall  f1-score   support

           0       0.77      0.33      0.47        60
           1       0.40      0.77      0.53        60
           2       0.60      0.20      0.30        60
           3       0.35      0.62      0.45        60
           4       0.33      0.27      0.30        60
           5       0.33      0.10      0.15        20

    accuracy                           0.42       320
   macro avg       0.46      0.38      0.36       320
weighted avg       0.48      0.42      0.39       320


Classification Report:

              precision    recall  f1-score   support

           0       0.57      0.43      0.49        60
           1       0.47      0.88      0.61        60
           2       0.53      0.67      0.59        60
           3       0.45      0.25      0.32        60
           4       0.50      0.38      0.43        60
           5       0.71      0.25      0.37        20

    accuracy               

In [None]:
# sarcasm.csv, SeparateClassPromptGenerator, target class "sarcastic".
updated_train_df, test_df = split_and_test(
    "sarcasm.csv",
    label_col="label",
    text_col="headline",
    test_size=0.2,
    random_state=42,
    prompt_gen=SeparateClassPromptGenerator(),
    description="Generate exactly two synthetic data only for sarcastic_label based on the examples, keeping the same format",
    num_examples_per_prompt=1,
    num_calls=80,
    target_class="sarcastic",
    num_row=2,
    parse_type="separate class",
)

Call 0
response sarcastic_label, groundbreaking study reveals eating vegetables may actually be good for you
sarcastic_label, shocking new research suggests exercise could improve physical health
Generate exactly two synthetic data only for sarcastic_label based on the examples, keeping the same format
label, headline
sarcastic_label, study finds leading cause of depression hearing words '2016 frontrunners'
sarcastic_label, ryan lochte now changing account of events going back years before robbery
label, headline
not sarcastic_label, alyson stoner opens up about falling in love with a woman
not sarcastic_label, a new joint message from the kremlin and the trump administration
label, headline

Call 1
response sarcastic_label, 'Wow, protesting is definitely more important than education,' say brilliant minds everywhere
sarcastic_label, Scientists discover groundbreaking new species of unicorns roaming Earth's surface
Generate exactly two synthetic data only for sarcastic_label based on t

Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.451345,0.8,0.701214,0.926966,0.825,0.873016,0.435484,0.675,0.529412
2,No log,0.350765,0.866667,0.732404,0.9,0.945,0.921951,0.633333,0.475,0.542857



Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.82      0.87       200
           1       0.44      0.68      0.53        40

    accuracy                           0.80       240
   macro avg       0.68      0.75      0.70       240
weighted avg       0.85      0.80      0.82       240


Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       200
           1       0.63      0.47      0.54        40

    accuracy                           0.87       240
   macro avg       0.77      0.71      0.73       240
weighted avg       0.86      0.87      0.86       240



In [None]:
# sarcasm.csv, InstructionChoicePromptGenerator, target class "sarcastic".
updated_train_df, test_df = split_and_test(
    "sarcasm.csv",
    label_col="label",
    text_col="headline",
    test_size=0.2,
    random_state=42,
    prompt_gen=InstructionChoicePromptGenerator(),
    description="Generate exactly one new Sarcastic statement using different styles, respond with just the modified statement",
    num_examples_per_prompt=1,
    num_calls=160,
    target_class="sarcastic",
    num_row=2,
    parse_type="single",
)

Call 0
1
response [SynonymReplacement]: research discovers primary reason for despair listening to phrases '2016 frontrunners'
Generate exactly one new Sarcastic statement using different styles, respond with just the modified statement

        [BackTranslation]: Rewrite the sentence as if it had been translated into another language and back into English, ensuring meaning is preserved but phrasing is changed

        [WordDeletion]: Rewrite the sentence by removing at least two non-essential words while keeping its overall meaning

        [WordSwap]: Rewrite the sentence by swapping at least two essential words while preserving overall meaning

        [SynonymReplacement]: Rewrite the sentence by replacing at least two essential words with synonyms

        Now, use any of the strategies above to generate an augmented version of:

        Original: study finds leading cause of depression hearing words '2016 frontrunners'
 Modified:
Call 1
1
response [WordSwap]: ryan lochte currentl

KeyboardInterrupt: 

In [None]:
# emotion_combined.csv, TechniquePromptGenerator, target class "surprise".
# NOTE(review): the description contains typos ("senteces", "'Suprise'",
# "classifcation classifcation"); kept verbatim because it is runtime prompt
# text — confirm before correcting.
updated_train_df, test_df = split_and_test(
    "emotion_combined.csv",
    label_col="emotion",
    text_col="text",
    test_size=0.2,
    random_state=42,
    prompt_gen=TechniquePromptGenerator(),
    description="The following modified senteces were modified from the original examples of the 'Suprise' emotion used in text classifcation classifcation. Modifications were done by combination of synonym replacement, word deletion, and word order swap",
    num_examples_per_prompt=2,
    num_calls=80,
    target_class="surprise",
    num_row=2,
    parse_type="single",
)

TechniquePromptGenerator.
Call 0
80
1
response [TextSample(text='i take the air out of thither with ampere better understanding of what report going along Indiana the experiment merely also if one little stunned that i had only feeling a equation to was all of this', label='surprise', transformed=None)]
The following modified senteces were modified from the original examples of the 'Suprise' emotion used in text classifcation classifcation. Modifications were done by combination of synonym replacement, word deletion, and word order swap
Modified: i could a hours on drop set and feel awesome
 Original: i could spend hours on a set and feel amazing
Modified: me guess IT the exist worst feeling it gives unity shivers and just thinking about it stimulate my teeth feel foreign
 Original: i think it is the worst feeling it gives me the shivers and just thinking about it makes my teeth feel strange
Based on the examples above, identify the original based on this modification. The original is 

Map:   0%|          | 0/1360 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1,Precision 2,Recall 2,F1 2,Precision 3,Recall 3,F1 3,Precision 4,Recall 4,F1 4,Precision 5,Recall 5,F1 5
1,No log,1.627195,0.396875,0.331487,0.478261,0.366667,0.415094,0.382812,0.816667,0.521277,0.428571,0.55,0.481752,0.5,0.2,0.285714,0.232558,0.166667,0.194175,0.5,0.05,0.090909
2,No log,1.457335,0.48125,0.400768,0.46875,0.5,0.483871,0.474747,0.783333,0.591195,0.534247,0.65,0.586466,0.5,0.366667,0.423077,0.4,0.266667,0.32,0.0,0.0,0.0



Classification Report:

              precision    recall  f1-score   support

           0       0.48      0.37      0.42        60
           1       0.38      0.82      0.52        60
           2       0.43      0.55      0.48        60
           3       0.50      0.20      0.29        60
           4       0.23      0.17      0.19        60
           5       0.50      0.05      0.09        20

    accuracy                           0.40       320
   macro avg       0.42      0.36      0.33       320
weighted avg       0.41      0.40      0.36       320


Classification Report:

              precision    recall  f1-score   support

           0       0.47      0.50      0.48        60
           1       0.47      0.78      0.59        60
           2       0.53      0.65      0.59        60
           3       0.50      0.37      0.42        60
           4       0.40      0.27      0.32        60
           5       0.00      0.00      0.00        20

    accuracy               

In [None]:
# emotion_backtrans.csv, TechniquePromptGenerator, target class "surprise".
# NOTE(review): the description contains typos ("senteces", "'Suprise'",
# "classifcation"); kept verbatim because it is runtime prompt text —
# confirm before correcting.
updated_train_df, test_df = split_and_test(
    "emotion_backtrans.csv",
    label_col="emotion",
    text_col="text",
    test_size=0.2,
    random_state=42,
    prompt_gen=TechniquePromptGenerator(),
    description="The following modified senteces were modified from original examples of the 'Suprise' emotion used to train emotion classifcation. Modifications were done by paraphrasing. Do NOT simply copy the modified version",
    num_examples_per_prompt=2,
    num_calls=80,
    target_class="surprise",
    num_row=2,
    parse_type="single",
)

TechniquePromptGenerator.
Call 0
80
1
response i came out of there with a better understanding of what was happening in the experience but also feeling a little amazed that i had only one equation to describe all this
The following modified senteces were modified from original examples of the 'Suprise' emotion used to train emotion classifcation. Modifications were done by paraphrasing. Do NOT simply copy the modified version
Modified: I could spend hours on a set and feel amazing
 Original: i could spend hours on a set and feel amazing
Modified: I think it's the worst feeling he gives me the chills and just thinking about it makes my teeth weird
 Original: i think it is the worst feeling it gives me the shivers and just thinking about it makes my teeth feel strange
Based on the examples above, identify the original based on this modification. The original is a text classifcation example of 'surprise'. Return only the original text:
 Modified: I came out of there with a better understa

Map:   0%|          | 0/1360 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1,Precision 2,Recall 2,F1 2,Precision 3,Recall 3,F1 3,Precision 4,Recall 4,F1 4,Precision 5,Recall 5,F1 5
1,No log,1.609785,0.390625,0.307468,0.390244,0.266667,0.316832,0.379032,0.783333,0.51087,0.487179,0.633333,0.550725,0.318182,0.116667,0.170732,0.309091,0.283333,0.295652,0.0,0.0,0.0
2,No log,1.41285,0.496875,0.416228,0.462687,0.516667,0.488189,0.485149,0.816667,0.608696,0.578125,0.616667,0.596774,0.48,0.4,0.436364,0.473684,0.3,0.367347,0.0,0.0,0.0



Classification Report:

              precision    recall  f1-score   support

           0       0.39      0.27      0.32        60
           1       0.38      0.78      0.51        60
           2       0.49      0.63      0.55        60
           3       0.32      0.12      0.17        60
           4       0.31      0.28      0.30        60
           5       0.00      0.00      0.00        20

    accuracy                           0.39       320
   macro avg       0.31      0.35      0.31       320
weighted avg       0.35      0.39      0.35       320


Classification Report:

              precision    recall  f1-score   support

           0       0.46      0.52      0.49        60
           1       0.49      0.82      0.61        60
           2       0.58      0.62      0.60        60
           3       0.48      0.40      0.44        60
           4       0.47      0.30      0.37        60
           5       0.00      0.00      0.00        20

    accuracy               

In [None]:
# hate_backtrans.csv, TechniquePromptGenerator, target class "Hate".
# NOTE(review): the description contains typos ("senteces", "classifcation");
# kept verbatim because it is runtime prompt text — confirm before correcting.
updated_train_df, test_df = split_and_test(
    "hate_backtrans.csv",
    label_col="detection",
    text_col="speech",
    test_size=0.2,
    random_state=42,
    prompt_gen=TechniquePromptGenerator(),
    description="The following modified senteces were modified from original examples of Hate speech used to train hate speech classifcation. Modifications were done by paraphrasing.",
    num_examples_per_prompt=2,
    num_calls=80,
    target_class="Hate",
    num_row=2,
    parse_type="single",
)

TechniquePromptGenerator.
Call 0
160
1
response If a show does not involve homosexuals and/or interracial coupling, then it has a bunch of Jews masking like whites running around with weapons killing other whites.
The following modified senteces were modified from original examples of Hate speech used to train hate speech classifcation. Modifications were done by paraphrasing. Do NOT simply copy the modified version
Modified: The regime wants the Germans to hate each other, a pity that they are not already brown Muslims!
 Original: The regime wants Germans to hate themselves , too bad that they are not already brown Muslims !
Modified: Deanna Spingola and Dr. Lorraine Day - Zionism, Holocaust, AIDS and Vaccines YouTube - Deanna Spingola and Dr. Lorraine Day - Zionism, Holocaust, AIDS and Vaccines Listen/View full interview @ Deanna Spingola and Dr. Lorraine Day - Zionism, Holocaust, AIDS and Vaccines: Free download and streaming: Archives Internet www.DrDay.com www.GoodNewsAbout God.co

Map:   0%|          | 0/1040 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.347601,0.870833,0.743793,0.645161,0.5,0.56338,0.904306,0.945,0.924205
2,No log,0.304568,0.8875,0.791177,0.675676,0.625,0.649351,0.926108,0.94,0.933002



Classification Report:

              precision    recall  f1-score   support

           0       0.65      0.50      0.56        40
           1       0.90      0.94      0.92       200

    accuracy                           0.87       240
   macro avg       0.77      0.72      0.74       240
weighted avg       0.86      0.87      0.86       240


Classification Report:

              precision    recall  f1-score   support

           0       0.68      0.62      0.65        40
           1       0.93      0.94      0.93       200

    accuracy                           0.89       240
   macro avg       0.80      0.78      0.79       240
weighted avg       0.88      0.89      0.89       240



In [None]:
# Experiment cell: run split_and_test on "hate_combined.csv" with a
# TechniquePromptGenerator. The prompt description tells the model that the
# modified sentences came from a combination of synonym replacement, word
# deletion, and word order swap.
updated_train_df, test_df = split_and_test(
    "hate_combined.csv",
    label_col='detection',
    text_col='speech',
    test_size=0.2,
    random_state=42,
    prompt_gen=TechniquePromptGenerator(),
    description="The following modified senteces were modified from original examples of Hate speech used to train hate speech classifcation. Modifications were done by combination of synonym replacement, word deletion, and word order swap.",
    num_examples_per_prompt=2,
    num_calls=160,
    target_class="Hate",
    num_row=2,
    parse_type="single",
)

TechniquePromptGenerator.
Call 0
160
1
response Faggots are pushing of order their business.
The following modified senteces were modified from original examples of Hate speech used to train hate speech classifcation. Modifications were done by combination of synonym replacement, word deletion, and word order swap.
Modified: idol group bless wisdom the judge and jury against this please of thugs that they all might beryllium cling.
 Original: God please bless with wisdom the judge and jury against this group of thugs that they all might be hanged .
Modified: I perform n't are why the gardai execute n't simply grab them deport them none of them see title to be
 Original: I do n't see why the gardai do n't just grab them and deport them none of them are entitled to be here
Based on the examples above, identify the original based on this modification. The original is a text classifcation example of 'Hate'. Return only the original text and do NOT copy the modified version:
 Modified: Fagg

Map:   0%|          | 0/1122 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.312523,0.8875,0.76582,0.740741,0.5,0.597015,0.906103,0.965,0.934625
2,No log,0.280697,0.879167,0.770864,0.657143,0.575,0.613333,0.917073,0.94,0.928395



Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.50      0.60        40
           1       0.91      0.96      0.93       200

    accuracy                           0.89       240
   macro avg       0.82      0.73      0.77       240
weighted avg       0.88      0.89      0.88       240


Classification Report:

              precision    recall  f1-score   support

           0       0.66      0.57      0.61        40
           1       0.92      0.94      0.93       200

    accuracy                           0.88       240
   macro avg       0.79      0.76      0.77       240
weighted avg       0.87      0.88      0.88       240



In [None]:
# Experiment cell: run split_and_test on "sarcasm_backtrans.csv" with a
# TechniquePromptGenerator. The prompt description tells the model that the
# modified sentences were produced by paraphrasing.
updated_train_df, test_df = split_and_test(
    "sarcasm_backtrans.csv",
    label_col='detection',
    text_col='headline',
    test_size=0.2,
    random_state=42,
    prompt_gen=TechniquePromptGenerator(),
    description="The following modified senteces were modified from original examples of Sarcasm used to train sarcasm text classifcation. Modifications were done by paraphrasing.",
    num_examples_per_prompt=2,
    num_calls=160,
    target_class="sarcastic",
    num_row=2,
    parse_type="single",
)

TechniquePromptGenerator.
Call 0
160
1
response Original: u.n. court marked by thousands of children's letters to the militia
The following modified senteces were modified from original examples of Sarcasm used to train sarcasm text classifcation. Modifications were done by paraphrasing.
Modified: On Friday, 13 nuclear waste disposal technicians lost their lives.
 Original: casual friday claims lives of 13 nuclear-waste-disposal technicians
Modified: The mental hospital fire leaves hundreds of demons homeless
 Original: mental hospital fire leaves hundreds of demons homeless
Based on the examples above, identify the original based on this modification. The original is a text classifcation example of 'sarcastic'. Return only the original text and do NOT copy the modified version:
 Modified: u.n. court marked by thousands of children's letters to the militia 
 Original: 
Call 1
160
1
response the kamikaze swimmers finally reach the port of pearls
The following modified senteces were modi

Map:   0%|          | 0/1121 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.364953,0.8375,0.726244,0.917098,0.885,0.900763,0.510638,0.6,0.551724
2,No log,0.326388,0.866667,0.718928,0.892523,0.955,0.922705,0.653846,0.425,0.515152



Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       200
           1       0.51      0.60      0.55        40

    accuracy                           0.84       240
   macro avg       0.71      0.74      0.73       240
weighted avg       0.85      0.84      0.84       240


Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       200
           1       0.65      0.42      0.52        40

    accuracy                           0.87       240
   macro avg       0.77      0.69      0.72       240
weighted avg       0.85      0.87      0.85       240



In [None]:
# Experiment cell: run split_and_test on "sarcasm_combined.csv" with a
# TechniquePromptGenerator. The prompt description tells the model that the
# modified sentences came from a combination of synonym replacement, word
# deletion, and word order swap.
updated_train_df, test_df = split_and_test(
    "sarcasm_combined.csv",
    label_col='detection',
    text_col='headline',
    test_size=0.2,
    random_state=42,
    prompt_gen=TechniquePromptGenerator(),
    description="The following modified senteces were modified from original examples of Sarcasm used to train sarcasm text classifcation. Modifications were done by combination of synonym replacement, word deletion, and word order swap.",
    num_examples_per_prompt=2,
    num_calls=160,
    target_class="sarcastic",
    num_row=2,
    parse_type="single",
)

TechniquePromptGenerator.
Call 0
160
1
response u.n. tribunal swayed by thousands of children's letters to Milosevic
The following modified senteces were modified from original examples of Sarcasm used to train sarcasm text classifcation. Modifications were done by combination of synonym replacement, word deletion, and word order swap.
Modified: casual friday technicians lives of long dozen nuclear-waste-disposal call
 Original: casual friday claims lives of 13 nuclear-waste-disposal technicians
Modified: genial leaves infirmary hundreds of homeless
 Original: mental hospital fire leaves hundreds of demons homeless
Based on the examples above, identify the original based on this modification. The original is a text classifcation example of 'sarcastic'. Return only the original text and do NOT copy the modified version:
 Modified: u.n. tribunal of by thousands sway children 's letter to milosevic 
 Original: 
Call 1
160
1
response Original: kamikaze swimmers last at pearl harbor
The fol

Map:   0%|          | 0/1124 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.336598,0.866667,0.732404,0.9,0.945,0.921951,0.633333,0.475,0.542857
2,No log,0.318406,0.879167,0.748473,0.901408,0.96,0.929782,0.703704,0.475,0.567164



Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       200
           1       0.63      0.47      0.54        40

    accuracy                           0.87       240
   macro avg       0.77      0.71      0.73       240
weighted avg       0.86      0.87      0.86       240


Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       200
           1       0.70      0.47      0.57        40

    accuracy                           0.88       240
   macro avg       0.80      0.72      0.75       240
weighted avg       0.87      0.88      0.87       240



In [None]:
# Experiment cell: run split_and_test on "sarcasm.csv" with a
# MinorityClassPromptGenerator. The description asks the model for exactly one
# synthetic "sarcastic_label" example per prompt.
# Unlike the TechniquePromptGenerator cells in this notebook, this one reads the
# label from the 'label' column and uses num_row=1 with parse_type="minority".
updated_train_df, test_df = split_and_test(
    "sarcasm.csv",
    label_col='label',
    text_col='headline',
    test_size=0.2,
    random_state=42,
    prompt_gen=MinorityClassPromptGenerator(),
    description="Generate exactly one synthetic data only for sarcastic_label based on the examples, keeping the same format",
    num_examples_per_prompt=2,
    num_calls=160,
    target_class="sarcastic",
    num_row=1,
    parse_type="minority",
)

Call 0
Generate exactly one synthetic data only for sarcastic_label based on the examples, keeping the same format
sarcastic_label, study finds leading cause of depression hearing words '2016 frontrunners'
sarcastic_label, ryan lochte now changing account of events going back years before robbery

response sarcastic_label, groundbreaking study reveals that eating vegetables is the key to eternal happiness
1
Call 1
Generate exactly one synthetic data only for sarcastic_label based on the examples, keeping the same format
sarcastic_label, congress splits into male and female senators to discuss newest reproductive bill
sarcastic_label, 'these kids should be in school instead of protesting,' say people so tantalizingly close to getting the point

response sarcastic_label, I'm sure splitting congress by gender is the most effective way to address reproductive rights issues.
1
Call 2
Generate exactly one synthetic data only for sarcastic_label based on the examples, keeping the same format


Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,No log,0.422778,0.808333,0.705097,0.923077,0.84,0.879581,0.448276,0.65,0.530612
2,No log,0.357734,0.858333,0.708738,0.891509,0.945,0.917476,0.607143,0.425,0.5



Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.84      0.88       200
           1       0.45      0.65      0.53        40

    accuracy                           0.81       240
   macro avg       0.69      0.74      0.71       240
weighted avg       0.84      0.81      0.82       240


Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.94      0.92       200
           1       0.61      0.42      0.50        40

    accuracy                           0.86       240
   macro avg       0.75      0.68      0.71       240
weighted avg       0.84      0.86      0.85       240

