## Setup Libraries

In [None]:
!pip install openai
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->openai)
  Downl

### Import the libraries and the dataset

In [None]:
import pandas as pd
from tqdm import tqdm
from langdetect import detect

In [2]:
df = pd.read_csv('dataset.csv')

### Define the call function to the API

In [None]:
# Set your OpenAI API key
import openai
openai.api_key = ''

def generate_sentence(sentence, hate_text_1, hate_text_2, non_hate_text_1, non_hate_text_2, full_language, label=1):
    """
    Generates an augmented sentence similar to the passed sentence for text classification,
    with given examples of hate and non-hate texts.

    Args:
        sentence (str): The sentence to be augmented.
        hate_text_1 (str): Example of a hate text.
        hate_text_2 (str): Example of a hate text.
        non_hate_text_1 (str): Example of a non-hate text.
        non_hate_text_2 (str): Example of a non-hate text.
        full_language (str): The full name or description of the language.
        label (int, optional): The label of the augmented sentence. Defaults to 1.

    Returns:
        str: The augmented sentence.
    """
    prompt_arr = [
        f"You are a {full_language} data augmentation model. You have to generate data for a text classification task. The sentence can be classified into two categories: hate and non-hate. I'll give two examples of each category, then pass a new sentence called to_augment and its label. You will need to generate a sentence similar to the passed to_augment sentence such that its label won't change:\n",
        f"Hate:\n1. {hate_text_1}\n2. {hate_text_2}\n",
        f"\nNon-Hate:\n1. {non_hate_text_1}\n2. {non_hate_text_2}\n",
        f"\nTo_Augment: {sentence}\nLabel: {str(label)}\n\nThe Augmented Text is:\n"
    ]
    prompt = ''.join(prompt_arr)
    print(prompt)
    response = openai.Completion.create(
        engine="davinci",
        prompt=prompt,
        max_tokens=int(len(sentence) * 1.2),
        temperature=0.75,
        n=1,
        stop=None
    )
    rephrased_sentence = response.choices[0].text.strip().split('\n')[0]
    return rephrased_sentence

# Example usage
# sentence = "माझं नाव आहे सोहिनी."
# rephrased = generate_sentence(sentence, "Hate example 1", "Hate example 2", "Non-hate example 1", "Non-hate example 2", "Marathi", label=1)
# print(f"Original Sentence: {sentence}")
# print(f"Augmented Sentence: {rephrased}")


In [None]:
# static examples for each class
hate_text_1 = "देशातील युवा #youtubevstiktok वर जेवढा आक्रमक झाला आहे तेवढा जर पायदळ जाणाऱ्या मजूरासाठी झाला असता तर सरकारला झक मारून मजुरांची सोय करुन द्यावी लागली असती. #मजदूर #म #"
hate_text_2 = "@Avinashmule22 @mangeshspa नंबर घेतला ना बुल्ल्या कॉल कर ना गांडीत दम असेल तर"
non_hate_text_1 = "@Liberal_India1 मग यावरून सिद्ध काय होतं ते सांग रताळ्या 😂🤣 निष्कर्ष न काढताच गोष्टी सांगायच्या नसतात"
non_hate_text_2 = "@meNeeleshNRane आजपर्यंतचा सगळ्यात निष्क्रिय पालकमंत्री सिंधुदुर्ग जिल्ह्यातला म्हणजे दीपक केसरकर उर्फ काळतोंड्या याच्या राज्यात ना जिल्ह्याला इज्जत भेटत ना विकासाचे प्रकल्प"

In [None]:
'''
Detect if the text is in the input lang
'''
def is_lang(text,input_lang):
    try:
        lang = detect(text)
        if lang == input_lang:
            return True  # Detected language is Marathi
        else:
            return False  # Detected language is not Marathi
    except:
        return False  # Failed to detect language

## Calling it all together in the main function

In [None]:
def call_rephrase(df, lang_token, full_language):
    """
    Calls the rephrase function to generate augmented sentences for each row in the given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing 'text' and 'label' columns.
        lang_token: (str): The language token used to check the generated sentence's language.
        full_language (str): The full name or description of the language.

    Returns:
        None
    """
    augmented = []
    for index, row in df.iterrows():
        print(index)
        # Access the values of each column for the current row
        sentence = row['text']
        label = row['label']
        while True:
            prompt = f"Rephrase the following {full_language} sentence, without changing its sentiment:\n\n" + sentence + f"\n\nRephrased {full_language} sentence"
            generated = generate_sentence(prompt, full_language)
            if is_lang(generated, lang_token):
                break  # Break out of the loop if condition is not met
        if f"Generated {full_language} sentence:" in generated:
            generated.replace(f"Generated {full_language} sentence:", "")
        augmented.append([generated, label])

    new_df = pd.DataFrame(augmented, columns=df.columns)
    df_new = df.append(new_df, ignore_index=True)
    df_new = df_new.sample(frac=1)
    df_new.to_csv('df_new.csv')

# Example usage
# call_rephrase(df, lang_token, full_language)

In [None]:
call_rephrase(df, 'mr','Marathi')

0
1
2
3
4


  df_new = df.append(new_df, ignore_index=True)
