# Data Augmentation experiments

## Imports and data read

In [1]:
import os
import pprint

import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

## Pipeline

https://platform.openai.com/docs/api-reference/authentication

In [2]:
# Read the OpenAI key from the environment instead of hardcoding a secret in the
# notebook: export OPENAI_API_KEY before starting the kernel. Any key that was ever
# committed in this cell must be treated as leaked and revoked.
# If the variable is unset this is None, which is passed on to the OpenAI client.
api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# OpenAI client used by analyze_sentiment for every chat-completion request.
client = OpenAI(api_key=api_key)

In [5]:
# Load the annotated dataset that the augmentation is based on.
df = pd.read_parquet(
    path='./data_provided/final_dataset/final_17042025.parquet',
)

In [6]:
# Column names, non-null counts and dtypes of the loaded dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12224 entries, 0 to 12223
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   response_id           12224 non-null  int64 
 1   document_id           12224 non-null  int64 
 2   user_id               12224 non-null  int64 
 3   annotator_sentiment   12224 non-null  object
 4   is_ck_annotation      12224 non-null  int64 
 5   response_timestamp    12224 non-null  object
 6   document_content      12224 non-null  object
 7   annotation_date       12224 non-null  object
 8   username              12224 non-null  object
 9   unique_document_id    12224 non-null  object
 10  language_wc           12224 non-null  object
 11  document_length       12224 non-null  int64 
 12  gpt_labels_v1         12224 non-null  object
 13  language_gpt          12224 non-null  object
 14  language_manual       12224 non-null  object
 15  language              12224 non-null

In [7]:
# Distinct values of the language column.
df['language'].unique()

array(['ua', 'ru', 'mixed'], dtype=object)

## Stratification label balancing

The core idea is to reduce the class imbalance in the dataset by generating new samples with ChatGPT.

In [8]:
# Number of rows per stratification label, largest class first.
df['stratification_label'].value_counts()

stratification_label
neutral_ua        3291
negative_ua       2433
positive_ua       1859
negative_ru       1799
neutral_ru        1208
mixed_ua           442
positive_ru        441
negative_mixed     309
neutral_mixed      203
mixed_ru           120
positive_mixed      73
mixed_mixed         46
Name: count, dtype: int64

In [9]:
# Row count of the most frequent stratification label.
max_stratification = df['stratification_label'].value_counts().max()

In [10]:
# Size of the largest class; the per-class gaps below are measured against it.
max_stratification

np.int64(3291)

In [11]:
# Per-class count minus the largest class count (zero or negative), turned into a
# two-column frame: stratification_label, count.
classes_to_augment = (
    df['stratification_label']
    .value_counts()
    .sub(max_stratification)
    .reset_index()
)

In [12]:
# Flip the non-positive differences into the number of rows missing per class.
classes_to_augment['count'] = classes_to_augment['count'].abs()

In [13]:
# Rows missing per class relative to the largest class.
classes_to_augment

Unnamed: 0,stratification_label,count
0,neutral_ua,0
1,negative_ua,858
2,positive_ua,1432
3,negative_ru,1492
4,neutral_ru,2083
5,mixed_ua,2849
6,positive_ru,2850
7,negative_mixed,2982
8,neutral_mixed,3088
9,mixed_ru,3171


In [14]:
# Total number of rows missing across all classes.
classes_to_augment.loc[:, 'count'].sum()

np.int64(27268)

In [20]:
# for text in df.loc[(df.stratification_label == 'mixed_mixed'), 'document_content']:
#     pprint.pprint(text, width=250)
#     print('----------------------------')
#     print('----------------------------')
#     print('----------------------------')

In [None]:
def analyze_sentiment(comment, system_prompt, model="gpt-3.5-turbo"):
    """
    Send one chat-completion request to OpenAI and return the reply text.

    Despite the name, this notebook calls the function to *generate* texts: the
    system prompt carries the instructions and the user message carries the
    example text. The request goes through the module-level ``client``.

    Args:
        comment (str): Content of the user message.
        system_prompt (str): Content of the system message.
        model (str): Chat model to query. Defaults to "gpt-3.5-turbo", the value
            that was previously hard-coded, so existing calls are unaffected.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": comment},
        ],
    )
    return response.choices[0].message.content

In [15]:
# Shared prompt fragments. Keeping one definition per instruction guarantees that
# every class with the same language/sentiment gets exactly the same wording
# (the mixed-sentiment text used to lack its closing parenthesis for two classes).
ONLY_UKRAINIAN = 'include only Ukrainian words'
ONLY_RUSSIAN = 'include only Russian words'
UKRAINIAN_AND_RUSSIAN = 'include Ukrainian words as well as Russian (e.g.: "Доброго вечора, как делишки?")'

NEGATIVE_SENTIMENT = 'negative sentiment'
NEUTRAL_SENTIMENT = 'neutral sentiment'
POSITIVE_SENTIMENT = 'positive sentiment'
MIXED_SENTIMENT = 'mixed sentiment (express positive and negative emotions in different part of the text output)'

# stratification label -> [number of texts to generate, language instruction, sentiment instruction]
# The counts are the gaps to the largest class (3291 rows); neutral_ua has no gap
# and is therefore absent.
prompt_inputs = {
    'negative_ua': [858, ONLY_UKRAINIAN, NEGATIVE_SENTIMENT],
    'positive_ua': [1432, ONLY_UKRAINIAN, POSITIVE_SENTIMENT],
    'negative_ru': [1492, ONLY_RUSSIAN, NEGATIVE_SENTIMENT],
    'neutral_ru': [2083, ONLY_RUSSIAN, NEUTRAL_SENTIMENT],
    'mixed_ua': [2849, ONLY_UKRAINIAN, MIXED_SENTIMENT],
    'positive_ru': [2850, ONLY_RUSSIAN, POSITIVE_SENTIMENT],
    'negative_mixed': [2982, UKRAINIAN_AND_RUSSIAN, NEGATIVE_SENTIMENT],
    'neutral_mixed': [3088, UKRAINIAN_AND_RUSSIAN, NEUTRAL_SENTIMENT],
    'mixed_ru': [3171, ONLY_RUSSIAN, MIXED_SENTIMENT],
    'positive_mixed': [3218, UKRAINIAN_AND_RUSSIAN, POSITIVE_SENTIMENT],
    'mixed_mixed': [3245, UKRAINIAN_AND_RUSSIAN, MIXED_SENTIMENT],
}

In [16]:
# Generated texts, keyed by stratification label; filled by the generation loop.
prompt_outputs = dict()

In [None]:
# Fixed seed so the example texts drawn as generation prompts are reproducible.
EXAMPLE_SAMPLING_SEED = 42

for strat_label, (n_to_generate, language, sentiment) in tqdm(prompt_inputs.items()):

    # Register the list before generating so partial results stay available in
    # prompt_outputs if the run is interrupted.
    generated_texts = []
    prompt_outputs[strat_label] = generated_texts

    # The instructions depend only on the class, so they are built once per class
    # instead of once per generated text.
    system_prompt_overall = f'''

                You are a sentiment analysis expert. You need to help to create a dataset of texts needed for training an ML model. Your help is to write a text which will be included to the dataset. This is important that the text must {language}. The sentiment of the text should express {sentiment}.
                The example of such a text is provided below.

                Write the text similar to the provided example. You can do just a rewording. However, remember, that the resulted text must {language}. 
                
                Also, you must write only the text without any additional comments from yourself. 

            '''

    # Filter the class once and draw every example in a single call. Sampling with
    # replacement follows the same distribution as n independent `.sample(1)` draws.
    example_texts = df.loc[df.stratification_label == strat_label, 'document_content'].sample(
        n=n_to_generate, replace=True, random_state=EXAMPLE_SAMPLING_SEED
    )

    for text in tqdm(example_texts, total=n_to_generate):

        comment = f'''

        The text example is below: 
        """
        {text}
        """

        '''

        generated_texts.append(analyze_sentiment(comment, system_prompt=system_prompt_overall))



100%|██████████| 858/858 [00:00<00:00, 2384.80it/s]
100%|██████████| 1432/1432 [00:00<00:00, 2654.47it/s]
100%|██████████| 1492/1492 [00:00<00:00, 2587.65it/s]
100%|██████████| 2083/2083 [00:00<00:00, 2771.68it/s]
100%|██████████| 2849/2849 [00:00<00:00, 2973.30it/s]
100%|██████████| 2850/2850 [00:01<00:00, 2643.97it/s]
100%|██████████| 2982/2982 [00:00<00:00, 3034.13it/s]
100%|██████████| 3088/3088 [00:00<00:00, 3119.38it/s]
100%|██████████| 3171/3171 [00:01<00:00, 3156.01it/s]
100%|██████████| 3218/3218 [00:01<00:00, 3150.78it/s]
100%|██████████| 3245/3245 [00:01<00:00, 2819.02it/s]
100%|██████████| 11/11 [00:09<00:00,  1.17it/s]


In [None]:
import pickle

def save_prompt_outputs(prompt_outputs, filename='prompt_outputs.pkl'):
    """
    Save the prompt_outputs dictionary to a pickle file.

    The dictionary is serialized in memory before the target file is opened, so a
    pickling failure cannot truncate or corrupt a previously saved file.

    Args:
        prompt_outputs (dict): The dictionary containing the outputs to save
        filename (str): Name of the pickle file to save to (default: 'prompt_outputs.pkl')

    Returns:
        bool: True if the save was successful, False otherwise
    """
    try:
        # Serialize first: opening with 'wb' empties the file immediately.
        payload = pickle.dumps(prompt_outputs)
        with open(filename, 'wb') as file:
            file.write(payload)
    except Exception as e:
        # Best-effort by design: report the problem and signal it via the return value.
        print(f"Error saving prompt_outputs: {e}")
        return False
    print(f"Successfully saved prompt_outputs to {filename}")
    return True

In [None]:
# Persist the generated texts to disk; the call returns True on success, False otherwise.
save_prompt_outputs(prompt_outputs, filename='prompt_outputs.pkl')

In [None]:
# Empty frame with the two columns of the augmented dataset.
df_augmented = pd.DataFrame(
    {column: [] for column in ('stratification_label', 'document_content')}
)

In [None]:
# Flatten {label: [texts]} into one row per generated text. The frame is built in a
# single constructor call (linear time), and it does not extend a previous
# df_augmented, so re-running this cell cannot duplicate rows.
df_augmented = pd.DataFrame(
    [
        (strat_label, text)
        for strat_label, texts in prompt_outputs.items()
        for text in texts
    ],
    columns=['stratification_label', 'document_content'],
)

In [None]:
# Write the generated texts (label + text) to a parquet file in the working directory.
df_augmented.to_parquet('augmentations.parquet')