# Data Augmentation experiments

## Imports and data read

In [1]:
import os
import pprint

import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

## Pipeline

OpenAI API authentication reference: https://platform.openai.com/docs/api-reference/authentication

In [2]:
# Read the OpenAI API key from the environment instead of hard-coding it, so the secret
# never ends up in the notebook file, its outputs, or version control.
# NOTE(review): the key that used to be written here must be treated as leaked and revoked.
# When OPENAI_API_KEY is not set this is None; the OpenAI client is then expected to fail
# at construction time with a missing-credentials error.
api_key = os.environ.get('OPENAI_API_KEY')

In [38]:
# Shared OpenAI client; `analyze_sentiment` below sends its requests through it.
client = OpenAI(api_key=api_key)

In [3]:
# Annotated dataset snapshot, loaded from a path relative to the notebook.
DATASET_PATH = './data_provided/final_dataset/final_17042025.parquet'
df = pd.read_parquet(DATASET_PATH)

In [4]:
# Column names, dtypes and non-null counts of the loaded dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12224 entries, 0 to 12223
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   response_id           12224 non-null  int64 
 1   document_id           12224 non-null  int64 
 2   user_id               12224 non-null  int64 
 3   annotator_sentiment   12224 non-null  object
 4   is_ck_annotation      12224 non-null  int64 
 5   response_timestamp    12224 non-null  object
 6   document_content      12224 non-null  object
 7   annotation_date       12224 non-null  object
 8   username              12224 non-null  object
 9   unique_document_id    12224 non-null  object
 10  language_wc           12224 non-null  object
 11  document_length       12224 non-null  int64 
 12  gpt_labels_v1         12224 non-null  object
 13  language_gpt          12224 non-null  object
 14  language_manual       12224 non-null  object
 15  language              12224 non-null

In [5]:
# Distinct language labels present in the dataset.
df['language'].unique()

array(['ua', 'ru', 'mixed'], dtype=object)

## Stratification label balancing

The core idea is to reduce the class imbalance in the dataset by generating new samples with ChatGPT.

In [10]:
# Number of documents per stratification label, largest first.
df['stratification_label'].value_counts()

stratification_label
neutral_ua        3291
negative_ua       2433
positive_ua       1859
negative_ru       1799
neutral_ru        1208
mixed_ua           442
positive_ru        441
negative_mixed     309
neutral_mixed      203
mixed_ru           120
positive_mixed      73
mixed_mixed         46
Name: count, dtype: int64

In [16]:
# Size of the largest stratum; the per-stratum gaps below are measured against it.
max_stratification = df['stratification_label'].value_counts().max()

In [17]:
# Display the size of the largest stratum.
max_stratification

np.int64(3291)

In [20]:
# Signed gap between each stratum's size and the largest stratum (values are <= 0),
# turned into a two-column frame: label and count.
classes_to_augment = (
    df['stratification_label']
    .value_counts()
    .sub(max_stratification)
    .reset_index()
)

In [21]:
# Drop the sign so `count` is the number of samples each stratum is missing.
classes_to_augment['count'] = classes_to_augment['count'].abs()

In [22]:
# Display how many samples each stratum is missing relative to the largest one.
classes_to_augment

Unnamed: 0,stratification_label,count
0,neutral_ua,0
1,negative_ua,858
2,positive_ua,1432
3,negative_ru,1492
4,neutral_ru,2083
5,mixed_ua,2849
6,positive_ru,2850
7,negative_mixed,2982
8,neutral_mixed,3088
9,mixed_ru,3171


In [104]:
# Total number of samples that would have to be generated to balance all strata.
classes_to_augment['count'].sum()

np.int64(27268)

In [28]:
# Print every original text of the 'mixed_mixed' stratum, each followed by three separator lines.
separator_block = '\n'.join(['----------------------------'] * 3)
mixed_mixed_texts = df.loc[df['stratification_label'] == 'mixed_mixed', 'document_content']
for text in mixed_mixed_texts:
    pprint.pprint(text, width=250)
    print(separator_block)

'Навіть в цьому дописи про це ні слова: відкривай та авторизуйся через NFC, а по факту це не до кінця працює.   Хтось побачить цей допис з підписників на канал і поведіться, як я.'
----------------------------
----------------------------
----------------------------
('Дівчата,доброго дня.Чи є тут люди з лівобережжя Херсонщини.?Я вибачаюсь чи можна питання?Хто подавав через Дія на ЄВідновлення,та отримав відмову через те що ,будинок в окупації та не внесено його до державного реєстру!?Ми з Олешок,і куди тільки '
 'не звертаємося нам відмовляють в внесенні будинку в державний реєстр.ЦНАП видав офіційну відмову. Може хтось знак куди було перенесено архів м.Олешки.Раніше в Кропивницьк,зараз кажуть що до Одеси.Але ніхто не знає куди.')
----------------------------
----------------------------
----------------------------
('Ангела Меркель наполягає, що вважає правильним своє рішення не приймати Україну до НАТО у 2008 році, незважаючи на критику з боку Зеленського.  Водночас ексканцлерка під

In [None]:
def analyze_sentiment(comment, system_prompt, model="gpt-3.5-turbo", openai_client=None):
    """
    Send one system + user message pair to an OpenAI chat model and return the reply text.

    Despite its name, the function does no sentiment-specific work: it forwards the two
    messages as-is. In this notebook it is called with a text-generation prompt.

    Parameters:
        comment: content of the user message.
        system_prompt: content of the system message.
        model: chat model name; defaults to "gpt-3.5-turbo", the previously
            hard-coded value.
        openai_client: object exposing `chat.completions.create`. When omitted, the
            notebook-level `client` is used, exactly as before.

    Returns:
        The `message.content` of the first choice in the response.
    """
    # Fall back to the notebook-level client so existing two-argument calls keep working.
    active_client = client if openai_client is None else openai_client
    response = active_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": comment},
        ],
    )
    return response.choices[0].message.content

In [93]:
# Language constraints, inserted into the generation prompt after "the text must ...".
LANG_UA_ONLY = 'include only Ukrainian words'
LANG_RU_ONLY = 'include only Russian words'
LANG_UA_AND_RU = 'include Ukrainian words as well as Russian (e.g.: "Доброго вечора, как делишки?")'

# Sentiment descriptions, inserted into the generation prompt after "should express ...".
SENTIMENT_NEGATIVE = 'negative sentiment'
SENTIMENT_NEUTRAL = 'neutral sentiment'
SENTIMENT_POSITIVE = 'positive sentiment'
# The closing parenthesis was missing for 'mixed_ua' and 'mixed_ru'; one shared constant
# keeps all three mixed-sentiment strata on the same, well-formed wording.
SENTIMENT_MIXED = 'mixed sentiment (express positive and negative emotions in different part of the text output)'

# stratification label -> [number of texts to generate, language constraint, sentiment description].
# The counts equal the size of the largest stratum (3291) minus the stratum's current size;
# 'neutral_ua' is the largest stratum and therefore has no entry.
prompt_inputs = {
    'negative_ua': [858, LANG_UA_ONLY, SENTIMENT_NEGATIVE],
    'positive_ua': [1432, LANG_UA_ONLY, SENTIMENT_POSITIVE],
    'negative_ru': [1492, LANG_RU_ONLY, SENTIMENT_NEGATIVE],
    'neutral_ru': [2083, LANG_RU_ONLY, SENTIMENT_NEUTRAL],
    'mixed_ua': [2849, LANG_UA_ONLY, SENTIMENT_MIXED],
    'positive_ru': [2850, LANG_RU_ONLY, SENTIMENT_POSITIVE],
    'negative_mixed': [2982, LANG_UA_AND_RU, SENTIMENT_NEGATIVE],
    'neutral_mixed': [3088, LANG_UA_AND_RU, SENTIMENT_NEUTRAL],
    'mixed_ru': [3171, LANG_RU_ONLY, SENTIMENT_MIXED],
    'positive_mixed': [3218, LANG_UA_AND_RU, SENTIMENT_POSITIVE],
    'mixed_mixed': [3245, LANG_UA_AND_RU, SENTIMENT_MIXED],
}

In [None]:
# Generated texts per stratification label ({label: [text, ...]}); filled by the generation loop.
prompt_outputs = dict()

In [None]:
# Generate the synthetic texts for every stratum listed in `prompt_inputs`.
# NOTE: this sends one OpenAI request per generated text (about 27k requests for the counts
# in `prompt_inputs`), so the cell is slow and billed.
for strat_label, inputs in tqdm(prompt_inputs.items()):

    n_to_generate, language, sentiment = inputs

    # Keep whatever an earlier (possibly interrupted) run of this cell already produced and
    # only generate the missing remainder; re-run the `prompt_outputs = {}` cell to start over.
    generated = prompt_outputs.setdefault(strat_label, [])
    n_missing = max(n_to_generate - len(generated), 0)

    # Real texts of this stratum, used as rewording examples. Filtered once per stratum
    # instead of once per request.
    examples = df.loc[(df.stratification_label == strat_label), 'document_content']

    # The system prompt depends only on the stratum, so it is built once per stratum too.
    system_prompt_overall = f'''

                You are a sentiment analysis expert. You need to help to create a dataset of texts needed for training an ML model. Your help is to write a text which will be included to the dataset. This is important that the text must {language}. The sentiment of the text should express {sentiment}.
                The example of such a text is provided below.

                Write the text similar to the provided example. You can do just a rewording. However, remember, that the resulted text must {language}. 
                
                Also, you must write only the text without any additional comments from yourself. 

            '''

    for _ in tqdm(range(n_missing)):
        # A fresh random example for every request.
        text = examples.sample(1).values[0]

        comment = f'''

        The text example is below: 
        """
        {text}
        """

        '''

        generated.append(analyze_sentiment(comment, system_prompt=system_prompt_overall))



In [101]:
# system_prompt_overall = f'''

#         You are a sentiment analysis expert. You need to help to create a dataset of texts needed for training an ML model. Your help is to write a text which will be included to the dataset. This is important that the text must {language}. The sentiment of the text should express {sentiment}.
#         The example of such a text is provided below. Write the text similar to the provided example and remember, that the resulted text must {language}. 
        
#         Also, you must write only the text without any additional comments from yourself. 

#     '''

# comment = f'''

# The text example is below: 
# """
# {text}
# """
# '''

In [102]:
# analyze_sentiment(comment, system_prompt=system_prompt_overall)