In [6]:
import pandas as pd
import nlpaug.augmenter.word as naw

In [7]:
# Load the cleaned call-conversation dataset that the augmentation step works on.
# The path is relative to the notebook's working directory.
input_csv = 'Data\\Cleaned_Call_Conversation(Emotion).csv'
df = pd.read_csv(input_csv)

In [11]:


# Oversample and augment the call-transcript dataset, then write it to CSV.
#
# Requires a DataFrame `df` with the columns 'label', 'processed_content' and
# 'VoiceClip_No' (the only columns this cell reads).
#
# For every label the output contains, in this order:
#   1. `max_count` rows resampled (with replacement) from that label's rows,
#   2. one word-augmented copy of every row of that label,
#   3. a label-specific number of additional word-augmented copies taken from
#      the first rows of that label (see EXTRA_AUGMENTED_PER_LABEL).
#
# NOTE(review): each label therefore ends up with
# `max_count + <label size> + <extras>` rows, so the per-label totals are only
# equal when the labels already had matching sizes. Confirm whether strictly
# equal class counts are required before relying on the "balanced" file name.

# Seed for the pandas resampling so a re-run of this cell draws the same rows.
# It does not control the randomness inside the nlpaug augmenter.
RANDOM_SEED = 42

# Column layout of the generated dataset.
OUTPUT_COLUMNS = ['VoiceClip_No', 'TextTranscript', 'label', 'processed_content']

# Find the maximum count among the labels
max_count = df['label'].value_counts().max()

# Define specific words for each label
label_words = {
    'Agg': ['worst', 'unacceptable', 'disappointed', 'fix it', 'soon'],
    'Non': ['repair', 'broke down', 'not working', 'no signal', 'maintenance'],
}

# Number of additional augmented copies per label, taken from the first rows of
# that label. Labels that are not listed get no additional copies.
EXTRA_AUGMENTED_PER_LABEL = {'Agg': 20, 'Non': 3}


def augment_label_words(text, words, augmenter):
    """Return `text` with every occurrence of each word in `words` augmented.

    Parameters
    ----------
    text : str
        Transcript text to rewrite.
    words : list of str
        Words/phrases to look for; each is augmented on its own and the result
        is substituted for every occurrence in `text`.
    augmenter : object
        Object exposing ``augment(word)``. The first element of its return
        value is used as the replacement (assumes a list of strings is
        returned - TODO confirm against the installed nlpaug version). An
        empty/falsy result leaves the word unchanged.

    Returns
    -------
    str
        The rewritten text.
    """
    for word in words:
        augmented_word = augmenter.augment(word)
        text = text.replace(word, augmented_word[0] if augmented_word else word)
    return text


# One augmenter is enough for all labels; it is built with default settings.
aug = naw.RandomWordAug()

# Per-label frames are collected here and concatenated once at the end instead
# of growing a DataFrame inside the loop.
label_frames = []

# dropna(): a missing label has no matching rows, and sampling from an empty
# selection would raise.
for label in df['label'].dropna().unique():
    label_df = df[df['label'] == label]

    # Extract relevant columns
    texts = label_df['processed_content'].tolist()
    voiceclip_nos = label_df['VoiceClip_No'].tolist()
    words = label_words.get(label, [])

    # Resample whole rows so that every sampled text keeps its own
    # VoiceClip_No. Sampling the two columns separately would pair transcripts
    # with unrelated clip numbers.
    resampled_rows = label_df[['VoiceClip_No', 'processed_content']].sample(
        n=max_count, replace=True, random_state=RANDOM_SEED
    )
    sampled_texts = resampled_rows['processed_content'].tolist()
    sampled_voiceclip_nos = resampled_rows['VoiceClip_No'].tolist()

    # Augment every row once, then the first `extra_count` rows a second time.
    extra_count = EXTRA_AUGMENTED_PER_LABEL.get(label, 0)
    augmented_voiceclip_nos = voiceclip_nos + voiceclip_nos[:extra_count]
    augmented_texts = [
        augment_label_words(text, words, aug)
        for text in texts + texts[:extra_count]
    ]

    # Both text columns of the output carry the processed content; the
    # original transcript column of `df` is not read by this cell.
    combined_texts = sampled_texts + augmented_texts
    label_frames.append(pd.DataFrame({
        'VoiceClip_No': sampled_voiceclip_nos + augmented_voiceclip_nos,
        'TextTranscript': combined_texts,
        'label': label,
        'processed_content': combined_texts,
    }))

# Single concatenation; fall back to an empty frame with the expected columns
# when `df` has no labelled rows.
if label_frames:
    augmented_df = pd.concat(label_frames, ignore_index=True)
else:
    augmented_df = pd.DataFrame(columns=OUTPUT_COLUMNS)

# Save the augmented dataset to a new CSV file
augmented_df.to_csv('Data\\balanced_augmented_Call_Conversation(Emotion).csv', index=False)

In [13]:
# Read back the CSV produced by the augmentation step to check what was written.
balanced_csv = 'Data\\balanced_augmented_Call_Conversation(Emotion).csv'
df2 = pd.read_csv(balanced_csv)

# Show the (rows, columns) size of the reloaded dataset.
print(df2.shape)

(602, 4)


In [14]:
# Number of rows per label in the reloaded dataset (displayed as the cell result).
label_column = df2['label']
label_column.value_counts()

label
Agg    330
Non    272
Name: count, dtype: int64