In [1]:
import pandas as pd
import nlpaug.augmenter.word as naw
import random

In [2]:
df = pd.read_csv('Data\\Cleaned_Call_Recordings.csv')

In [3]:
import pandas as pd
import numpy as np


# Assuming you have already imported other necessary libraries and defined functions

# Find the maximum count among the labels
max_count = df['label'].value_counts().max()
if max_count <= 0:
    raise ValueError("The maximum count among the labels is not greater than 0.")

# Define specific words for each label
label_words = {
    'Aggressive': ['worst', 'unacceptable', 'disappointed', 'fix it', 'soon'],
    'Non-aggressive': ['repair', 'broke down', 'not working', 'no signal', 'maintenance'],
}

# Augment the data to balance the sample count across all labels
augmented_df = pd.DataFrame(columns=['VoiceClip_No', 'TextTranscript', 'label', 'processed_content'])

for label in df['label'].unique():
    label_df = df[df['label'] == label]

    # Extract relevant columns
    texts = label_df['processed_content'].tolist()  # Use the correct column name
    voiceclip_nos = label_df['VoiceClip_No'].tolist()  # Use the correct column name

    # Augment the data to match the maximum count
    augmented_texts = []
    augmented_voiceclip_nos = []
    aug = naw.RandomWordAug()

    for text, voiceclip_no in zip(texts, voiceclip_nos):
        # Replace specific words for each label
        for word in label_words.get(label, []):
            augmented_word = aug.augment(word)
            text = text.replace(word, augmented_word[0] if augmented_word else word)

        augmented_texts.append(text)
        augmented_voiceclip_nos.append(voiceclip_no)

    # Sample each label to match the maximum count, ensuring max_count is greater than 0
    if max_count > 0:
        sampled_texts = pd.Series(texts).sample(n=min(max_count, len(texts)), replace=True).tolist()
        sampled_voiceclip_nos = pd.Series(voiceclip_nos).sample(n=min(max_count, len(voiceclip_nos)), replace=True).tolist()
    else:
        sampled_texts = []
        sampled_voiceclip_nos = []

    # If the label is 'Agg', add more augmented samples
    if label == 'Aggressive':
        for text, voiceclip_no in zip(texts[:2], voiceclip_nos[:2]):
            for word in label_words.get(label, []):
                augmented_word = aug.augment(word)
                text = text.replace(word, augmented_word[0] if augmented_word else word)
            augmented_texts.append(text)
            augmented_voiceclip_nos.append(voiceclip_no)

    # If the label is 'Non', add more augmented samples
    elif label == 'Non-aggressive':
        for text, voiceclip_no in zip(texts[:5], voiceclip_nos[:5]):  # Increase the number here
          for word in label_words.get(label, []):
            augmented_word = aug.augment(word)
            text = text.replace(word, augmented_word[0] if augmented_word else word)
        augmented_texts.append(text)
        augmented_voiceclip_nos.append(voiceclip_no)


    # Create a DataFrame for the current label
    label_df = pd.DataFrame({
        'VoiceClip_No': sampled_voiceclip_nos + augmented_voiceclip_nos,
        'TextTranscript': sampled_texts + augmented_texts,
        'label': label,
        'processed_content': sampled_texts + augmented_texts
    })

    # Concatenate with the main DataFrame
    augmented_df = pd.concat([augmented_df, label_df])

# Save the augmented dataset to a new CSV file
augmented_df.to_csv('Data\\augmented_Call_Recordings.csv', index=False)


In [4]:
df2 = pd.read_csv('Data\\augmented_Call_Recordings.csv')

# Print the shape of the DataFrame
print(df2.shape)

(841, 4)


In [5]:
df2['label'].value_counts()

label
Aggressive        428
Non-aggressive    413
Name: count, dtype: int64