In [54]:
import numpy as np
import pandas as pd
import nlpaug.augmenter.word as naw

In [55]:
# Load the raw complaints table (path is relative to the working directory)
# and preview its first rows.
df = pd.read_csv('Complaints.csv')
df.head()

Unnamed: 0,Complaints,Department
0,My bill seems higher than usual.,Billing
1,"I received a late payment notice, but I paid o...",Billing
2,Can you explain the charges on my recent bill?,Billing
3,"My auto-payment didn't go through, and I got c...",Billing
4,I was billed for a service I canceled last month.,Billing


In [56]:
# Size of the raw table as (rows, columns).
df.shape

(142, 2)

In [57]:
# Count complaints per department to check how balanced the classes are.
df['Department'].value_counts()

Billing      55
Technical    52
HR           35
Name: Department, dtype: int64

In [58]:
# Balance the dataset so that every department ends up with 2 * max_count rows:
#   * max_count rows drawn with replacement from the department's originals, plus
#   * max_count keyword-augmented rows (one per original, topped up by re-augmenting
#     originals from the start of the list until max_count is reached).
# The result is written to 'balanced_augmented_dataset.csv'.

# Size of the largest department; every class is balanced against this.
max_count = df['Department'].value_counts().max()

# Seed for the oversampling step so a fresh Restart & Run All draws the same rows.
# NOTE(review): this does not seed nlpaug's own randomness — confirm how to seed it
# if fully reproducible augmentation is required.
SEED = 42

# Keywords/phrases per department that are passed through the augmenter and
# substituted back into the complaint text.
department_words = {
    'Billing': ['bill', 'charges', 'balance', 'activate', 'packages', 'billing'],
    'Technical': ['repair', 'broke down', 'not working', 'no signal', 'maintenance'],
    'HR': ['agent', 'rude', 'didnt answer the phone']
}


def _augment_text(text, words, augmenter):
    """Return `text` with every keyword in `words` replaced by its augmented form.

    Parameters
    ----------
    text : str
        The complaint text to rewrite.
    words : list of str
        Keywords/phrases to augment; each is augmented separately and then
        substituted into `text` with ``str.replace``.
    augmenter : object
        Anything exposing ``augment(str)``; the result may be a string or a
        list of strings.

    Returns
    -------
    str
        The rewritten text. A keyword is left untouched when the augmenter
        returns an empty result for it.
    """
    for word in words:
        result = augmenter.augment(word)
        if isinstance(result, str):
            # A plain-string result must be used whole; indexing it with [0]
            # would keep only its first character.
            replacement = result or word
        else:
            replacement = result[0] if result else word
        # NOTE(review): str.replace is a case-sensitive substring match, so 'bill'
        # also rewrites the start of 'billing'/'billed' — confirm whether
        # whole-word matching is intended.
        text = text.replace(word, replacement)
    return text


# Loop-invariant: one augmenter and one random state shared by all departments.
# NOTE(review): RandomWordAug is built with its defaults; verify that augmenting a
# lone keyword actually changes it with the default action.
aug = naw.RandomWordAug()
rng = np.random.RandomState(SEED)

department_frames = []

for department in df['Department'].unique():
    texts = df.loc[df['Department'] == department, 'Complaints'].tolist()
    words = department_words.get(department, [])

    # Plain oversampling: max_count originals drawn with replacement.
    sampled_texts = (
        pd.Series(texts)
        .sample(n=max_count, replace=True, random_state=rng)
        .tolist()
    )

    # Augmented rows: every original once, then wrap around the originals to fill
    # the gap up to max_count. For the counts shown above this is 20 extra rows for
    # HR and 3 for Technical, but it is now derived from the data instead of being
    # hard-coded, and it still works when the gap exceeds len(texts).
    deficit = max_count - len(texts)
    sources = texts + [texts[i % len(texts)] for i in range(deficit)]
    augmented_texts = [_augment_text(source, words, aug) for source in sources]

    department_frames.append(
        pd.DataFrame({'Complaints': sampled_texts + augmented_texts, 'Department': department})
    )

# Concatenate once instead of growing a frame inside the loop; fall back to an
# empty, correctly-labelled frame when there are no departments at all.
if department_frames:
    augmented_df = pd.concat(department_frames, ignore_index=True)
else:
    augmented_df = pd.DataFrame(columns=['Complaints', 'Department'])

# Invariant: every department has exactly 2 * max_count rows.
assert augmented_df['Department'].value_counts().eq(2 * max_count).all()

# Save the augmented dataset to a new CSV file
augmented_df.to_csv('balanced_augmented_dataset.csv', index=False)


In [59]:
# Read the saved CSV back in so the check below runs on what was actually written.
df2= pd.read_csv('balanced_augmented_dataset.csv')

In [60]:
# Per-department row counts of the saved dataset; equal counts mean the classes are balanced.
df2['Department'].value_counts()

Billing      110
Technical    110
HR           110
Name: Department, dtype: int64