In [2]:

import numpy as np
import pandas as pd
import nlpaug.augmenter.word as naw

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Load the labelled complaints dataset from CSV and preview its first rows.
df = pd.read_csv('Datasets/NewDepatments_Dataset.csv')
df.head()

Unnamed: 0,Complaints,Department
0,We want to increase our Wi-Fi services,Technical
1,Increase the speed for the upload and the down...,Technical
2,"They said that to make the speed faster, we h...",Technical
3,So whenever I happen to dial this number no ch...,Billing
4,Sometimes we get delay in getting response fro...,HR


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(210, 2)

In [5]:
# Number of complaints per department, to check class balance before augmenting.
df['Department'].value_counts()

Department
Technical    70
Billing      70
HR           70
Name: count, dtype: int64

In [6]:
# Build a balanced, augmented copy of the complaints dataset.
#
# For every department the output contains:
#   1. every original complaint (none are dropped),
#   2. one augmented copy of each complaint, and
#   3. only when the department is smaller than the largest one, extra rows
#      resampled with replacement from (1) + (2),
# so that each department ends up with exactly 2 * max_count rows.

SEED = 42  # fixes the pandas resampling below so reruns give the same rows

# Size of the largest department; every department is grown to twice this.
max_count = df['Department'].value_counts().max()
target_count = 2 * max_count

# Keywords / phrases per department that are passed through the augmenter
department_words = {
    'Billing': ['bill', 'charges', 'balance', 'activate', 'packages', 'billing','billed'],
    'Technical': ['repair', 'broke down', 'not working', 'no signal', 'maintenance'],
    'HR': ['agent', 'rude', 'didnt answer the phone']
}

# The augmenter does not depend on the department, so build it once.
# NOTE(review): SEED only governs the pandas resampling; whether nlpaug's own
# random choices can be seeded is not shown here - confirm before relying on
# bit-for-bit reproducibility of the augmented texts.
aug = naw.RandomWordAug()

# Collect one frame per department and concatenate once at the end, instead of
# growing a DataFrame inside the loop.
department_frames = []

for department in df['Department'].unique():
    texts = df[df['Department'] == department]['Complaints'].tolist()

    # One augmented copy per complaint
    augmented_texts = []
    for text in texts:
        # Replace each department keyword with its augmented form.
        # str.replace matches substrings, so 'bill' also matches inside
        # 'billing' / 'billed'.
        for word in department_words.get(department, []):
            augmented_word = aug.augment(word)
            text = text.replace(word, augmented_word[0] if augmented_word else word)

        augmented_texts.append(text)

    # Originals are always kept; top up only the shortfall so that smaller
    # departments reach the same size as the largest one.
    pool = texts + augmented_texts
    shortfall = int(target_count - len(pool))
    if shortfall > 0:
        extra_texts = (
            pd.Series(pool, dtype=object)
            .sample(n=shortfall, replace=True, random_state=SEED)
            .tolist()
        )
    else:
        extra_texts = []

    department_frames.append(
        pd.DataFrame({'Complaints': pool + extra_texts, 'Department': department})
    )

# Single concatenation; fall back to an empty frame with the expected columns
# when the input has no rows at all.
if department_frames:
    augmented_df = pd.concat(department_frames, ignore_index=True)
else:
    augmented_df = pd.DataFrame(columns=['Complaints', 'Department'])

# Save the augmented dataset to a new CSV file
augmented_df.to_csv('Datasets/balanced_augmented_dataset.csv', index=False)


In [7]:
# Read back the CSV written by the augmentation cell and check its dimensions.
df2= pd.read_csv('Datasets/balanced_augmented_dataset.csv')
df2.shape

(420, 2)

In [8]:
# Rows per department in the reloaded augmented dataset.
df2['Department'].value_counts()

Department
Technical    140
Billing      140
HR           140
Name: count, dtype: int64