In [1]:
import pandas as pd
import sklearn
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt


In [2]:
df = pd.read_excel('DSGP dataSet2.xlsx')

In [3]:
df.head(10)

Unnamed: 0,VoiceClip_No,TextTranscripts,label
0,Voice1,"We want to increase our Wi-Fi services, increa...",Non
1,Voice2,I want to know whether I will be charged for t...,Non
2,Voice3,I made a complaint today morning.I did not ge...,Agg
3,Voice4,I'm calling to reactivate my line. I've been ...,Agg
4,Voice5,Can you check and tell me my balance ? \n,Non
5,Voice6,I want to log a complaint that my internet POT...,Non
6,Voice7,I would like to add on some GB to my connectio...,Non
7,Voice8,I have the LOS light Blinking red color in the...,Non
8,Voice9,\nActually I made this complaint on Saturday.O...,Agg
9,voice10,Hi can i know how much is the minimum that i s...,Non


In [4]:
df.columns

Index(['VoiceClip_No', 'TextTranscripts', 'label'], dtype='object')

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import re

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = ' '.join([word for word in text.split() if not word.startswith('@')])  # Remove mentions (words starting with '@')
    text = re.sub(r'\d{10}', '', text)  # Remove contact numbers
    text = re.sub(r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}', '', text)  # Remove dates
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    tokens = text.split()
    tokens = [word for word in tokens if len(word) > 2]  # Remove short words (length <= 2)
    tokens = [word.lower() for word in tokens if word.isalnum()]  # Tokenization, lowercase, and removing stopwords
    # tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

In [7]:
df['processed_content'] = df['TextTranscripts'].apply(preprocess_text)

In [8]:
df.head(5)

Unnamed: 0,VoiceClip_No,TextTranscripts,label,processed_content
0,Voice1,"We want to increase our Wi-Fi services, increa...",Non,want increase our increase thespeed for the up...
1,Voice2,I want to know whether I will be charged for t...,Non,want know whether will charged for the for dia...
2,Voice3,I made a complaint today morning.I did not ge...,Agg,made complaint today did not get any calls fro...
3,Voice4,I'm calling to reactivate my line. I've been ...,Agg,calling reactivate been requesting this for th...
4,Voice5,Can you check and tell me my balance ? \n,Non,can you check and tell balance


In [9]:
print(df['label'].value_counts())

label
Agg    155
Non    114
Name: count, dtype: int64


In [10]:
df.to_csv('processed_datasetnewAug.csv')

In [27]:
import pandas as pd
import nlpaug.augmenter.word as naw

# Assuming you have a DataFrame called df with columns 'TextTranscripts', 'VoiceClip_No', 'label', and 'processed_content'

# Find the maximum count among the labels
max_count = df['label'].value_counts().max()

# Define specific words for each label
label_words = {
    'Agg': ['worst', 'unacceptable', 'disappointed', 'fix it', 'soon'],
    'Non': ['repair', 'broke down', 'not working', 'no signal', 'maintenance'],
}

# Augment the data to balance the sample count across all labels
augmented_df = pd.DataFrame(columns=['VoiceClip_No', 'TextTranscript', 'label', 'processed_content'])

for label in df['label'].unique():
    label_df = df[df['label'] == label]

    # Extract relevant columns
    texts = label_df['processed_content'].tolist()  # Use the correct column name
    voiceclip_nos = label_df['VoiceClip_No'].tolist()  # Use the correct column name

    # Augment the data to match the maximum count
    augmented_texts = []
    augmented_voiceclip_nos = []
    aug = naw.RandomWordAug()

    for text, voiceclip_no in zip(texts, voiceclip_nos):
        # Replace specific words for each label
        for word in label_words.get(label, []):
            augmented_word = aug.augment(word)
            text = text.replace(word, augmented_word[0] if augmented_word else word)

        augmented_texts.append(text)
        augmented_voiceclip_nos.append(voiceclip_no)

    # Sample each label to match the maximum count
    sampled_texts = pd.Series(texts).sample(n=max_count, replace=True).tolist()
    sampled_voiceclip_nos = pd.Series(voiceclip_nos).sample(n=max_count, replace=True).tolist()

    # If the label is 'Agg', add 20 more augmented samples
    if label == 'Agg':
        for text, voiceclip_no in zip(texts[:20], voiceclip_nos[:20]):
            for word in label_words.get(label, []):
                augmented_word = aug.augment(word)
                text = text.replace(word, augmented_word[0] if augmented_word else word)
            augmented_texts.append(text)
            augmented_voiceclip_nos.append(voiceclip_no)

    # If the label is 'Non', add 3 more augmented samples
    elif label == 'Non':
        for text, voiceclip_no in zip(texts[:3], voiceclip_nos[:3]):
            for word in label_words.get(label, []):
                augmented_word = aug.augment(word)
                text = text.replace(word, augmented_word[0] if augmented_word else word)
            augmented_texts.append(text)
            augmented_voiceclip_nos.append(voiceclip_no)

    # Create a DataFrame for the current label
    label_df = pd.DataFrame({
        'VoiceClip_No': sampled_voiceclip_nos + augmented_voiceclip_nos,
        'TextTranscript': sampled_texts + augmented_texts,
        'label': label,
        'processed_content': sampled_texts + augmented_texts
    })

    # Concatenate with the main DataFrame
    augmented_df = pd.concat([augmented_df, label_df])

# Save the augmented dataset to a new CSV file
augmented_df.to_csv('balanced_augmented_Newdataset.csv', index=False)



In [28]:
df2 = pd.read_csv('balanced_augmented_Newdataset.csv')

# Print the shape of the DataFrame
print(df2.shape)

(602, 4)


In [29]:
df2['label'].value_counts()

label
Agg    330
Non    272
Name: count, dtype: int64