In [1]:
# Data Augmentation Notebook

# Import Required Libraries
import os
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
import random
import librosa

# Ensure NLTK resources are downloaded
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# Seed the random number generators so the stochastic augmentations in this
# notebook give the same output on every Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Paths to Data
TEXT_DATA_PATH = 'datasets/processed/meld_features_updated.csv'
AUDIO_DATA_PATH = 'datasets/raw/MELD/train/audio'

# Load Data
data = pd.read_csv(TEXT_DATA_PATH)
print("Dataset loaded successfully!")

# Identify Minority Classes
# A class is treated as "minority" when its row count is below the mean
# row count across all emotion classes.
emotion_counts = data['Emotion'].value_counts()
minority_classes = emotion_counts[emotion_counts < emotion_counts.mean()].index.tolist()
print("Minority classes:", minority_classes)

Dataset loaded successfully!
Minority classes: ['anger', 'surprise', 'sadness', 'disgust', 'fear']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajt8\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajt8\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajt8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Synonym Replacement Function
def synonym_replacement(sentence, n=2):
    """
    Replace up to 'n' randomly chosen words in a sentence with a WordNet synonym.

    Word order is preserved: only the chosen positions change. Words for which
    WordNet offers no lemma different from the word itself are skipped and do
    not count towards 'n', so fewer than 'n' words are replaced only when the
    sentence does not contain enough replaceable words.

    Args:
        sentence: Whitespace-separated text to augment.
        n: Maximum number of words to replace. Values <= 0 leave the
            words unchanged.

    Returns:
        The augmented sentence, with words joined by single spaces.
    """
    words = sentence.split()
    new_words = words.copy()

    # Visit positions (not words) in random order so the sentence itself is
    # never shuffled and repeated words are handled independently.
    positions = list(range(len(words)))
    random.shuffle(positions)

    replaced = 0
    for position in positions:
        if replaced >= n:
            break
        synonym = _first_distinct_synonym(words[position])
        if synonym is None:
            # Nothing usable for this word; try another position instead of
            # spending one of the 'n' replacements on it.
            continue
        new_words[position] = synonym
        replaced += 1
    return ' '.join(new_words)


def _first_distinct_synonym(word):
    """Return the first WordNet lemma for 'word' that differs from it, or None."""
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # WordNet joins multi-word lemmas with underscores.
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != word.lower():
                return candidate
    return None

# Build a synonym-replaced copy of every minority-class utterance
augmented_text, augmented_labels = [], []
for emotion in minority_classes:
    # Utterances labelled with this emotion, in the order they appear in `data`
    utterances = data.loc[data['Emotion'] == emotion, 'Utterance']
    rewritten = [synonym_replacement(utterance) for utterance in utterances]
    augmented_text.extend(rewritten)
    augmented_labels.extend([emotion] * len(rewritten))

# Pair each augmented utterance with its label
augmented_text_df = pd.DataFrame({'Utterance': augmented_text, 'Emotion': augmented_labels})

# Persist the augmented set and report completion
augmented_text_df.to_csv('datasets/processed/augmented_text_synonym.csv', index=False)
print("Synonym replacement augmentation completed and saved!")

Synonym replacement augmentation completed and saved!


In [14]:
def random_insertion(sentence, n=2):
    """
    Insert 'n' extra words into a sentence at random positions.

    Each inserted word is obtained by picking a random word from the current
    word list (which includes words inserted on earlier iterations) and
    passing it through synonym_replacement(..., n=1).

    Args:
        sentence: Whitespace-separated text to augment.
        n: Number of words to insert.

    Returns:
        The augmented sentence, with words joined by single spaces. An empty
        or whitespace-only sentence yields an empty string.
    """
    words = sentence.split()
    if not words:
        # Nothing to draw a source word from; random.choice would raise
        # IndexError on an empty list.
        return ' '.join(words)
    for _ in range(n):
        new_word = synonym_replacement(random.choice(words), n=1)
        # randint is inclusive on both ends, so len(words) means "append".
        position = random.randint(0, len(words))
        words.insert(position, new_word)
    return ' '.join(words)

# Build a random-insertion copy of every minority-class utterance
augmented_text, augmented_labels = [], []
for emotion in minority_classes:
    # Utterances labelled with this emotion, in the order they appear in `data`
    utterances = data.loc[data['Emotion'] == emotion, 'Utterance']
    expanded = [random_insertion(utterance) for utterance in utterances]
    augmented_text.extend(expanded)
    augmented_labels.extend([emotion] * len(expanded))

# Pair each augmented utterance with its label, then write the set to disk
augmented_text_df = pd.DataFrame({'Utterance': augmented_text, 'Emotion': augmented_labels})
augmented_text_df.to_csv('datasets/processed/augmented_text_insertion.csv', index=False)
print("Random insertion augmentation completed!")

Random insertion augmentation completed!


In [15]:
def add_noise(sentence, noise_level=0.1):
    """
    Add character-level noise by scrambling the letters of some words.

    Roughly 'noise_level' of the words (rounded down, but at least one when
    'noise_level' is positive and the sentence is non-empty) are chosen at
    distinct random positions, and the characters of each chosen word are
    randomly permuted.

    Args:
        sentence: Whitespace-separated text to augment.
        noise_level: Fraction of words to scramble. Values <= 0 leave the
            words unchanged; values >= 1 scramble every word.

    Returns:
        The noisy sentence, with words joined by single spaces.
    """
    words = sentence.split()
    if not words or noise_level <= 0:
        return ' '.join(words)

    # int() truncates, so without the floor of 1 any sentence shorter than
    # 1 / noise_level words would come back unchanged.
    n_noisy = min(len(words), max(1, int(len(words) * noise_level)))

    # Sample positions without replacement so n_noisy distinct words change.
    for idx in random.sample(range(len(words)), n_noisy):
        words[idx] = ''.join(random.sample(words[idx], len(words[idx])))
    return ' '.join(words)

# Build a character-noised copy of every minority-class utterance
augmented_text, augmented_labels = [], []
for emotion in minority_classes:
    # Utterances labelled with this emotion, in the order they appear in `data`
    utterances = data.loc[data['Emotion'] == emotion, 'Utterance']
    noised = [add_noise(utterance) for utterance in utterances]
    augmented_text.extend(noised)
    augmented_labels.extend([emotion] * len(noised))

# Pair each augmented utterance with its label, then write the set to disk
augmented_text_df = pd.DataFrame({'Utterance': augmented_text, 'Emotion': augmented_labels})
augmented_text_df.to_csv('datasets/processed/augmented_text_noise.csv', index=False)
print("Noise addition augmentation completed!")

Noise addition augmentation completed!
