# In [6]:
import os
import pandas as pd
import re

# Function to clean text
def clean_text(text):
    """Return *text* lower-cased with punctuation and symbols removed.

    Every character that is not an ASCII letter, an ASCII digit or
    whitespace is deleted (not replaced by a space), so "Don't!" becomes
    "dont". Whitespace is left exactly as it was.
    """
    text = text.lower()
    return re.sub(r"[^a-zA-Z0-9\s]", '', text)


# Define keywords for happy and sad labels.
# Entries are written in their cleaned form (no apostrophes, lower case) so
# they can be compared directly against the output of clean_text().
happy_keywords = ["good", "know", "im", "love", "like", "mean", "fine", "happy", "going", "get", "excited", "thats", "okay", "youre", "theyre", "right", "yeah", "spot", "thing", "hey"]
sad_keywords = ['sad', 'unhappy', 'miserable', 'hate', 'oh', 'help', 'sorrow', 'upset', 'depressed', 'downcast', 'gloomy', 'dont', 'cant']


# Function to label texts based on keywords
def label_text(text):
    """Label *text* as 'happy', 'sad' or 'neutral' from keyword hits.

    The text is cleaned with clean_text() and split on whitespace. Each
    keyword list scores one point per keyword that occurs as a whole word,
    however many times it is repeated. The list with the higher score wins;
    a tie (including no hits at all) yields 'neutral'.

    Keywords are matched against whole words rather than as substrings of
    the full string. A substring test lets short keywords fire inside
    unrelated words ("hate" in "whatever", "oh" in "john", "im" in "time",
    "happy" in "unhappy") and skews the labels.
    """
    # A set gives whole-word membership tests and ignores repeated words.
    tokens = set(clean_text(text).split())
    happy_count = sum(keyword in tokens for keyword in happy_keywords)
    sad_count = sum(keyword in tokens for keyword in sad_keywords)

    if happy_count > sad_count:
        return 'happy'
    if sad_count > happy_count:
        return 'sad'
    return 'neutral'

# Path to the IEMOCAP dataset directory
dataset_path = r"C:\Users\SAKSHEE\Downloads\IEMOCAP_full_release"

# Lists to store data
texts = []
labels = []

# Matches a leading "[start-end]:" timing field, e.g. "[006.2901-008.2357]: ".
_TIMING_PREFIX = re.compile(r"^\s*\[\s*\d+(?:\.\d+)?\s*-\s*\d+(?:\.\d+)?\s*\]:\s*")


def extract_utterance_text(line):
    """Return the spoken text of one transcription line.

    The first space-separated field (the utterance id) is dropped. If what
    remains starts with a bracketed numeric "[start-end]:" timing field,
    that field is removed as well; otherwise the remainder is returned
    unchanged. Blank or id-only lines give an empty string.

    NOTE(review): IEMOCAP transcription lines are expected to look like
    "<utterance-id> [start-end]: text" -- confirm against the release.
    Without this step the timestamp digits survive cleaning and end up in
    the saved text.
    """
    _, _, remainder = line.strip().partition(' ')
    return _TIMING_PREFIX.sub('', remainder, count=1)


# Iterate over the dataset files
for session in range(1, 6):
    session_path = os.path.join(dataset_path, f'Session{session}', 'dialog', 'transcriptions')
    for root, _, files in os.walk(session_path):
        for file in files:
            if not file.endswith('.txt'):
                continue
            with open(os.path.join(root, file), 'r') as f:
                for line in f:
                    text = extract_utterance_text(line)
                    # Blank or id-only lines carry no utterance; skipping
                    # them avoids empty rows labelled 'neutral'.
                    if not text:
                        continue
                    label = label_text(text)
                    texts.append(clean_text(text))
                    labels.append(label)

# Destination of the relabelled corpus
output_file_path = r"C:\Users\SAKSHEE\Downloads\relabelled_data_combined.csv"

# Pair the collected texts with their labels, one column each, and write
# the table out without the index column
df = pd.DataFrame(data={'text': texts, 'label': labels})
df.to_csv(path_or_buf=output_file_path, index=False)

# Count the rows per label and print the counts
label_distribution = df.loc[:, 'label'].value_counts()
print(label_distribution)
