In [95]:
# !wget https://huggingface.co/datasets/declare-lab/MELD/resolve/main/MELD.Raw.tar.gz
# This file was too big for my computer to handle so I just downloaded the train file directly from the repository. 
# !pip install nltk
!nltk.download('wordnet')

/bin/bash: -c: line 0: syntax error near unexpected token `'wordnet''
/bin/bash: -c: line 0: `nltk.download('wordnet')'


## Necessary imports

In [96]:
import pandas as pd
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import csv
import string
import os

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sindhukothe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [97]:
dialogues = pd.read_csv("labelled_emotions.csv") 

In [98]:
dialogues.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,Why do all you’re coffee mugs have numbers on ...,Mark,surprise,positive,0,0,3,19,"00:14:38,127","00:14:40,378"
1,2,Oh. That’s so Monica can keep track. That way ...,Rachel,anger,negative,0,1,3,19,"00:14:40,629","00:14:47,385"
2,3,Y'know what?,Rachel,neutral,neutral,0,2,3,19,"00:14:56,353","00:14:57,520"
3,19,"Come on, Lydia, you can do it.",Joey,neutral,neutral,1,0,1,23,"0:10:44,769","0:10:46,146"
4,20,Push!,Joey,joy,positive,1,1,1,23,"0:10:46,146","0:10:46,833"


## This section is to split the dialougues on the basis of emotion

In [99]:
dialogues["Emotion"].unique()

array(['surprise', 'anger', 'neutral', 'joy', 'sadness', 'fear',
       'disgust'], dtype=object)

In [100]:
# List of emotions
emotions = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

# Ensure the folder exists
os.makedirs("Emotions", exist_ok=True)

# Create a CSV file for each emotion with sample headers
for emotion in emotions:
    file_path = f"Emotions/{emotion}.csv"
    
    # Only create the file if it doesn't already exist
    if not os.path.exists(file_path):
        # Create a basic DataFrame structure
        df = pd.DataFrame(columns=["dialogue", "sentiment"])
        df.to_csv(file_path, index=False)
        print(f"Created: {file_path}")
    else:
        print(f"Already exists: {file_path}")

Already exists: Emotions/anger.csv
Already exists: Emotions/disgust.csv
Already exists: Emotions/fear.csv
Already exists: Emotions/joy.csv
Already exists: Emotions/sadness.csv
Already exists: Emotions/surprise.csv
Already exists: Emotions/neutral.csv


In [101]:
for index, row in dialogues.iterrows():
    new_row = [row["Utterance"], row["Sentiment"]]
    # print(new_row)
    fname = "Emotions/" + row["Emotion"] + ".csv"
    with open(fname, "a", newline="") as f:  
        writer = csv.writer(f)
        writer.writerow(new_row)

## This section is to split dialogues on the basis of VAD scores

In [102]:
#Caches for the VAD scores so that we would not have to iterate through the entire df everytime we need something
val_cache = {}
ar_cache = {}
dom_cache = {}

#lemmatizer for words that are not in the csv file
lemmatizer = WordNetLemmatizer()

### Valence

In [103]:
valence = pd.read_csv("VAD_lexicon/valence.txt", sep="\t",) 
valence["valence"].describe()

count    44728.000000
mean        -0.002367
std          0.500050
min         -1.000000
25%         -0.376000
50%          0.000000
75%          0.375000
max          1.000000
Name: valence, dtype: float64

In [104]:
def get_val(str):
    words = str.split(" ")
    val_score = 0
    for word in words:
        word = word.lower() # making the case the same to increase the chances of running into the word
        word = word.translate(str.maketrans('', '', string.punctuation)) #removing punctuation 
        if word not in val_cache.keys():
            word_valence = valence.loc[valence["term"] == word, "valence"]
            if len(word_valence) > 0:
                score = word_valence.values[0]
                val_cache[word] = score  # Cache the result
                val_score += score
            else:
                lemma = lemmatizer.lemmatize(word)
                if lemma != word:  # sometimes the word and the lemma are the same, 
                                         # in which case there isnt really a point in wasting time on the
                    if lemma in val_cache.keys():
                        val_score += val_cache[lemma]
                    else:
                        lemma_valence = valence.loc[valence["term"] == lemma, "valence"]
                        if len(lemma_valence) > 0:
                            score = lemma_valence.values[0]
                            val_cache[lemma] = score  
                            val_score += score
                        else:
                            val_cache[word] = 0
        else:
            val_score += val_cache[word]

    return val_score / len(words)

In [105]:
# List of ranges
ranges = ["0-1", "-1-0"]

# Ensure the folder exists
# os.makedirs("Valence", exist_ok=True)

for r in ranges:
    file_path = f"val_{r}.csv"
    val = get_val
    # Only create the file if it doesn't already exist
    if not os.path.exists(file_path):
        # Create a basic DataFrame structure
        df = pd.DataFrame(columns=["Utterance", "Emotion", "Sentiment", "Valence"])
        df.to_csv(file_path, index=False)
        print(f"Created: {file_path}")
    else:
        print(f"Already exists: {file_path}")

Created: val_0-1.csv
Created: val_-1-0.csv


In [106]:
for index, row in dialogues.iterrows():
    val = get_val(row["Utterance"])
    new_row = [row["Utterance"], row["Emotion"], row["Sentiment"], val]
    v_cat = ""
    
    if val > 0.0:
        v_cat = "0-1"
    else:
        v_cat = "-1-0"
    
    fname = "val_" + v_cat + ".csv"
    with open(fname, "a", newline="") as f:  
        writer = csv.writer(f)
        writer.writerow(new_row)

In [107]:
for r in ranges:
    fname = "val_" + r + ".csv"
    df = pd.read_csv(fname)
    df_cleaned = df.drop_duplicates()
    df_cleaned.to_csv(fname, index=False)

In [108]:
test = pd.read_csv("val_0-1.csv") 
test.head()

Unnamed: 0,Utterance,Emotion,Sentiment,Valence
0,Why do all you’re coffee mugs have numbers on ...,surprise,positive,0.111818
1,Oh. That’s so Monica can keep track. That way ...,anger,negative,0.055364
2,"Come on, Lydia, you can do it.",neutral,neutral,0.103143
3,"Let's get that ball and really move, hey, hey,...",joy,positive,0.119818
4,"Let's— I was just—yeah, right.",joy,positive,0.08


### Arousal

In [109]:
arousal = pd.read_csv("VAD_lexicon/arousal.txt", sep="\t",) 
arousal["arousal"].describe()

count    44728.000000
mean         0.022224
std          0.504393
min         -1.000000
25%         -0.362500
50%          0.000000
75%          0.500000
max          1.000000
Name: arousal, dtype: float64

In [110]:
def get_ar(str):
    words = str.split(" ")
    ar_score = 0
    for word in words:
        word = word.lower() # making the case the same to increase the chances of running into the word
        word = word.translate(str.maketrans('', '', string.punctuation)) #removing punctuation 
        if word not in ar_cache.keys():
            word_ar = arousal.loc[arousal["term"] == word, "arousal"]
            if len(word_ar) > 0:
                score = word_ar.values[0]
                ar_cache[word] = score  # Cache the result
                ar_score += score
            else:
                lemma = lemmatizer.lemmatize(word)
                if lemma != word:  # sometimes the word and the lemma are the same, 
                                         # in which case there isnt really a point in wasting time on the
                    if lemma in ar_cache.keys():
                        ar_score += ar_cache[lemma]
                    else:
                        lemma_ar = arousal.loc[arousal["term"] == lemma, "arousal"]
                        if len(lemma_ar) > 0:
                            score = lemma_ar.values[0]
                            ar_cache[lemma] = score  
                            ar_score += score
                        else:
                            ar_cache[word] = 0
        else:
            ar_score += ar_cache[word]

    return ar_score / len(words)

In [111]:
# List of ranges
ranges = ["0-1", "-1-0"]

# Ensure the folder exists
# os.makedirs("Valence", exist_ok=True)

for r in ranges:
    file_path = f"ar_{r}.csv"
    val = get_val
    # Only create the file if it doesn't already exist
    if not os.path.exists(file_path):
        # Create a basic DataFrame structure
        df = pd.DataFrame(columns=["Utterance", "Emotion", "Sentiment","Arousal"])
        df.to_csv(file_path, index=False)
        print(f"Created: {file_path}")
    else:
        print(f"Already exists: {file_path}")

Created: ar_0-1.csv
Created: ar_-1-0.csv


In [112]:
for index, row in dialogues.iterrows():
    ar = get_ar(row["Utterance"])
    new_row = [row["Utterance"], row["Emotion"], row["Sentiment"], ar]
    
    a_cat = ""
    
    if ar > 0.0:
        ar_cat = "0-1"
    else:
        ar_cat = "-1-0"
        # print(row["Utterance"])
    
    fname = "ar_" + ar_cat + ".csv"
    with open(fname, "a", newline="") as f:  
        writer = csv.writer(f)
        writer.writerow(new_row)

In [113]:
for r in ranges:
    fname = "ar_" + r + ".csv"
    df = pd.read_csv(fname)
    df_cleaned = df.drop_duplicates()
    df_cleaned.to_csv(fname, index=False)

In [114]:
test = pd.read_csv("ar_-1-0.csv") 
test.head()

Unnamed: 0,Utterance,Emotion,Sentiment,Arousal
0,Why do all you’re coffee mugs have numbers on ...,surprise,positive,-0.088455
1,Oh. That’s so Monica can keep track. That way ...,anger,negative,-0.154864
2,"Come on, Lydia, you can do it.",neutral,neutral,-0.132571
3,"Push 'em out, push 'em out, way out!",joy,positive,-0.00225
4,"Let's get that ball and really move, hey, hey,...",joy,positive,-0.023


### Dominance

In [115]:
dominance = pd.read_csv("VAD_lexicon/dominance.txt", sep="\t",) 
dominance["dominance"].describe()

count    44728.000000
mean         0.053255
std          0.439983
min         -1.000000
25%         -0.272000
50%          0.020000
75%          0.404000
max          1.000000
Name: dominance, dtype: float64

In [116]:
def get_dom(str):
    words = str.split(" ")
    dom_score = 0
    for word in words:
        word = word.lower() # making the case the same to increase the chances of running into the word
        word = word.translate(str.maketrans('', '', string.punctuation)) #removing punctuation 
        if word not in dom_cache.keys():
            word_dom = dominance.loc[dominance["term"] == word, "dominance"]
            if len(word_dom) > 0:
                score = word_dom.values[0]
                dom_cache[word] = score  # Cache the result
                dom_score += score
            else:
                lemma = lemmatizer.lemmatize(word)
                if lemma != word:  # sometimes the word and the lemma are the same, 
                                         # in which case there isnt really a point in wasting time on the
                    if lemma in dom_cache.keys():
                        dom_score += dom_cache[lemma]
                    else:
                        lemma_dom = dominance.loc[dominance["term"] == lemma, "dominance"]
                        if len(lemma_dom) > 0:
                            score = lemma_dom.values[0]
                            dom_cache[lemma] = score  
                            dom_score += score
                        else:
                            dom_cache[word] = 0
        else:
            dom_score += dom_cache[word]

    return dom_score / len(words)

In [117]:
ranges = ["0-1", "-1-0"]

# Ensure the folder exists
# os.makedirs("Dominance", exist_ok=True)

for r in ranges:
    file_path = f"dom_{r}.csv"
    val = get_val
    # Only create the file if it doesn't already exist
    if not os.path.exists(file_path):
        # Create a basic DataFrame structure
        df = pd.DataFrame(columns=["Utterance", "Emotion", "Sentiment", "Dominance"])
        df.to_csv(file_path, index=False)
        print(f"Created: {file_path}")
    else:
        print(f"Already exists: {file_path}")

Created: dom_0-1.csv
Created: dom_-1-0.csv


In [118]:
for index, row in dialogues.iterrows():
    dom = get_dom(row["Utterance"])
    new_row = [row["Utterance"], row["Emotion"], row["Sentiment"], dom]
    
    d_cat = ""
    
    if dom > 0.0:
        d_cat = "0-1"
    else:
        d_cat = "-1-0"
    
    fname = "dom_" + d_cat + ".csv"
    with open(fname, "a", newline="") as f:  
        writer = csv.writer(f)
        writer.writerow(new_row)

In [119]:
for r in ranges:
    fname = "dom_" + r + ".csv"
    df = pd.read_csv(fname)
    df_cleaned = df.drop_duplicates()
    df_cleaned.to_csv(fname, index=False)

In [120]:
test = pd.read_csv("dom_0-1.csv") 
test.head()

Unnamed: 0,Utterance,Emotion,Sentiment,Dominance
0,"Come on, Lydia, you can do it.",neutral,neutral,0.021714
1,Push!,joy,positive,0.048
2,"Push 'em out, push 'em out, harder, harder.",joy,positive,0.0035
3,"Let's— I was just—yeah, right.",joy,positive,0.07
4,"Ross, didn't you say that there was an elevato...",neutral,neutral,0.041727
