In [154]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Data preprocessing

##### Shared - 12/07/2023 (Seoyoung Kim)

### 1.1 Load modules and dataset

In [155]:
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [156]:
# !pip install contractions



In [157]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sn
from wordcloud import WordCloud
import math

## for feature extraction
import re
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from gensim.models import Word2Vec

## for data model
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [159]:
json_file_path = '/content/drive/MyDrive/Colab Notebooks/CIS511/dataset/text/Subtask_2_train.json'

with open(json_file_path, 'r') as json_file:
    data = json.load(json_file)
#print(data)

In [306]:
# Create a DataFrame
df_list = []

for conversation in data:
    for utterance in conversation["conversation"]:
        df_list.append({
            "conversation_ID": conversation["conversation_ID"],
            "utterance_ID": utterance["utterance_ID"],
            "speaker": utterance["speaker"],
            "emotion": utterance["emotion"],
            "text": utterance["text"],
            "video_name": utterance["video_name"]
        })

df = pd.DataFrame(df_list)
df.head()

Unnamed: 0,conversation_ID,utterance_ID,speaker,emotion,text,video_name
0,1,1,Chandler,neutral,"Alright , so I am back in high school , I am s...",dia1utt1.mp4
1,1,2,All,neutral,"Oh , yeah . Had that dream .",dia1utt2.mp4
2,1,3,Chandler,surprise,"Then I look down , and I realize there is a ph...",dia1utt3.mp4
3,1,4,Joey,surprise,Instead of ... ?,dia1utt4.mp4
4,1,5,Chandler,anger,That is right .,dia1utt5.mp4


In [307]:
df.shape

(13619, 6)

### 2.1 Data Cleaning

In [311]:
def clean_text(text): # Remove URLs & mentions
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    return text

def expand_contractions(text): # Expand contractions in text processing
    expanded_words = []
    for tx in text.split():
        #if tx != contractions.fix(tx): print(tx, contractions.fix(tx))
        expanded_words.append(contractions.fix(tx))
    new_text = ' '.join(expanded_words)
    return new_text

def remove_non_alpha(new_text):  # Extract only alphabets
    new_text = re.sub(r'[^a-zA-Z0-9\s]', '', new_text.lower())
    return new_text

def remove_stopwords(new_text): # Removing Stopwords
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in new_text.split() if token not in stop_words)

def word_lemma(new_text): # Remmatizer
    return " ".join([WordNetLemmatizer().lemmatize(token) for token in new_text.split()])

def tokenizer(new_text):
    return [token for token in new_text.split()]

def create_label(emotion): # Replace emotions to label
    emotion_label = {}
    for idx, val in enumerate(df.emotion.unique()):
        emotion_label[val] = idx
    return emotion_label[emotion]

def drop_column(df, column): # Drop column
    df = df.drop([column], axis=1)
    return df

df['text'] = df['text'].apply(clean_text)
df['features'] = df['text'].apply(expand_contractions)
df['features'] = df['features'].apply(remove_non_alpha)
df['features2'] = df['features'].apply(remove_stopwords)
df['features3'] = df['features2'].apply(word_lemma)
df['tokens'] = df['features'].apply(tokenizer)
df['tokens2'] = df['features3'].apply(tokenizer)
df['label'] = df['emotion'].apply(create_label)
df['length'] = df['features'].str.split().apply(len)
df['length2'] = df['features3'].str.split().apply(len)
df.head()

Unnamed: 0,conversation_ID,utterance_ID,speaker,emotion,text,video_name,features,features2,features3,tokens,tokens2,label,length,length2
0,1,1,Chandler,neutral,"Alright , so I am back in high school , I am s...",dia1utt1.mp4,alright so i am back in high school i am sta...,alright back high school standing middle cafet...,alright back high school standing middle cafet...,"[alright, so, i, am, back, in, high, school, i...","[alright, back, high, school, standing, middle...",0,24,10
1,1,2,All,neutral,"Oh , yeah . Had that dream .",dia1utt2.mp4,oh yeah had that dream,oh yeah dream,oh yeah dream,"[oh, yeah, had, that, dream]","[oh, yeah, dream]",0,5,3
2,1,3,Chandler,surprise,"Then I look down , and I realize there is a ph...",dia1utt3.mp4,then i look down and i realize there is a pho...,look realize phone,look realize phone,"[then, i, look, down, and, i, realize, there, ...","[look, realize, phone]",1,12,3
3,1,4,Joey,surprise,Instead of ... ?,dia1utt4.mp4,instead of,instead,instead,"[instead, of]",[instead],1,2,1
4,1,5,Chandler,anger,That is right .,dia1utt5.mp4,that is right,right,right,"[that, is, right]",[right],2,3,1


### 2.2 Feature Extraction

In [206]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [355]:
## df2 length >= 5 using feature
df2 = df[df['length'] >= 5]
df2 = df2.reset_index(drop=True)

## df3 length2 >= 5 using feature3
df3 = df[df['length2'] >= 5]
df3 = df3.reset_index(drop=True)

In [356]:
print(df.shape, df2.shape, df3.shape)

(13619, 17) (9312, 17) (4766, 17)


#### 2.2.1 Sentiment PMI

In [362]:
## PMI

## case 1 = no constraint
## case 2 = length >= 5 / df2['features']
## case 3 = lemma and length >=5 / df3['features3'] / df['tokens2']

# p(emotion)
total_emo = df3.shape[0]
emotion_prob = Counter()
for idx, val in zip(df3['emotion'].unique(), df3['emotion'].value_counts()):
    emotion_prob[idx] = val/total_emo
print(emotion_prob['joy'])

# c(word, emotion)
word_emotion_cooccurrence = Counter()
for i in range(df3.shape[0]):
    tokens = df3['tokens2'][i]
    emo = df3['emotion'][i]
    for token in tokens:
      word_emotion_cooccurrence[(token, emo)] += 1
print(len(word_emotion_cooccurrence))

# Calcluate pmi
total_word = sum(df3['length2'])
pmis = []
for i in range(df3.shape[0]):
    pmi = []
    emotion = df3[['emotion', 'tokens2']].loc[i][0]
    prob_e = emotion_prob[idx] # p(emotion)
    for word in df3[['emotion', 'tokens2']].loc[i][1]:
        word_emotion = word_emotion_cooccurrence[(word, emotion)]
        prob_w_e = word_emotion/len(word_emotion_cooccurrence)

        n_word = sum(word_emotion_cooccurrence[(idx, val)] for idx, val in word_emotion_cooccurrence if word in idx)
        prob_w = n_word / total_word # p(word)

        score = math.log2(prob_w_e) - (math.log2(prob_w) + math.log2(prob_e))
        pmi.append(score)

    pmis.append(pmi)

df3['pmi'] = np.array(pmis)
df3.head(1)

0.13680234997901805
9760


  df3['pmi'] = np.array(pmis)


Unnamed: 0,conversation_ID,utterance_ID,speaker,emotion,text,video_name,features,features2,features3,tokens,tokens2,label,length,length2,word2vec,pos,pmi
0,1,1,Chandler,neutral,"Alright , so I am back in high school , I am s...",dia1utt1.mp4,alright so i am back in high school i am sta...,alright back high school standing middle cafet...,alright back high school standing middle cafet...,"[alright, so, i, am, back, in, high, school, i...","[alright, back, high, school, standing, middle...",0,24,10,"[[-0.08596851, 0.14384215, 0.06965853, 0.04791...","[(alright, RB), (so, RB), (i, JJ), (am, VBP), ...","[5.837427643815923, 5.653715690643536, 5.92489..."


In [363]:
X = df3['pmi'].apply(lambda x: ' '.join(map(str, x))).tolist()
X_train, X_test, y_train, y_test = train_test_split(X, df3['label'], test_size=0.2, random_state=123)

test_model = [MultinomialNB(), RandomForestClassifier(), DecisionTreeClassifier(),
              LogisticRegression(solver='saga'), SVC(), KNeighborsClassifier(), SGDClassifier(), XGBClassifier()]

file = pd.DataFrame()
pmi_results = pd.DataFrame()

for val in test_model:
    pipe = make_pipeline(CountVectorizer(), val)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    model = list(pipe.named_steps)
    result = classification_report(y_pred, y_test, output_dict=True)
    df_results = pd.DataFrame(result)
    df_results['model'] = model[1]
    file = pd.concat([file, df_results], axis=0)

pmi_results = pd.concat([pmi_results, file[file.index == 'f1-score']], axis=0)
pmi_results



Unnamed: 0,0,1,2,3,4,5,6,accuracy,macro avg,weighted avg,model
f1-score,0.986982,0.957055,0.959108,0.978495,0.977636,0.891566,0.816327,0.969602,0.938167,0.969893,multinomialnb
f1-score,0.939462,0.9125,0.87747,0.924731,0.929293,0.888889,0.75,0.919287,0.888906,0.920833,randomforestclassifier
f1-score,0.927835,0.825806,0.843284,0.909091,0.883117,0.777778,0.533333,0.883648,0.814321,0.886896,decisiontreeclassifier
f1-score,0.973099,0.944099,0.932836,0.956522,0.938907,0.897436,0.862745,0.951782,0.929378,0.952144,logisticregression
f1-score,0.928177,0.900662,0.911197,0.920455,0.925081,0.794118,0.666667,0.91195,0.863765,0.914753,svc
f1-score,0.806673,0.657895,0.606061,0.604027,0.699248,0.448276,0.242424,0.719078,0.580658,0.740831,kneighborsclassifier
f1-score,0.96977,0.9125,0.903226,0.951872,0.946708,0.809524,0.846154,0.939203,0.905679,0.938481,sgdclassifier
f1-score,0.947727,0.918239,0.916996,0.956522,0.940789,0.873239,0.807018,0.933962,0.908647,0.934437,xgbclassifier


#### 2.2.2 Parts of Speech Tagging

In [339]:
# Part of Speech (POS)

## case 1 = no constraint
## case 2 = length >= 5 / df2['features']
## case 3 = lemma and length >=5 / df3['features3'] / df['tokens2']

df3['pos'] = [nltk.pos_tag(token) for token in df3['tokens2']]
df3.head(1)

Unnamed: 0,conversation_ID,utterance_ID,speaker,emotion,text,video_name,features,features2,features3,tokens,tokens2,label,length,length2,pos
0,1,1,Chandler,neutral,"Alright , so I am back in high school , I am s...",dia1utt1.mp4,alright so i am back in high school i am sta...,alright back high school standing middle cafet...,alright back high school standing middle cafet...,"[alright, so, i, am, back, in, high, school, i...","[alright, back, high, school, standing, middle...",0,24,10,"[(alright, RB), (back, RB), (high, JJ), (schoo..."


In [340]:
# Imbalanced label distribution / oversampleing
length = df3['label'].value_counts()
train_oversampled = df3[df3['label'] == length.index[0]]

for i in range(0, 7):
    if i != length.index[0]:
        oversample = df3[df3['label'] == i].sample(length[length.index[0]], replace=True, random_state=123)
        train_oversampled = pd.concat([oversample, train_oversampled], axis=0)

print(train_oversampled['label'].value_counts())
train_oversampled = train_oversampled.reset_index(drop=True)
# train_oversampled.head()

X = train_oversampled['pos'].apply(lambda x: ' '.join(map(str, x))).tolist()
X_train, X_test, y_train, y_test = train_test_split(X, train_oversampled['label'], test_size=0.2)

test_model = [MultinomialNB(), RandomForestClassifier(), DecisionTreeClassifier(),
              LogisticRegression(solver='saga'), SVC(), KNeighborsClassifier(), SGDClassifier(), XGBClassifier()]

file = pd.DataFrame()
pos_results = pd.DataFrame()

for val in test_model:
    # Choose a model from list
    pipe = make_pipeline(CountVectorizer(), val)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    model = list(pipe.named_steps)
    result = classification_report(y_pred, y_test, output_dict=True)
    df_results = pd.DataFrame(result)
    df_results['model'] = model[1]
    file = pd.concat([file, df_results], axis=0)

pos_results = pd.concat([pos_results, file[file.index == 'f1-score']], axis=0)
pos_results

6    2054
5    2054
4    2054
3    2054
2    2054
1    2054
0    2054
Name: label, dtype: int64




Unnamed: 0,0,1,2,3,4,5,6,accuracy,macro avg,weighted avg,model
f1-score,0.428094,0.780488,0.669039,0.675545,0.690562,0.84271,0.840611,0.718359,0.703864,0.731245,multinomialnb
f1-score,0.828897,0.986207,0.937799,0.968789,0.904594,1.0,0.993865,0.946106,0.945736,0.946017,randomforestclassifier
f1-score,0.634526,0.964045,0.899772,0.936068,0.869965,0.977778,0.972389,0.902295,0.893506,0.909377,decisiontreeclassifier
f1-score,0.549689,0.892412,0.790267,0.820513,0.775087,0.951574,0.945882,0.827191,0.817918,0.835155,logisticregression
f1-score,0.591284,0.894309,0.747596,0.812739,0.741935,0.981273,0.99022,0.821627,0.822765,0.820131,svc
f1-score,0.366771,0.813953,0.658371,0.773562,0.63807,0.912892,0.94186,0.745828,0.729354,0.761262,kneighborsclassifier
f1-score,0.588921,0.915888,0.815851,0.855037,0.805017,0.9694,0.959716,0.850487,0.844261,0.855466,sgdclassifier
f1-score,0.548387,0.869364,0.697385,0.7925,0.7162,0.954823,0.959427,0.793115,0.791155,0.794692,xgbclassifier


#### 2.2.3 Word2Vec

In [None]:
# Word Embedding (Word2Vec)
tokens = df['tokens']
word2vecmodel = Word2Vec(tokens, vector_size=100, window=5, min_count=1, workers=4)

word_embeddings = []
for sentence in tokens:
  tmp = []
  for word in sentence:
    if word in word2vecmodel.wv:
      tmp.append(word2vecmodel.wv[word])
  word_embeddings.append(tmp)
print(len(word_embeddings), df.shape)
df['word2vec'] = np.array(word_embeddings)
df.head()

In [None]:
word2vec_results = pd.DataFrame()
file = pd.DataFrame()

## case 1 = no constraint
## case 2 = length >= 5 / df2['features']
## case 3 = lemma and length >=5 / df3['features3']

# length = df['label'].value_counts()
# train_oversampled = df[df['label'] == length.index[0]]

# for i in range(0, 7):
#     if i != length.index[0]:
#         oversample = df[df['label'] == i].sample(length[length.index[0]], replace=True, random_state=123)
#         train_oversampled = pd.concat([oversample, train_oversampled], axis=0)

# print(train_oversampled['label'].value_counts())
# train_oversampled = train_oversampled.reset_index(drop=True)
# train_oversampled.head()

X = df['word2vec'].apply(lambda x: ' '.join(map(str, x))).tolist()
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2)

test_model = [MultinomialNB(), RandomForestClassifier(), DecisionTreeClassifier(),
              LogisticRegression(solver='saga'), SVC(), KNeighborsClassifier(), SGDClassifier(), XGBClassifier()]


for val in test_model:
    pipe = make_pipeline(CountVectorizer(), val)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    model = list(pipe.named_steps)
    result = classification_report(y_pred, y_test, output_dict=True)
    df_results = pd.DataFrame(result)
    df_results['model'] = model[1]
    file = pd.concat([file, df_results], axis=0)

word2vec_results = pd.concat([word2vec_results, file[file.index == 'f1-score']], axis=0)
word2vec_results

#### 2.2.4 N-gram

In [320]:
ALL_Cases_results = pd.DataFrame()

## case 1 = no constraint
## case 2 = length >= 5 / df2['features']
## case 3 = lemma and length >=5 / df3['features3']

length = df3['label'].value_counts()
train_oversampled = df3[df3['label'] == length.index[0]]

for i in range(0, 7):
    if i != length.index[0]:
        oversample = df3[df3['label'] == i].sample(length[length.index[0]], replace=True, random_state=123)
        train_oversampled = pd.concat([oversample, train_oversampled], axis=0)

print(train_oversampled['label'].value_counts())
train_oversampled = train_oversampled.reset_index(drop=True)
train_oversampled.head()

X_train, X_test, y_train, y_test = train_test_split(train_oversampled['features3'], train_oversampled['label'], test_size=0.2)
test_model = [MultinomialNB(), RandomForestClassifier(), DecisionTreeClassifier(),
              LogisticRegression(solver='saga'), SVC(), KNeighborsClassifier(), SGDClassifier(), XGBClassifier()]
file = pd.DataFrame()

for val in test_model:
    pipe = make_pipeline(CountVectorizer(ngram_range=(1, 2)), val)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    model = list(pipe.named_steps)
    result = classification_report(y_pred, y_test, output_dict=True)
    df_results = pd.DataFrame(result)
    df_results['model'] = model[1]
    file = pd.concat([file, df_results], axis=0)
    total_result6 = file

ALL_Cases_results = pd.concat([ALL_Cases_results, file[file.index == 'f1-score']], axis=0)
ALL_Cases_results

6    2054
5    2054
4    2054
3    2054
2    2054
1    2054
0    2054
Name: label, dtype: int64




Unnamed: 0,0,1,2,3,4,5,6,accuracy,macro avg,weighted avg,model
f1-score,0.485816,0.924107,0.868421,0.874305,0.854839,0.939675,0.933495,0.858484,0.840094,0.877366,multinomialnb
f1-score,0.843713,0.994125,0.943937,0.971154,0.905312,1.0,0.996129,0.949583,0.950624,0.948972,randomforestclassifier
f1-score,0.657534,0.970252,0.871006,0.937931,0.893378,0.971223,0.988476,0.905772,0.898543,0.913316,decisiontreeclassifier
f1-score,0.73029,0.970115,0.902589,0.931475,0.894915,0.989011,0.985951,0.917942,0.914907,0.92146,logisticregression
f1-score,0.792325,0.974178,0.932292,0.951574,0.893668,1.0,0.998706,0.932545,0.934678,0.931067,svc
f1-score,0.286996,0.895833,0.572402,0.824769,0.616794,0.987805,0.951911,0.738873,0.733787,0.745155,kneighborsclassifier
f1-score,0.765789,0.97907,0.911801,0.950237,0.886133,0.992647,0.989744,0.926287,0.92506,0.928008,sgdclassifier
f1-score,0.507674,0.840336,0.672634,0.768856,0.695122,0.939787,0.963795,0.769471,0.769744,0.770878,xgbclassifier
