In [1]:
import numpy as np
import pandas as pd

# Dataset

In [2]:
test_data = pd.read_csv('Dataset/test.txt', header=None, sep=";", names=["Comment","Emotion"], encoding="utf-8")
train_data = pd.read_csv('Dataset/train.txt', header=None, sep=";", names=["Comment","Emotion"], encoding="utf-8")
validation_data = pd.read_csv('Dataset/val.txt', header=None, sep=";", names=["Comment","Emotion"], encoding="utf-8")

In [3]:
print("Train : ", train_data.shape)
print("Test : ", test_data.shape)
print("Validation : ", validation_data.shape)

Train :  (16000, 2)
Test :  (2000, 2)
Validation :  (2000, 2)


# EDA

In [4]:
import plotly.express as px
import plotly.figure_factory as ff

In [5]:
train_data['length'] = train_data.apply(lambda row: len(row['Comment']), axis=1)

In [6]:
train_data.head()

Unnamed: 0,Comment,Emotion,length
0,i didnt feel humiliated,sadness,23
1,i can go from feeling so hopeless to so damned...,sadness,108
2,im grabbing a minute to post i feel greedy wrong,anger,48
3,i am ever feeling nostalgic about the fireplac...,love,92
4,i am feeling grouchy,anger,20


In [7]:
fig = px.bar(train_data.groupby(['Emotion']).agg('count').reset_index(), x='Emotion', y='Comment', text='Comment')
fig.update_layout(yaxis_title="Count")

In [8]:
emotions = ['anger','fear', 'joy', 'love', 'sadness', 'surprise']
fig = ff.create_distplot([train_data[train_data.Emotion == e].length for e in emotions], emotions)
fig.show()

# Data Processing
After sampling some rows, I noticed that the dataset is cleaned. All I need to do is lemmatize it and perhaps remove stopwords.

In [9]:
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [10]:
contractions = {
"im": "i am",
"didnt": "did not",
"ive": "i have"
}

In [11]:
def contract(x):
    for i in x.split():
        if i in contractions:
            x = x.replace(i, contractions[i])
    return x

In [12]:
train_data['Comment'] = train_data.apply(lambda row: contract(row['Comment']), axis = 1)
test_data['Comment'] = test_data.apply(lambda row: contract(row['Comment']), axis = 1)
validation_data['Comment'] = validation_data.apply(lambda row: contract(row['Comment']), axis = 1)

In [13]:
lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))

def get_clean(x, w2v=False):
    tokenized_text = nltk.word_tokenize(x)
    new_tokenized = []
    for i in tokenized_text:
        if w2v:
            new_tokenized.append(lemmatizer.lemmatize(i))
        else:
            if i not in stopwords and len(i) != 1:
                new_tokenized.append(lemmatizer.lemmatize(i))
    return(' '.join(new_tokenized))

In [14]:
train_data['clean_comment'] = train_data.apply(lambda row: get_clean(row['Comment']), axis = 1)
test_data['clean_comment'] = test_data.apply(lambda row: get_clean(row['Comment']), axis = 1)
validation_data['clean_comment'] = validation_data.apply(lambda row: get_clean(row['Comment']), axis = 1)

train_data['w2v_comment'] = train_data.apply(lambda row: get_clean(row['Comment'], w2v=True), axis = 1)
test_data['w2v_comment'] = test_data.apply(lambda row: get_clean(row['Comment'], w2v=True), axis = 1)
validation_data['w2v_comment'] = validation_data.apply(lambda row: get_clean(row['Comment'], w2v=True), axis = 1)

In [16]:
def labeling(row):
    if row['Emotion'] == 'joy':
        return 0
    if row['Emotion'] == 'anger':
        return 1
    if row['Emotion'] == 'love':
        return 2
    if row['Emotion'] == 'sadness':
        return 3
    if row['Emotion'] == 'fear':
        return 4
    if row['Emotion'] == 'surprise':
        return 5

In [17]:
train_data['y'] = train_data.apply(labeling, axis = 1)
test_data['y'] = test_data.apply(labeling, axis = 1)
validation_data['y'] = validation_data.apply(labeling, axis = 1)

In [18]:
train_data.head()

Unnamed: 0,Comment,Emotion,length,clean_comment,w2v_comment,y
0,i did not feel humiliated,sadness,23,feel humiliated,i did not feel humiliated,3
1,i can go from feeling so hopeless to so damned...,sadness,108,go feeling hopeless damned hopeful around some...,i can go from feeling so hopeless to so damned...,3
2,i am grabbing a minute to post i feel greedy w...,anger,48,grabbing minute post feel greedy wrong,i am grabbing a minute to post i feel greedy w...,1
3,i am ever feeling nostalgic about the fireplac...,love,92,ever feeling nostalgic fireplace know still pr...,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,anger,20,feeling grouchy,i am feeling grouchy,1


# Word Embedding
- TF-IDF
- Word2Vec

In [19]:
import umap.umap_ as umap

## TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tf_idf = TfidfVectorizer(min_df=3)
tf_idf_matrix = tf_idf.fit_transform(train_data.clean_comment)
tf_idf_valid = tf_idf.transform(validation_data.clean_comment)
tf_idf_test = tf_idf.transform(test_data.clean_comment)
print("Matrix shape:", tf_idf_matrix.shape)
print("Type:", type(tf_idf_matrix))

Matrix shape: (16000, 4703)
Type: <class 'scipy.sparse.csr.csr_matrix'>


In [24]:
tf_idf_umap = umap.UMAP().fit_transform(tf_idf_matrix, train_data.y)
plot = pd.DataFrame(tf_idf_umap)
plot.columns = ['UMAP1', 'UMAP2']
plot['labels'] = train_data.Emotion
fig = px.scatter(plot, x='UMAP1', y='UMAP2', color='labels', title='TF-IDF Word Embedding')
fig.show()

## word2vec

In [26]:
from gensim.models import Word2Vec

In [27]:
list_of_comments = [text.split() for text in train_data.w2v_comment]

In [28]:
w2v_model = Word2Vec(list_of_comments, vector_size=200, window=5, min_count=3, sg=1, hs=0, negative=10, ns_exponent=0.75)
vocab = w2v_model.wv.index_to_key
print("Vocab size:", len(vocab))

Vocab size: 4925


In [34]:
w2v_model.wv.most_similar('love', topn=10)

[('understanding', 0.8386687636375427),
 ('loving', 0.8330996632575989),
 ('trusting', 0.8310558199882507),
 ('treated', 0.8269306421279907),
 ('divine', 0.82631516456604),
 ('connection', 0.8216210603713989),
 ('laugh', 0.8188863396644592),
 ('duty', 0.8180804252624512),
 ('joy', 0.8179502487182617),
 ('loved', 0.8161572813987732)]

In [35]:
mean_w2v = np.array([np.mean([w2v_model.wv[w] for w in words if w in vocab] or [np.zeros(200)], axis = 0) for words in list_of_comments])
print("w2v matrix shape:", mean_w2v.shape)
print("w2v type:", type(mean_w2v))

w2v matrix shape: (16000, 200)
w2v type: <class 'numpy.ndarray'>


In [36]:
w2v_umap = umap.UMAP().fit_transform(pd.DataFrame(mean_w2v), train_data.y)
plot = pd.DataFrame(w2v_umap)
plot.columns = ['UMAP1', 'UMAP2']
plot['labels'] = train_data.Emotion
fig = px.scatter(plot, x='UMAP1', y='UMAP2', color='labels', title='Word2Vec Word Embedding')
fig.show()

# Modelling
- Logistic Regression
- K-NN
- Random Forest Classifier
- AdaBoost Classifier

In [45]:
from sklearn.metrics import classification_report

In [41]:
y_train = train_data.y
y_valid = validation_data.y
y_test = test_data.y

## Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

In [48]:
LR = LogisticRegression(max_iter=10000)
LR.fit(tf_idf_matrix, y_train)
print(classification_report(y_valid, LR.predict(tf_idf_valid)))

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       704
           1       0.90      0.83      0.86       275
           2       0.87      0.74      0.80       178
           3       0.87      0.94      0.91       550
           4       0.86      0.76      0.81       212
           5       0.88      0.60      0.72        81

    accuracy                           0.88      2000
   macro avg       0.88      0.80      0.83      2000
weighted avg       0.88      0.88      0.87      2000



## K-NN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [56]:
KNN = KNeighborsClassifier(n_neighbors=30)
KNN.fit(tf_idf_matrix, y_train)
print(classification_report(y_valid, KNN.predict(tf_idf_valid)))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       704
           1       0.93      0.76      0.84       275
           2       0.93      0.59      0.72       178
           3       0.81      0.93      0.87       550
           4       0.86      0.73      0.79       212
           5       0.90      0.56      0.69        81

    accuracy                           0.85      2000
   macro avg       0.88      0.75      0.80      2000
weighted avg       0.85      0.85      0.84      2000

