## First models

### Imports

In [47]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
import time

import ast
from collections import Counter

from nrclex import NRCLex

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bapti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bapti\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bapti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [39]:
df_train = pd.read_csv("data/train.txt", names=["text", "emotion"], sep=";")
df_test = pd.read_csv("data/test.txt", names=["text", "emotion"], sep=";")
df_train.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


### Data prep

In [40]:
# load stopwords
sw_file = open('misc/stopwords.txt', "r")
try :
    content = sw_file.read()
    stopwords = ast.literal_eval(content)
finally:
    sw_file.close()
    
# separate text and labels
X, y = list(df_train['text']), list(df_train['emotion'])

# Turn labels to numeric
emotion_list = list(df_train['emotion'].unique())
y = [emotion_list.index(em) for em in y]

In [62]:

def preprocess(text_lst, sw=False, stem=False, lem=True):
    """
    Apply different preprocessing.
    returns :
        - list of texts preprocessed
    params : 
        - list(str) text_lst : list of text to preprocess
        - bool sw : enable to remove stopwords
        - bool stem : enable to stem text
        - bool lem : enable to lemmatize text
    """
    time_start = time.time()
    if sw :
        text_lst = [' '.join([word for word in x.split() if word not in (stopwords)]) for x in text_lst]
    
    stemmer = SnowballStemmer('english')
    lem = WordNetLemmatizer()
    
    if stem and lem :
        text_lst = [' '.join([stemmer.stem(lem.lemmatize(word)) for word in x.split()]) for x in text_lst]
    
    elif stem :
        text_lst = [' '.join([stemmer.stem(word) for word in x.split()]) for x in text_lst]
        
    elif lem :
        text_lst = [' '.join([lem.lemmatize(word) for word in x.split()]) for x in text_lst]
    
    print(f'Time elapsed : {round(time.time() - time_start, 2)} s')
    
    return text_lst

#X_preproc = preprocess(X, sw=False, stem=True, lem=True)

### Model

In [63]:
# testing function
def test_results():
    pass


#### #1 Algorithm without training on data

In [57]:
# simply lemma applied
X_preproc = preprocess(X, sw=False, stem=False, lem=True)

print(X_preproc[2])
text_obj = NRCLex(X_preproc[0])
print(text_obj.raw_emotion_scores)

Time elapsed : 0.85 s
im grabbing a minute to post i feel greedy wrong
{}
