In [69]:
import os
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sklearn
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import en_core_web_md

In [70]:
def load_data(file, char, arr, encoding="utf-8"):
    """Append the first two ``char``-separated fields of every line in ``file`` to ``arr``.

    Each line that contains ``char`` is split on it and stored as a
    two-element list ``[first_field, second_field]``; the trailing newline is
    removed from the second field and any further fields are ignored.
    Blank lines (for example a trailing empty line at the end of the file) are
    skipped instead of crashing the load.

    Parameters
    ----------
    file : str or path-like
        Text file to read.
    char : str
        Separator used to split each line.
    arr : list
        List that is extended in place with one ``[first, second]`` entry per line.
    encoding : str, optional
        Encoding used to open the file. Defaults to UTF-8 so the result does
        not depend on the platform's locale encoding.

    Returns
    -------
    None
        The rows are delivered through ``arr``.

    Raises
    ------
    ValueError
        If a non-blank line does not contain ``char``.
    """
    with open(file, encoding=encoding) as f:
        for line_number, line in enumerate(f, start=1):
            segments = line.split(char)
            if len(segments) < 2:
                if not line.strip():
                    # Nothing but whitespace on this line: ignore it.
                    continue
                raise ValueError(
                    f"{file}: line {line_number} does not contain the separator {char!r}"
                )
            # Only the last field of a line can carry the newline; cut it off.
            arr.append([segments[0], segments[1].split('\n')[0]])
    return None

In [71]:
def train_logistic_regression(X_train, y_train, X_test, y_test):
    """Score a fixed logistic-regression configuration with 10-fold cross-validation.

    The classifier uses an L2 penalty with ``C=0.1``, the ``lbfgs`` solver and
    ``multi_class="ovr"``. Only the training data is used.

    Parameters
    ----------
    X_train, y_train
        Features and labels that are cross-validated.
    X_test, y_test
        Accepted but not used by this function.
        TODO: evaluate the best performing model on the testing set.

    Returns
    -------
    dict
        The result of ``cross_validate`` with ``return_train_score=True``.
    """
    classifier = LogisticRegression(
        penalty='l2',
        tol=0.0001,
        C=0.1,
        class_weight=None,
        random_state=123,
        solver='lbfgs',
        max_iter=10000,
        n_jobs=4,
        multi_class="ovr",
    )
    return cross_validate(classifier, X_train, y_train, cv=10, return_train_score=True)

def train_rf(X_train, y_train, X_test, y_test, param_grid=None, inner_splits=20, outer_splits=10):
    """Score a random forest with nested cross-validation on the training set.

    An inner ``GridSearchCV`` tunes every parameter in ``param_grid`` (by
    default both ``n_estimators`` and ``max_depth``), and an outer
    ``cross_validate`` scores that tuned search on held-out folds.

    With the default settings the inner search alone performs
    20 candidates x 20 inner folds x 10 outer folds = 4,000 forest fits, so the
    call is slow. Pass a smaller ``param_grid`` or fewer splits for a quicker run.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for the nested cross-validation.
    X_test, y_test
        Accepted but not used by this function.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. Defaults to four ``n_estimators``
        values and five ``max_depth`` values.
    inner_splits : int, optional
        Number of folds of the inner (hyper-parameter search) ``KFold``.
    outer_splits : int, optional
        Number of folds of the outer (scoring) ``KFold``.

    Returns
    -------
    dict
        The result of ``cross_validate`` with ``return_train_score=True``.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [25, 50, 75, 100],
            'max_depth': [2, 6, 12, 24, 48]
        }

    # Both splitters shuffle with a fixed seed so repeated runs use the same folds.
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=123)
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=123)

    rf = RandomForestClassifier(n_jobs=4, random_state=123)

    # The grid search is itself the estimator that the outer loop scores.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=inner_cv)
    training_score = cross_validate(grid_search, X_train, y_train, cv=outer_cv, return_train_score=True)
    return training_score

In [72]:
# spaCy pipeline shared by preprocess_data below.
nlp = spacy.load("en_core_web_md")
def preprocess_data(data, df, vectorizer=None):
    """Turn the ``"entries"`` column of ``df`` into a bag-of-words count matrix.

    Every entry is run through the spaCy pipeline ``nlp`` and reduced by
    ``tokenize``; the cleaned strings are stored in a new
    ``df["tokenized_entries"]`` column (``df`` is modified in place) and then
    vectorised with a ``CountVectorizer``.

    To obtain train and test matrices that share one vocabulary (same columns),
    create a single vectorizer and pass it to both calls, training data first::

        vec = CountVectorizer(stop_words="english")
        X_train_transformed = preprocess_data(X_train["entries"], train_df, vec)
        X_test_transformed = preprocess_data(X_test["entries"], test_df, vec)

    Parameters
    ----------
    data
        Not used; the text is always read from ``df["entries"]``.
    df : pandas.DataFrame
        Frame with an ``"entries"`` column; gains a ``"tokenized_entries"`` column.
    vectorizer : CountVectorizer, optional
        Vectorizer to use. An unfitted one is fitted on this data; an already
        fitted one is only applied, so its vocabulary is reused. When omitted,
        a new ``CountVectorizer(stop_words="english")`` is fitted on this call
        alone, which gives each call its own vocabulary.

    Returns
    -------
    scipy sparse matrix
        Token counts, one row per entry of ``df``.
    """
    #cleans text by tokenizing and preprocessing it
    df["tokenized_entries"] = [tokenize(e) for e in nlp.pipe(df["entries"])]

    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words="english")

    # A fitted CountVectorizer exposes ``vocabulary_``; refitting it here would
    # silently replace the vocabulary learned from the training data.
    if hasattr(vectorizer, "vocabulary_"):
        return vectorizer.transform(df["tokenized_entries"])
    return vectorizer.fit_transform(df["tokenized_entries"])

def tokenize(text):
    """Collapse an iterable of tokens into a space-separated string of lowercase lemmas.

    ``text`` is iterated token by token; each token must provide ``is_stop``,
    ``pos_``, ``lemma_`` and ``len()``. A token contributes its lowercased
    lemma only when all of the following hold:

    * it is not a stop word,
    * it is at least two characters long,
    * its part-of-speech tag is not one of the excluded tags.

    Returns an empty string when no token qualifies.
    """
    excluded_pos = ("PUNCT", "PART", "ADJ", "PRON", "DET", "ADP", "SPACE", "SYM", "X")
    kept_lemmas = (
        token.lemma_.lower()
        for token in text
        if not token.is_stop
        and len(token) >= 2
        and token.pos_ not in excluded_pos
    )
    return " ".join(kept_lemmas)


In [73]:
# Read the raw files (one "<text>;<label>" pair per line).
# The validation file is appended to the training rows; only test.txt feeds the test set.
train_data = []
load_data("./emotions/train.txt", ';', train_data)
load_data("./emotions/val.txt", ';', train_data)

test_data = []
load_data("./emotions/test.txt", ";", test_data)

# One frame per split: the text in "entries", the label in "emotions".
train_df = pd.DataFrame(train_data, columns=["entries", "emotions"])
test_df = pd.DataFrame(test_data, columns=["entries", "emotions"])

# Labels are the "emotions" column; features are everything else.
y_train = train_df["emotions"]
y_test = test_df["emotions"]
X_train = train_df.drop(columns="emotions")
X_test = test_df.drop(columns="emotions")

In [74]:
# Show the training features: train_df without its "emotions" column.
X_train

Unnamed: 0,entries
0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy
...,...
17995,im having ssa examination tomorrow in the morn...
17996,i constantly worry about their fight against n...
17997,i feel its important to share this info for th...
17998,i truly feel that if you are passionate enough...


In [75]:
# Show the test features: test_df without its "emotions" column.
X_test

Unnamed: 0,entries
0,im feeling rather rotten so im not very ambiti...
1,im updating my blog because i feel shitty
2,i never make her separate from me because i do...
3,i left with my bouquet of red and yellow tulip...
4,i was feeling a little vain when i did this one
...,...
1995,i just keep feeling like someone is being unki...
1996,im feeling a little cranky negative after this...
1997,i feel that i am useful to my people and that ...
1998,im feeling more comfortable with derby i feel ...


In [76]:
# Show the training labels (the "emotions" column of train_df).
y_train

0        sadness
1        sadness
2          anger
3           love
4          anger
          ...   
17995    sadness
17996        joy
17997        joy
17998        joy
17999        joy
Name: emotions, Length: 18000, dtype: object

In [77]:
# Show the test labels (the "emotions" column of test_df).
y_test

0       sadness
1       sadness
2       sadness
3           joy
4       sadness
         ...   
1995      anger
1996      anger
1997        joy
1998        joy
1999       fear
Name: emotions, Length: 2000, dtype: object

In [78]:
# Number of distinct labels across BOTH splits. Taking the larger of the two
# per-split counts would undercount whenever each split contains a label the
# other one lacks, so the unique labels are counted over the combined series.
num_classes = len(pd.concat([y_train, y_test]).unique())
num_classes

6

In [79]:
#preprocessing
# Builds the bag-of-words matrix for the training entries.
# Side effect: preprocess_data adds a "tokenized_entries" column to train_df.
# The first argument is not read by preprocess_data; the text comes from train_df["entries"].
X_train_transformed = preprocess_data(X_train['entries'], train_df)
X_train_transformed

<18000x10400 sparse matrix of type '<class 'numpy.int64'>'
	with 104511 stored elements in Compressed Sparse Row format>

In [80]:
# Builds the bag-of-words matrix for the test entries (also adds "tokenized_entries" to test_df).
# NOTE(review): this call fits its own CountVectorizer, so the columns here
# (2931 in the recorded output) do not correspond to the 10400 columns of
# X_train_transformed; a model fitted on the training matrix cannot score this one.
X_test_transformed = preprocess_data(X_test['entries'], test_df)
X_test_transformed

<2000x2931 sparse matrix of type '<class 'numpy.int64'>'
	with 11566 stored elements in Compressed Sparse Row format>

In [81]:
#model training
# 10-fold cross-validation of the logistic regression on the training matrix.
# The test arguments are passed but train_logistic_regression does not use them,
# so every score below comes from the training data only.
score1 = train_logistic_regression(X_train_transformed, y_train, X_test_transformed, y_test)
score1

{'fit_time': array([1.4509995 , 0.25950074, 0.25849819, 0.29150009, 0.24949813,
        0.25700235, 0.24749851, 0.2590003 , 0.25049996, 0.26100111]),
 'score_time': array([0.00199914, 0.00199938, 0.00199986, 0.00200129, 0.00200129,
        0.00199842, 0.00150061, 0.00200057, 0.00250077, 0.00200176]),
 'test_score': array([0.45888889, 0.46333333, 0.45888889, 0.46166667, 0.45555556,
        0.44055556, 0.46611111, 0.47388889, 0.46611111, 0.45444444]),
 'train_score': array([0.52475309, 0.52512346, 0.52679012, 0.52567901, 0.52549383,
        0.52833333, 0.52388889, 0.52475309, 0.52549383, 0.52703704])}

In [82]:
# Nested cross-validation of the random forest on the training matrix.
# NOTE(review): the recorded run ended in KeyboardInterrupt, so score2 was never
# produced; train_rf searches 20 parameter combinations with a 20-fold inner
# and 10-fold outer split.
score2 = train_rf(X_train_transformed, y_train, X_test_transformed, y_test)
score2

KeyboardInterrupt: 