In [1]:
import pandas as pd
import numpy as np
import altair as alt

import os
import sys
# Put the parent of the current working directory on the import path
# (presumably where the `main` module imported below lives).
nb_dir = os.path.dirname(os.getcwd())
already_importable = nb_dir in sys.path
if not already_importable:
    sys.path.append(nb_dir)

from main import preprocess, train_lr, apply_lr

# LR Matrix Tests

In [2]:
# Load the raw training data, then preprocess it with min_ngram == max_ngram == 1
# (a single n-gram size; presumably unigrams -- confirm against main.preprocess).
raw_train = pd.read_csv('train.csv')
df, corpus, classes = preprocess(raw_train, min_ngram=1, max_ngram=1)

# Positional hyperparameters; judging by the keyword names used in the KFold call
# further down they are presumably lambda, step size, max iterations and
# tolerance -- TODO confirm against main.train_lr.
model = train_lr(df, 0.05, 0.05, 500, 0.0001)

# Display the fitted weights as the cell output.
model

matrix([[-0.04490566,  0.62870282, -0.07520998, ..., -0.01683676,
         -0.27164875,  0.62784749],
        [-0.01829712, -0.02778816, -0.01561372, ..., -0.01712876,
         -0.04769062, -0.12031017],
        [ 0.60102442, -0.25231631, -0.29953486, ...,  0.69122411,
         -0.50513354,  1.40260363],
        [-0.34080426, -0.09097559, -0.03160439, ..., -0.16337355,
         -0.12062832, -0.13877689],
        [-0.1946215 , -0.2314961 , -0.13731922, ..., -0.49146181,
          0.96041807, -1.40179335],
        [-0.00239589, -0.02612667,  0.55928216, ..., -0.00242323,
         -0.01531685, -0.36957071]])

In [3]:
# Score the training frame and decode both label columns into class names.
#
# `.assign` builds a NEW frame instead of overwriting columns through attribute
# assignment. If `apply_lr` hands back `df` itself (or something sharing its
# data), the old in-place writes would replace `df.emotions` with strings, and
# re-running this cell would then try to argmax already-decoded labels.
predictions = apply_lr(df, model).assign(
    # Each cell holds a per-class vector; pick the class at its largest entry.
    predictions=lambda frame: frame['predictions'].apply(lambda x: classes[np.argmax(x)]),
    emotions=lambda frame: frame['emotions'].apply(lambda x: classes[np.argmax(x)]),
)
predictions

Unnamed: 0,id,emotions,_administrators,_illustrative,_communication,_hairs,_alongside,_consequently,_overdoses,_shitting,...,_dip,_chat,_defeated,_hang,_overall,_touched,_warm,_lots,_incomprehension,predictions
0,27383,sadness,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
1,110083,sadness,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
2,140764,joy,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,joy
3,100071,sadness,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
4,2837,love,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,love
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,40054,fear,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fear
1196,104110,sadness,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
1197,106240,sadness,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
1198,5483,surprise,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,surprise


In [4]:
gold_labels = predictions.emotions
pred_labels = predictions.predictions

# Both Series are columns of the same frame, so they share an index and can be
# compared element-wise in one vectorised step; the mean of the resulting
# booleans is the fraction of rows where prediction and gold label agree.
accuracy = (gold_labels == pred_labels).mean()
print(f'Training accuracy: {accuracy:.4f}')

Training accuracy: 1.0000


In [5]:
def KFold(*, K=5, **kwargs):
    """Return the mean validation accuracy of K-fold cross-validation on 'train.csv'.

    The CSV is read and run through ``preprocess`` once. The resulting rows are
    cut into ``K`` contiguous, unshuffled folds whose sizes differ by at most
    one row, so every row is used for validation exactly once. For each fold a
    model is fitted with ``train_lr`` on the remaining folds and scored with
    ``apply_lr`` on the held-out fold.

    Args:
        K: Number of folds (keyword-only, defaults to 5).
        **kwargs: Forwarded unchanged to BOTH ``preprocess`` and ``train_lr``,
            so each of them has to tolerate the other's options.

    Returns:
        float: Unweighted mean of the per-fold accuracies.

    Raises:
        ValueError: If ``K`` is smaller than 2 or there are fewer rows than folds.
    """
    if K < 2:
        raise ValueError(f'K must be at least 2, got {K}')

    # NOTE(review): preprocessing sees every row before the split; if it derives
    # anything from the data (vocabulary, term statistics), validation rows
    # influence the training features -- confirm this is intended.
    t, _corpus, classes = preprocess(pd.read_csv('train.csv'), **kwargs)

    n_rows = len(t)
    if n_rows < K:
        raise ValueError(f'Cannot split {n_rows} rows into {K} folds')

    # Hand the remainder out one row at a time to the first folds. Stepping by
    # len(t) // K instead creates a (K+1)-th leftover chunk that is always
    # trained on but never validated whenever len(t) is not a multiple of K.
    base_size, remainder = divmod(n_rows, K)
    bounds = [0]
    for k in range(K):
        bounds.append(bounds[-1] + base_size + (1 if k < remainder else 0))
    folds = [t.iloc[bounds[k]:bounds[k + 1]] for k in range(K)]

    def decode(vector):
        # Map a per-class vector to the class at its largest entry.
        return classes[np.argmax(vector)]

    total_accuracy = 0.0
    for k in range(K):
        training_set = pd.concat(folds[:k] + folds[k + 1:])
        validation_set = folds[k]
        model = train_lr(training_set, **kwargs)
        predictions = apply_lr(validation_set, model)
        gold_labels = validation_set.emotions.apply(decode)
        pred_labels = predictions.predictions.apply(decode)
        # Look predictions up by the gold labels' index (label-based, as before),
        # then compare the raw values in one vectorised step.
        matches = gold_labels.to_numpy() == pred_labels.loc[gold_labels.index].to_numpy()
        total_accuracy += float(matches.mean())
    return total_accuracy / K

In [6]:
# Settings for this cross-validation run; KFold forwards all of them to both
# preprocess and train_lr.
cv_settings = {
    'train_lambda': 0.05,
    'train_step_size': 0.05,
    'train_max_iter': 500,
    'train_tolerance': 0.00001,
    'min_ngram': 1,
    'max_ngram': 1,
    'remove_stopwords': True,
    'lemmatize': False,
    'use_tf': False,
}
KFold(**cv_settings)

0.6433333333333333