In [1]:
import pandas as pd
import numpy as np
import altair as alt
import ast

import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from main import mat_sigmoid, mat_sigmoid_dv, mat_swish1, mat_swish1_dv, mat_relu, mat_relu_dv, preprocess, train_nn, apply_nn

In [2]:
t, corpus, classes = preprocess(pd.read_csv('train.csv'), min_ngram=1, max_ngram=2, remove_stopwords=True, lemmatize=True, feature_count=1000)


In [3]:
model = train_nn(t, [(6, mat_sigmoid, mat_sigmoid_dv)], 0.05, 500, 0)

In [4]:
predictions = apply_nn(t, model)
predictions.predictions = predictions.predictions.apply(lambda x: classes[np.argmax(x)])
predictions.emotions = predictions.emotions.apply(lambda x: classes[np.argmax(x)])
predictions

Unnamed: 0,id,emotions,_feel,_awful,_feel awful,_job,_get,_position,_succeed,_happen,...,_show,_leave feeling,_casual,_response,_saw,_voice,_perfect,_strength,_photo,predictions
0,27383,sadness,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,sadness
1,110083,sadness,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
2,140764,joy,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,joy
3,100071,sadness,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
4,2837,love,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anger
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,40054,fear,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fear
1196,104110,sadness,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
1197,106240,sadness,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
1198,5483,surprise,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anger


In [5]:
gold_labels = predictions.emotions
pred_labels = predictions.predictions

accuracy = sum([gold_labels[i]==pred_labels[i] for i in gold_labels.index]) / len(gold_labels)
print(f'Training accuracy: {accuracy:.4f}')

Training accuracy: 0.8042


In [6]:
def KFold(train_fn, apply_fn, **kwargs):
    K=5
    t = pd.read_csv('train.csv')
    n = len(t) // K
    folds = [t[i:i+n] for i in range(0, len(t), n)]
    total_accuracy = 0
    total_loss = 0
    for k in range(K):
        training_set, corpus, classes = preprocess(pd.concat(folds[:k] + folds[k+1:]).reset_index(drop=True), rebalance=False, **kwargs)
        validation_set, corpus, classes = preprocess(folds[k].reset_index(drop=True), schema=corpus, **kwargs)
        model = train_fn(training_set, **kwargs)
        predictions = apply_fn(validation_set, model)
        a = validation_set.emotions.tolist()
        b = predictions.predictions.tolist()
        loss = -np.sum(np.sum(np.multiply(a, np.nan_to_num(np.log(b))), axis=0))
        gold_labels = validation_set.emotions.apply(lambda x: classes[np.argmax(x)]).reset_index(drop=True)
        pred_labels = predictions.predictions.apply(lambda x: classes[np.argmax(x)]).reset_index(drop=True)
        z = pd.DataFrame({'gold': gold_labels, 'pred': pred_labels})
        print(z.groupby('gold').apply(lambda x: (x.pred ==x.name).sum() / len(x)))
        # print(pred_labels)
        accuracy = sum([gold_labels[i]==pred_labels[i] for i in gold_labels.index]) / len(gold_labels)
        print(accuracy)
        print(loss)
        total_accuracy += accuracy
        total_loss += loss
    return total_accuracy / K, total_loss / K

In [7]:
def KFold_embeddings(train_fn, apply_fn, **kwargs):
    K=5
    df = pd.read_csv('train.csv')
    del df['text']
    e = pd.read_csv('naive_glove_embeddings.csv')
    t = pd.merge(df, e, on='id')
    classes = sorted(list(t.emotions.unique()))
    t.emotions = t.emotions.apply(lambda x: [int(x==y) for y in classes])
    n = len(t) // K
    folds = [t[i:i+n] for i in range(0, len(t), n)]
    total_accuracy = 0
    total_loss = 0
    for k in range(K):
        training_set = pd.concat(folds[:k] + folds[k+1:])
        validation_set = folds[k].reset_index(drop=True)
        model = train_fn(training_set, **kwargs)
        predictions = apply_fn(validation_set, model)
        a = validation_set.emotions.tolist()
        b = predictions.predictions.tolist()
        loss = -np.sum(np.sum(np.multiply(a, np.nan_to_num(np.log(b))), axis=0))
        gold_labels = validation_set.emotions.apply(lambda x: classes[np.argmax(x)]).reset_index(drop=True)
        pred_labels = predictions.predictions.apply(lambda x: classes[np.argmax(x)]).reset_index(drop=True)
        accuracy = sum([gold_labels[i]==pred_labels[i] for i in gold_labels.index]) / len(gold_labels)
        z = pd.DataFrame({'gold': gold_labels, 'pred': pred_labels})
        print(z.groupby('gold').apply(lambda x: (x.pred ==x.name).sum() / len(x)))
        print(accuracy)
        print(loss)
        total_accuracy += accuracy
        total_loss += loss
    return total_accuracy / K, total_loss / K

In [8]:
# epochs = [250, 500, 750, 1000, 1250, 1500]
epochs = [500, 1000, 1500, 2000, 2500]
# epochs = [50*i for i in range(1,11)]
validation_results = pd.DataFrame(columns=['parameter_name', 'parameter_value', 'average_accuracy'])
for e in epochs:
    avg_acc = KFold(train_nn, apply_nn, layers=[(50, mat_sigmoid, mat_sigmoid_dv)], train_step_size=0.01, train_iter=e, dropout_rate=0.3, batch_size=0, min_ngram=1, max_ngram=3, remove_stopwords=True, lemmatize=True, use_tf=False, feature_count=1000)
    validation_results.loc[len(validation_results)] = pd.Series({
        'parameter_name': 'epochs',
        'parameter_value': e,
        'average_accuracy': avg_acc
    })

validation_results

gold
anger       0.095238
fear        0.033333
joy         0.763441
love        0.058824
sadness     0.478873
surprise    0.125000
dtype: float64
0.4583333333333333
459.95108287135247
gold
anger       0.078947
fear        0.181818
joy         0.418919
love        0.080000
sadness     0.506849
surprise    0.000000
dtype: float64
0.32083333333333336
596.3213037691404


KeyboardInterrupt: 

In [None]:
alt.Chart(validation_results).mark_line().encode(
    x='parameter_value',
    y='average_accuracy',
    color='parameter_name'
)

In [None]:
dropouts = [0, 0.15, 0.3, 0.45, 0.6, 0.75]
validation_results = pd.DataFrame(columns=['parameter_name', 'parameter_value', 'average_accuracy'])
for d in dropouts:
    avg_acc = KFold(train_nn, apply_nn, layers=[(6, mat_sigmoid, mat_sigmoid_dv)], train_step_size=0.01, train_iter=750, dropout_rate=d, min_ngram=1, max_ngram=3, remove_stopwords=True, lemmatize=True, use_tf=False, feature_count=1000)
    validation_results.loc[len(validation_results)] = pd.Series({
        'parameter_name': 'dropout',
        'parameter_value': d,
        'average_accuracy': avg_acc
    })

validation_results

gold
anger       0.285714
fear        0.200000
joy         0.462366
love        0.058824
sadness     0.549296
surprise    0.000000
dtype: float64
0.3958333333333333
gold
anger       0.210526
fear        0.227273
joy         0.540541
love        0.080000
sadness     0.328767
surprise    0.000000
dtype: float64
0.32916666666666666
gold
anger       0.189189
fear        0.214286
joy         0.588889
love        0.157895
sadness     0.305085
surprise    0.000000
dtype: float64
0.3625
gold
anger       0.303030
fear        0.166667
joy         0.463415
love        0.200000
sadness     0.535211
surprise    0.000000
dtype: float64
0.39166666666666666
gold
anger       0.312500
fear        0.200000
joy         0.273973
love        0.040000
sadness     0.571429
surprise    0.000000
dtype: float64
0.3333333333333333
gold
anger       0.000000
fear        0.266667
joy         0.741935
love        0.058824
sadness     0.408451
surprise    0.000000
dtype: float64
0.44583333333333336
gold
anger       0.

Unnamed: 0,parameter_name,parameter_value,average_accuracy
0,dropout,0.0,0.3625
1,dropout,0.15,0.371667
2,dropout,0.3,0.383333
3,dropout,0.45,0.3475
4,dropout,0.6,0.336667
5,dropout,0.75,0.344167


In [None]:
alt.Chart(validation_results).mark_line().encode(
    x='parameter_value',
    y='average_accuracy',
    color='parameter_name'
)

In [None]:
hidden_nodes = [3,4,6,8,10]
validation_results = pd.DataFrame(columns=['parameter_name', 'parameter_value', 'average_accuracy'])
for h in hidden_nodes:
    avg_acc = KFold(train_nn, apply_nn, layers=[(h, mat_sigmoid, mat_sigmoid_dv)], train_step_size=0.01, train_iter=750, dropout_rate=0.3, min_ngram=1, max_ngram=3, remove_stopwords=True, lemmatize=True, use_tf=False, feature_count=1000)
    validation_results.loc[len(validation_results)] = pd.Series({
        'parameter_name': 'hidden nodes',
        'parameter_value': h,
        'average_accuracy': avg_acc
    })

validation_results

gold
anger       0.666667
fear        0.000000
joy         0.559140
love        0.235294
sadness     0.000000
surprise    0.000000
dtype: float64
0.2916666666666667
gold
anger       0.000000
fear        0.000000
joy         0.743243
love        0.000000
sadness     0.465753
surprise    0.000000
dtype: float64
0.37083333333333335
gold
anger       0.000000
fear        0.107143
joy         0.855556
love        0.000000
sadness     0.322034
surprise    0.000000
dtype: float64
0.4125
gold
anger       0.000000
fear        0.000000
joy         0.560976
love        0.000000
sadness     0.887324
surprise    0.000000
dtype: float64
0.45416666666666666
gold
anger       0.000000
fear        0.000000
joy         0.972603
love        0.000000
sadness     0.103896
surprise    0.000000
dtype: float64
0.32916666666666666
gold
anger       0.000000
fear        0.000000
joy         0.548387
love        0.000000
sadness     0.661972
surprise    0.000000
dtype: float64
0.4083333333333333
gold
anger       0.

Unnamed: 0,parameter_name,parameter_value,average_accuracy
0,hidden nodes,3,0.371667
1,hidden nodes,4,0.359167
2,hidden nodes,6,0.380833
3,hidden nodes,8,0.346667
4,hidden nodes,10,0.374167


In [None]:
alt.Chart(validation_results).mark_line().encode(
    x='parameter_value',
    y='average_accuracy',
    color='parameter_name'
)

In [None]:
epochs = [20000, 4000, 6000, 8000, 10000, 12000, 14000, 16000]
# epochs = [500*i for i in range(1,101)]
validation_results_embeddings = pd.DataFrame(columns=['parameter_name', 'parameter_value', 'average_accuracy'])
for e in epochs:
    avg_acc = KFold_embeddings(train_nn, apply_nn, layers=[(30, mat_sigmoid, mat_sigmoid_dv)], train_step_size=0.01, train_iter=e, dropout_rate=0.2, batch_size=50)
    validation_results_embeddings.loc[len(validation_results_embeddings)] = pd.Series({
        'parameter_name': 'epochs',
        'parameter_value': e,
        'average_accuracy': avg_acc
    })

validation_results_embeddings

gold
anger       0.571429
fear        0.233333
joy         0.688172
love        0.166667
sadness     0.708333
surprise    0.000000
dtype: float64
0.5661157024793388
gold
anger       0.476190
fear        0.181818
joy         0.800000
love        0.125000
sadness     0.577465
surprise    0.000000
dtype: float64
0.5289256198347108
gold
anger       0.405405
fear        0.142857
joy         0.700000
love        0.421053
sadness     0.639344
surprise    0.000000
dtype: float64
0.5330578512396694
gold
anger       0.451613
fear        0.173913
joy         0.833333
love        0.150000
sadness     0.581081
surprise    0.000000
dtype: float64
0.5537190082644629
gold
anger       0.515152
fear        0.296296
joy         0.666667
love        0.320000
sadness     0.493333
surprise    0.142857
dtype: float64
0.5
gold
anger       0.428571
fear        0.033333
joy         0.795699
love        0.000000
sadness     0.722222
surprise    0.000000
dtype: float64
0.5619834710743802
gold
anger       0.190476

Unnamed: 0,parameter_name,parameter_value,average_accuracy
0,epochs,20000,0.536364
1,epochs,4000,0.506612
2,epochs,6000,0.526446
3,epochs,8000,0.502479
4,epochs,10000,0.5
5,epochs,12000,0.466942
6,epochs,14000,0.533884
7,epochs,16000,0.527273


In [None]:
alt.Chart(validation_results_embeddings).mark_line().encode(
    x='parameter_value',
    y='average_accuracy',
    color='parameter_name'
)

In [27]:
dropouts = [0.0, 0.15, 0.3, 0.45, 0.6, 0.75]
validation_results_embeddings = pd.DataFrame(columns=['parameter_name', 'parameter_value', 'average_accuracy', 'average_loss'])
for d in dropouts:
    avg_acc, avg_loss = KFold_embeddings(train_nn, apply_nn, layers=[(30, mat_sigmoid, mat_sigmoid_dv)], train_step_size=0.01, train_iter=12000, dropout_rate=d, batch_size=50)
    validation_results_embeddings.loc[len(validation_results_embeddings)] = pd.Series({
        'parameter_name': 'dropout',
        'parameter_value': d,
        'average_accuracy': avg_acc,
        'average_loss': avg_loss
    })

validation_results_embeddings

gold
anger       0.523810
fear        0.300000
joy         0.666667
love        0.411765
sadness     0.605634
surprise    0.000000
dtype: float64
0.55
459.6945642620692
gold
anger       0.550000
fear        0.181818
joy         0.770270
love        0.200000
sadness     0.492958
surprise    0.000000
dtype: float64
0.5125
491.26986961779767
gold
anger       0.486486
fear        0.428571
joy         0.584270
love        0.157895
sadness     0.583333
surprise    0.142857
dtype: float64
0.5041666666666667
490.9492619274975
gold
anger       0.366667
fear        0.333333
joy         0.738095
love        0.350000
sadness     0.569444
surprise    0.000000
dtype: float64
0.5375
423.22676354629374
gold
anger       0.363636
fear        0.160000
joy         0.666667
love        0.320000
sadness     0.480519
surprise    0.000000
dtype: float64
0.45416666666666666
523.5896761492282
gold
anger       0.571429
fear        0.300000
joy         0.731183
love        0.235294
sadness     0.619718
surprise  

Unnamed: 0,parameter_name,parameter_value,average_accuracy,average_loss
0,dropout,0.0,0.511667,477.746027
1,dropout,0.15,0.529167,425.411506
2,dropout,0.3,0.523333,505.062658
3,dropout,0.45,0.483333,561.055163
4,dropout,0.6,0.5025,622.925912
5,dropout,0.75,0.470833,823.534572


In [28]:
base = alt.Chart(validation_results_embeddings).encode(
    alt.X('parameter_value', title='Dropout Ratio')
)

acc = base.mark_line(stroke='salmon').encode(
    alt.Y('average_accuracy', axis=alt.Axis(title='Average Accuracy', titleColor='salmon'))
)

loss = base.mark_line(stroke='lightskyblue').encode(
    alt.Y('average_loss', axis=alt.Axis(title='Average Loss', titleColor='lightskyblue'))
)

alt.layer(acc, loss).resolve_scale(
    y='independent'
).properties(title='Neural Net')

In [None]:
hidden_nodes = [5,10,15,20,25,30,35]
validation_results_embeddings = pd.DataFrame(columns=['parameter_name', 'parameter_value', 'average_accuracy'])
for h in hidden_nodes:
    avg_acc = KFold_embeddings(train_nn, apply_nn, layers=[(h, mat_sigmoid, mat_sigmoid_dv)], train_step_size=0.01, train_iter=12000, dropout_rate=0.3, batch_size=50)
    validation_results_embeddings.loc[len(validation_results_embeddings)] = pd.Series({
        'parameter_name': 'hidden nodes',
        'parameter_value': h,
        'average_accuracy': avg_acc
    })

validation_results_embeddings

gold
anger       0.000000
fear        0.000000
joy         0.849462
love        0.000000
sadness     0.875000
surprise    0.000000
dtype: float64
0.5867768595041323
gold
anger       0.000000
fear        0.000000
joy         0.706667
love        0.000000
sadness     0.901408
surprise    0.000000
dtype: float64
0.4834710743801653
gold
anger       0.405405
fear        0.035714
joy         0.733333
love        0.000000
sadness     0.622951
surprise    0.000000
dtype: float64
0.49586776859504134
gold
anger       0.161290
fear        0.043478
joy         0.000000
love        0.000000
sadness     0.905405
surprise    0.000000
dtype: float64
0.30165289256198347
gold
anger       0.0
fear        0.0
joy         1.0
love        0.0
sadness     0.0
surprise    0.0
dtype: float64
0.30991735537190085
gold
anger       0.476190
fear        0.133333
joy         0.881720
love        0.277778
sadness     0.611111
surprise    0.000000
dtype: float64
0.5991735537190083
gold
anger       0.738095
fear       

Unnamed: 0,parameter_name,parameter_value,average_accuracy
0,hidden nodes,5,0.435537
1,hidden nodes,10,0.489256
2,hidden nodes,15,0.533884
3,hidden nodes,20,0.523967
4,hidden nodes,25,0.508264
5,hidden nodes,30,0.545455
6,hidden nodes,35,0.533058


In [None]:
alt.Chart(validation_results_embeddings).mark_line().encode(
    x='parameter_value',
    y='average_accuracy',
    color='parameter_name'
)