In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import warnings
from time import time

from collections import Counter

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, \
    classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from keras_preprocessing.sequence import pad_sequences
import tensorflow as tf

# Set TensorFlow's log verbosity to INFO for the session.
tf.logging.set_verbosity(tf.logging.INFO)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

  from ._conv import register_converters as _register_converters


# Data

In [2]:
def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserialising, so only point this at files produced by this project.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


learn_labels = load_pickle("../Data/Learn/labels.pkl")
learn_sequences = load_pickle("../Data/generated/my_learn_sequences.pkl")
embeddings = load_pickle("../Data/generated/my_embeddings.pkl")

# Stratified 70/30 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    learn_sequences, learn_labels, test_size=0.3,
    shuffle=True, stratify=learn_labels, random_state=42 + 2
)

# The sequences have different lengths (the shapes displayed below are 1-D), so
# request an object array explicitly: NumPy >= 1.24 raises ValueError when asked
# to infer a ragged array, and earlier releases emit a deprecation warning.
X_train, X_test = np.array(X_train, dtype=object), np.array(X_test, dtype=object)
y_train, y_test = np.array(y_train), np.array(y_test)

# Displayed as the cell output: a quick sanity check of the array shapes.
embeddings.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28934, 300), (32151,), (32151,), (13779,), (13779,))

# Hyperparameters

### Learning rate and number of epochs

In [None]:
# Grid search over the learning rate, recording the validation score after
# every epoch so that the number of epochs can be chosen at the same time.
tf.logging.set_verbosity(tf.logging.ERROR)

n_splits, num_epochs = 3, 20
learning_rates = [0.001, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]
splitter = StratifiedKFold(n_splits, shuffle=True, random_state=1)
# results[lr][epoch] -> list with one validation score per CV fold.
results = {lr: {epoch: [] for epoch in range(num_epochs)} for lr in learning_rates}

# These values do not depend on the learning rate or the fold, so compute them
# once instead of rebuilding the Counter (twice) and rescanning X_train for
# every model that is created.
# NOTE(review): the class weight is derived from the whole training set rather
# than from each fold's training part — confirm this is intended.
class_counts = Counter(y_train)
weight_class_M = class_counts["C"] / class_counts["M"]
sentence_length = max(map(len, X_train))

for lr in learning_rates:
    for train_ind, test_ind in splitter.split(X_train, y_train):
        # A fresh model per (learning rate, fold) pair.
        model = LSTMModel(
            weight_class_M=weight_class_M,
            sentence_length=sentence_length,
            embeddings=embeddings,
            num_units=50,
            batch_size=128,
            dropout_keep_prob=1.0,
            learning_rate=lr,
        )
        # Silence library warnings for the whole training run of this fold.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            # NOTE(review): this assumes each fit() call continues training the
            # same model for one more epoch; LSTMModel is not defined in the
            # visible part of the notebook — confirm.
            for epoch in range(num_epochs):
                model.fit(X_train[train_ind], y_train[train_ind])
                score = model.score(X_train[test_ind], y_train[test_ind])
                results[lr][epoch].append(score)

In [29]:
# Rows are epochs, columns are learning rates; each cell holds the list of
# per-fold scores. Display the mean minus the standard deviation of each list.
fold_scores = pd.DataFrame(results)
fold_scores.applymap(np.mean) - fold_scores.applymap(np.std)

Unnamed: 0,0.001,0.005,0.0075,0.01,0.025,0.05,0.075,0.1
0,0.219186,0.242043,0.257886,0.26062,0.281237,0.219403,0.113782,0.25139
1,0.219518,0.273932,0.274697,0.292848,0.243436,0.221913,0.10389,0.224622
2,0.219404,0.268741,0.34137,0.300429,0.236646,0.23259,0.190903,0.235841
3,0.224053,0.299494,0.315628,0.294848,0.238888,0.217161,0.188953,0.20252
4,0.226083,0.279089,0.357015,0.34926,0.221479,0.21559,0.196997,0.160555
5,0.233993,0.268328,0.354776,0.323535,0.214209,0.229312,0.198591,0.196273
6,0.240764,0.278549,0.361943,0.369923,0.225692,0.237473,0.198712,0.219391
7,0.244185,0.282142,0.384298,0.383922,0.235038,0.222319,0.19842,0.223872
8,0.250533,0.289522,0.398939,0.414784,0.155839,0.229974,0.198287,0.233108
9,0.253443,0.295879,0.218489,0.401989,0.184943,0.227743,0.144751,0.236398


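Best result (selected on the mean − std of the cross-validated score):
- learning_rate = 0.01
- num_epochs = 11

Note: the table above only lists epochs 0–9, so it does not cover the chosen `num_epochs = 11`; re-run the search cell (which trains for 20 epochs) to regenerate it.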

### Test

In [15]:
# Train one model on the full training split with the selected
# hyperparameters and report its metrics on the held-out test split.
tf.logging.set_verbosity(tf.logging.ERROR)

label_counts = Counter(y_train)
longest_sequence = max(map(len, X_train))

model = LSTMModel(
    # Ratio of "C" to "M" label counts in the training split.
    weight_class_M=label_counts["C"] / label_counts["M"],
    sentence_length=longest_sequence,
    embeddings=embeddings,
    num_units=50,
    batch_size=128,
    dropout_keep_prob=1.0,
    learning_rate=0.01,
)

model.fit(X_train, y_train, num_epochs=11)

test_predictions = model.predict(X_test)
print(classification_report(y_test, test_predictions))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


              precision    recall  f1-score   support

           C       0.95      0.77      0.85     11974
           M       0.32      0.71      0.44      1805

   micro avg       0.76      0.76      0.76     13779
   macro avg       0.63      0.74      0.64     13779
weighted avg       0.86      0.76      0.79     13779

