In [1]:
#importing packages
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
import time
import torchvision.transforms as transforms

In [2]:
#loading data
with open('data_full.json') as file:
    oos = json.load(file)

In [3]:
#listing files
oos.keys()

dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])

In [4]:
#assembling files into pandas dataframes
temp = oos['oos_val']
oos_val = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})
temp = oos['val']
val = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})
temp = oos['train']
train = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})
temp = oos['oos_test']
oos_test = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})
temp = oos['test']
test = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})
temp = oos['oos_train']
oos_train = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})

In [5]:
#inspecting dataframes
print(oos_val.head())
print(val.head())
print(train.head())
print(oos_test.head())
print(test.head())
print(oos_train.head())


                                               query domain
1                                 a show on broadway    oos
2                 who has the best record in the nfl    oos
3                 how do i find the area of a circle    oos
4                  how many onions do i have on hand    oos
                                       query     domain
0   in spanish, meet me tomorrow is said how  translate
1     in french, how do i say, see you later  translate
2           how do you say hello in japanese  translate
3  how do i ask about the weather in chinese  translate
4  how can i say "cancel my order" in french  translate
                                               query     domain
0  what expression would i use to say i love you ...  translate
1  can you tell me how to say 'i do not speak muc...  translate
2  what is the equivalent of, 'life is good' in f...  translate
3  tell me how to say, 'it is a beautiful morning...  translate
4  if i were mongolian, how would i say that

In [6]:
#our training protocol will use the 'out of scope' training data to train for this class, so appending the data to one dataframe
train = train.append(oos_train, ignore_index=True)

In [7]:
#defining the vectorizer that will be used for this dataset
vectorizer = TfidfVectorizer()

In [8]:
#fitting the TFIDF vectorizer to the training data queries and transforming it
X = vectorizer.fit_transform(train['query']).toarray()

In [9]:
# checking the size of the array
X.shape

(15100, 5146)

In [11]:
#inspecting vectorizer features.
print(vectorizer.get_feature_names()[:20])
print(vectorizer.get_feature_names()[-20:])

['00', '000', '005', '00am', '00pm', '01', '02', '03', '05', '098098', '10', '100', '1000', '10000', '100000', '10294', '104', '10500', '10am', '10kg']
['zales', 'zander', 'zazie', 'zealand', 'zebras', 'zen', 'zenith', 'zepher', 'zephers', 'zeppelin', 'zesty', 'zeus', 'zion', 'zippy', 'zippys', 'ziti', 'zombie', 'zone', 'zoo', 'zulu']


These aren't all words, but this reflects the fact that inputs will not always be words. Equally some are variations (zippy/zippys) but these will be preserved to minimise the work done in preprocessing input data during deployment.

In [12]:
#making dataframe of X
X_df = pd.DataFrame(X)

In [13]:
#shape of X dataframe
X_df.shape

(15100, 5146)

In [14]:
#shape of initial training dataframe
train.shape

(15100, 2)

In [15]:
#joining training dataframe to vectorized words
train_vec = train.join(X_df)

In [16]:
#finding shape of joined dataframe
train_vec.shape

(15100, 5148)

In [17]:
#Inspecting first few rows of dataframe
train_vec.head()

Unnamed: 0,query,domain,0,1,2,3,4,5,6,7,...,5136,5137,5138,5139,5140,5141,5142,5143,5144,5145
0,what expression would i use to say i love you ...,translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,can you tell me how to say 'i do not speak muc...,translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"what is the equivalent of, 'life is good' in f...",translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"tell me how to say, 'it is a beautiful morning...",translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"if i were mongolian, how would i say that i am...",translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
#dropping the text queries
train_vec = train_vec.drop('query', axis=1)

In [19]:
#inspecting values of a randomly chosen column to check that not all entries are 0.0
train_vec[5136].value_counts().head(10)

0.000000    15095
0.744097        1
0.731775        1
0.694293        1
0.577886        1
0.698728        1
Name: 5136, dtype: int64

In [20]:
#inspecting unique domains
train_vec['domain'].unique()

array(['translate', 'transfer', 'timer', 'definition', 'meaning_of_life',
       'insurance_change', 'find_phone', 'travel_alert', 'pto_request',
       'improve_credit_score', 'fun_fact', 'change_language', 'payday',
       'replacement_card_duration', 'time', 'application_status',
       'flight_status', 'flip_coin', 'change_user_name',
       'where_are_you_from', 'shopping_list_update', 'what_can_i_ask_you',
       'maybe', 'oil_change_how', 'restaurant_reservation', 'balance',
       'confirm_reservation', 'freeze_account', 'rollover_401k',
       'who_made_you', 'distance', 'user_name', 'timezone', 'next_song',
       'transactions', 'restaurant_suggestion', 'rewards_balance',
       'pay_bill', 'spending_history', 'pto_request_status',
       'credit_score', 'new_card', 'lost_luggage', 'repeat', 'mpg',
       'oil_change_when', 'yes', 'travel_suggestion', 'insurance',
       'todo_list_update', 'reminder', 'change_speed', 'tire_pressure',
       'no', 'apr', 'nutrition_info', 'c

In [21]:
#constructing a mapping dictionary for domains to tranform them into nubmers
y_dic = {}
domain = 0
for item in train_vec['domain'].unique():
    y_dic[item] = domain
    domain += 1

In [22]:
#shuffling training data
train_vec = shuffle(train_vec, random_state=0)
train_vec.head()

Unnamed: 0,domain,0,1,2,3,4,5,6,7,8,...,5136,5137,5138,5139,5140,5141,5142,5143,5144,5145
8218,do_you_have_pets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8136,routing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
577,insurance_change,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7446,todo_list,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3978,pto_request_status,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
#creating a column of target values
train_vec['y'] = train_vec['domain'].replace(y_dic)
#removing the text domains
train_vec = train_vec.iloc[:, 1:]
#defining x and y data
train_x = train_vec.iloc[:,:-1]
train_y = train_vec.iloc[:,-1]

In [24]:
#Transforming queries from other datasets to vectors
Xv = vectorizer.transform(val['query']).toarray()
Xvo = vectorizer.transform(oos_val['query']).toarray()
Xtest = vectorizer.transform(test['query']).toarray()
Xto = vectorizer.transform(oos_test['query']).toarray()

In [25]:
#turning arrays to dataframes
Xv_df = pd.DataFrame(Xv)
Xvo_df = pd.DataFrame(Xvo)
Xtest_df = pd.DataFrame(Xtest)
Xto_df = pd.DataFrame(Xto)

In [26]:
#creating vector datasets for validation set (excluding out of scope values)
val_vec = pd.concat([val, Xv_df], axis=1)
val_vec = val_vec.drop('query', axis=1)
val_vec = shuffle(val_vec, random_state=0)
val_vec['y'] = val_vec['domain'].replace(y_dic)
val = val_vec.iloc[:, 1:]
val_x = val.iloc[:,:-1]
val_y = val.iloc[:,-1]

In [27]:
#creating vector datasets for out of scope validation set
val_oos_vec = pd.concat([oos_val, Xvo_df], axis=1)
val_oos_vec = val_oos_vec.drop('query', axis=1)
val_oos_vec = shuffle(val_oos_vec, random_state=0)
val_oos_vec['y'] = val_oos_vec['domain'].replace(y_dic)
val_oos = val_oos_vec.iloc[:, 1:]
val_oos_x = val_oos.iloc[:,:-1]
val_oos_y = val_oos.iloc[:,-1]
val_full = val.append(val_oos, ignore_index=True)
val_full_x = val_full.iloc[:,:-1]
val_full_y = val_full.iloc[:,-1]

In [28]:
#creating vector datasets for testing set (excluding out of scope values)
test_vec = pd.concat([test, Xtest_df], axis=1)
test_vec = test_vec.drop('query', axis=1)
test_vec = shuffle(test_vec, random_state=0)
test_vec['y']= test_vec['domain'].replace(y_dic)
test = test_vec.iloc[:, 1:]
test_x = test.iloc[:,:-1]
test_y = test.iloc[:,-1]

In [29]:
#creating vector datasets for out of scope testing set
test_oos_vec = pd.concat([oos_test, Xto_df], axis=1)
test_oos_vec = test_oos_vec.drop('query', axis=1)
test_oos_vec = shuffle(test_oos_vec, random_state=0)
test_oos_vec['y'] = test_oos_vec['domain'].replace(y_dic)
test_oos = test_oos_vec.iloc[:, 1:]
test_oos_x = test.iloc[:,:-1]
test_oos_y = test.iloc[:,-1]

In [30]:
#importing torch packages
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils


Initial MLP using parameters from reference paper.

In [32]:
#defining perceptron dimensions
vocab_dim = 5146 #number of words in the vocabulary
hidden_dim = 400 #number of neurons in hidden layer
output_dim = 151 #number of classes (150 in scope plus Out of Scope)

In [33]:
#assembling training arrays.
train_y = np.array(train_y)
train_x = np.array(train_x)

In [34]:
#defining multilayer perceptron
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim, #setting input dimensions
            hidden_dim=hidden_dim, #setting hidden layer dimensions
            output_dim=output_dim, #setting output dimensions
            dropout=0 #setting a dropout rate
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout) #defining the dropout function

        self.hidden = nn.Linear(input_dim, hidden_dim) #defining the hidden layer function
        self.output = nn.Linear(hidden_dim, output_dim) #defining the output layer function

    def forward(self, X, **kwargs):
        X = torch.tanh(self.hidden(X)) #applying a tanh activation function to outputs from the hidden layer
        X = self.dropout(X) #applying the dropout function
        X = F.softmax(self.output(X), dim=-1) #applying the softmax function to the outputs
        return X

In [35]:
#importing skorch modules
from skorch import NeuralNetClassifier
from skorch.callbacks import EarlyStopping
from skorch.dataset import Dataset
from skorch.helper import predefined_split

In [36]:
#transforming data to tensors
train_x = torch.tensor(train_x).float()
val_full_x = torch.tensor(np.array(val_full_x)).float()
val_x = torch.tensor(np.array(val_x)).float()
val_oos_x = torch.tensor(np.array(val_oos_x)).float()
test_x = torch.tensor(np.array(test_x)).float()
test_oos_x = torch.tensor(np.array(test_oos_x)).float()

In [37]:
#transforming targets to arrays
val_full_y = np.array(val_full_y)
val_y = np.array(val_y)
val_oos_y = np.array(val_oos_y)
test_y = np.array(test_y)
test_oos_y = np.array(test_oos_y)

In [38]:
#initialising the classifier
net = NeuralNetClassifier( #a neural network classifier
    module=CLINCModule, #use the MLP defined previously
    criterion=torch.nn.CrossEntropyLoss, #using Cross Entropy Loss as the loss function, due to being a multiclass classification problem
    max_epochs=1000, #defining the maximum number of epochs
    optimizer=torch.optim.SGD, #using stochastic gradient descent for optimization
    callbacks=[EarlyStopping(patience=5)], #implementing early stopping with a patience of 5 epochs
)

In [39]:
#fitting the classifier to the training data
net.fit(train_x, train_y)


  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0173[0m       [32m0.0046[0m        [35m5.0173[0m  2.8404
      2        [36m5.0173[0m       0.0046        [35m5.0173[0m  2.5806
      3        [36m5.0173[0m       0.0046        5.0173  2.5644
      4        [36m5.0173[0m       0.0046        [35m5.0173[0m  2.5759
      5        [36m5.0173[0m       0.0046        5.0173  2.6285
Stopping since valid_loss has not improved in the last 5 epochs.


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=CLINCModule(
    (dropout): Dropout(p=0, inplace=False)
    (hidden): Linear(in_features=5146, out_features=400, bias=True)
    (output): Linear(in_features=400, out_features=151, bias=True)
  ),
)

In [40]:
tlabels = net.predict(train_x) #using the trained model to predict from training data
tacc = accuracy_score(tlabels, train_y) #comparing predicted domains to actual classes using classification accuracy
print('training accuracy')
print(tacc) #printing the training accuracy
time0 = time.time() # timer
labels = net.predict(val_x) #predicting validation set
acc = accuracy_score(labels, val_y) #calculating accuracy score for validation predictions
time1 = time.time() # timer
print('Val accuracy')
print(acc) #printing validation accuracy
print('pred time')
print(time1-time0) #printing validation prediction time
time2 = time.time() # timer
olabels = net.predict(val_oos_x) # predicting for Out of Scope Validation set
oos_acc = accuracy_score(olabels, val_oos_y) #calculating accuracy score for OOS validation predictions
time3 = time.time() # timer
print('OOS Val Accuracy')
print(oos_acc) #printing OOS validation accuracy
print('OOS pred time')
print(time3-time2) #printing OOS validation prediction time

training accuracy
0.005298013245033113
Val accuracy
0.003
pred time
0.3200225830078125
OOS Val Accuracy
0.23
OOS pred time
0.01287078857421875


Converged too quickly, likely not in global minimum. Will reattempt using different dropouts, learning rates, momentum.

In [41]:
dropouts = [1, 0.5, 0.1] #defining potential dropout values
tacc={} #initialising training accuracy results dictionary
vacc = {} #initialising validation accuracy results dictionary
vtime = {} #initialising validation time dictionary
oacc = {} #initialising OOS Validation accuracy results dictionary
otime = {} #initialising OOS validation time dictionary
for d in dropouts:    #looping over dropout values
    print(d) #print the dropout rate
    class CLINCModule(nn.Module): #define the module, as per above
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=d #looping through this value
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   #initialising the model as above
    net = NeuralNetClassifier(
    module=CLINCModule,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.SGD,
    callbacks=[EarlyStopping(patience=10)], #Bigger early stopping patience of 10 epochs
    )
    
    #fitting model and calculating results
    net.fit(train_x, train_y) #fitting to training data
    tlabels = net.predict(train_x) #predicting from training data
    tacc[d] = accuracy_score(tlabels, train_y) #calculating training accuracy and storing it in dict
    print('training accuracy')
    print(tacc) #printing full training accuracy dict
    time0 = time.time() #timer
    labels = net.predict(val_x) #predicting validation data
    vacc[d] = accuracy_score(labels, val_y) #calculating validation accuracy and storing it in dict
    time1 = time.time() # timer
    vtime[d] = time1-time0 #calculating validation prediction time & storing in dict 
    print('Val accuracy')
    print(vacc) #printing full validation accuracy dict
    print('pred time')
    print(vtime) #printing full validation prediction time dict
    time2 = time.time() #timer
    olabels = net.predict(val_oos_x) #predicting OOS validation data
    oacc[d] = accuracy_score(olabels, val_oos_y) #calculating OOS validation accuracy and storing in dict
    time3 = time.time() #timer
    otime[d]=time3-time2 #calculating OOS validation prediction time & storing in dict
    print('OOS Val Accuracy')
    print(oacc)# printing full OOS validation accuracy dict
    print('OOS pred time')
    print(otime) #printing full OOS validation prediction time dict

1
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0173[0m       [32m0.0033[0m        [35m5.0173[0m  2.7095
      2        [36m5.0173[0m       0.0033        5.0173  2.5825
      3        [36m5.0173[0m       0.0033        5.0173  2.5890
      4        5.0173       0.0033        5.0173  2.5863
      5        [36m5.0173[0m       0.0033        [35m5.0173[0m  2.5609
      6        5.0173       0.0033        5.0173  2.5750
      7        [36m5.0173[0m       0.0033        5.0173  2.5444
      8        [36m5.0173[0m       0.0033        5.0173  2.5716
      9        5.0173       0.0033        5.0173  2.5837
     10        [36m5.0173[0m       0.0033        5.0173  2.6356
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{1: 0.0035099337748344373}
Val accuracy
{1: 0.006333333333333333}
pred time
{1: 0.2524840831756592}
OOS Val Accuracy
{1: 0.0}
OOS pred time


In [42]:
learning_rates = [10, 5, 1, 0.5, 0.1, 0.01, 0.001, 0.0001] #looking at different learning rates
#emptying results dicts
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
for lr in learning_rates: #looping through different learning rates   
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=0
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr, #looping through different learning rates
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.SGD,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[lr] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[lr] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[d] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[lr] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[lr]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)

10
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0173[0m       [32m0.0169[0m        [35m5.0173[0m  2.7157
      2        [36m5.0173[0m       [32m0.0348[0m        [35m5.0172[0m  2.6195
      3        [36m5.0172[0m       [32m0.0540[0m        [35m5.0172[0m  2.6411
      4        [36m5.0172[0m       [32m0.0728[0m        [35m5.0172[0m  2.6060
      5        [36m5.0172[0m       [32m0.0954[0m        [35m5.0172[0m  2.5968
      6        [36m5.0172[0m       [32m0.1185[0m        [35m5.0172[0m  2.5800
      7        [36m5.0171[0m       [32m0.1348[0m        [35m5.0171[0m  2.5846
      8        [36m5.0171[0m       [32m0.1513[0m        [35m5.0171[0m  2.6166
      9        [36m5.0171[0m       [32m0.1705[0m        [35m5.0171[0m  2.6094
     10        [36m5.0171[0m       [32m0.1911[0m        [35m5.0171[0m  2.6303
Stopping since valid_loss has not improved 

      2        [36m5.0173[0m       0.0073        5.0173  2.6848
      3        [36m5.0173[0m       0.0073        5.0173  2.6724
      4        5.0173       0.0073        5.0173  2.6982
      5        [36m5.0173[0m       0.0073        5.0173  2.7871
      6        [36m5.0173[0m       0.0073        5.0173  2.7626
      7        [36m5.0173[0m       0.0073        5.0173  2.7045
      8        5.0173       0.0073        5.0173  2.6944
      9        [36m5.0173[0m       0.0073        5.0173  2.7003
     10        5.0173       0.0073        5.0173  2.6909
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{10: 0.21410596026490067, 5: 0.08827814569536424, 1: 0.01695364238410596, 0.5: 0.00814569536423841, 0.1: 0.004172185430463576, 0.01: 0.004503311258278146, 0.001: 0.008013245033112583}
Val accuracy
{10: 0.19633333333333333, 5: 0.084, 1: 0.018, 0.5: 0.009666666666666667, 0.1: 0.0033333333333333335, 0.01: 0.006666666666666667, 0.001: 0.007}
pred time

In [43]:
momentum = [10, 5, 1, 0.5, 0.1] #changing momentum applied 
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
for p in momentum:    #looping through momentum
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=0
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    optimizer__momentum=p, #looping through momentum values
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.SGD,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[p] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[p] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[d] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[p] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[p]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.0066[0m           nan  2.9037
      2           nan       0.0066           nan  2.8744
      3           nan       0.0066           nan  2.8993
      4           nan       0.0066           nan  2.8462
      5           nan       0.0066           nan  2.8437
      6           nan       0.0066           nan  2.8491
      7           nan       0.0066           nan  2.8397
      8           nan       0.0066           nan  2.8402
      9           nan       0.0066           nan  2.8399
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{10: 0.006622516556291391}
Val accuracy
{10: 0.006666666666666667}
pred time
{0.1: 0.24516987800598145}
OOS Val Accuracy
{10: 0.0}
OOS pred time
{10: 0.008492708206176758}
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  -

Not converging satisfactorily with SGD. Attempt using Adam optimizer, varying dropout rate.

In [44]:
dropouts = [1, 0.5, 0.1, 0.01, 0.001, 0] #dropouts to try
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
for d in dropouts:    #looping through dropouts
    print(d) #print dropout rate
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=d #dropout rate set to value in loop
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[d] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[d] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[d] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[d] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[d]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)

1
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0173[0m       [32m0.0053[0m        [35m5.0173[0m  3.7077
      2        [36m5.0173[0m       [32m0.0066[0m        [35m5.0173[0m  3.6635
      3        5.0173       0.0066        [35m5.0173[0m  3.5737
      4        [36m5.0173[0m       0.0066        5.0173  3.7169
      5        5.0173       0.0066        5.0173  3.8467
      6        5.0173       0.0066        [35m5.0173[0m  3.9625
      7        5.0173       0.0066        5.0173  3.8979
      8        5.0173       0.0066        5.0173  3.6462
      9        5.0173       0.0066        5.0173  3.6076
     10        5.0173       0.0066        5.0173  3.6912
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{1: 0.006622516556291391}
Val accuracy
{1: 0.006666666666666667}
pred time
{1: 0.24365901947021484}
OOS Val Accuracy
{1: 0.0}
OOS pred time
{1: 0.008

     12        [36m4.0427[0m       0.9225        4.1119  3.6072
     13        [36m4.0422[0m       0.9175        4.1157  3.5930
     14        4.0424       0.9185        4.1142  3.6085
     15        [36m4.0398[0m       0.9248        4.1110  3.6459
     16        4.0403       0.9099        4.1234  3.6318
     17        4.0435       0.9083        4.1214  3.5218
     18        4.0419       0.9083        4.1232  3.7210
     19        4.0425       0.9046        4.1272  3.5768
     20        4.0418       0.9083        4.1231  3.6162
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{1: 0.006622516556291391, 0.5: 0.9793377483443708, 0.1: 0.9733112582781457, 0.01: 0.9740397350993377, 0.001: 0.9733774834437086}
Val accuracy
{1: 0.006666666666666667, 0.5: 0.8776666666666667, 0.1: 0.8683333333333333, 0.01: 0.8613333333333333, 0.001: 0.874}
pred time
{1: 0.24365901947021484, 0.5: 0.27951478958129883, 0.1: 0.26523923873901367, 0.01: 0.2642199993133545, 0.001

This has improved, will retain Adam & dropout rate of 0.5. Now investigating Learning Rates.

In [50]:
learning_rates = [10, 5, 1, 0.5, 0.1, 0.01, 0.001, 1e-4, 1e-5 ,1e-6]
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
for lr in learning_rates:    
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=0.5
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[lr] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[lr] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[d] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[lr] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[lr]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)

10
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0221[0m       [32m0.0066[0m        [35m5.0220[0m  4.3490
      2        [36m5.0220[0m       0.0066        5.0220  4.3117
      3        5.0220       0.0066        5.0220  4.8271
      4        5.0220       0.0066        5.0220  3.9213
      5        5.0220       0.0066        5.0220  4.2449
      6        5.0220       0.0066        5.0220  4.0830
      7        5.0220       0.0066        5.0220  4.0942
      8        5.0220       0.0066        5.0220  4.3219
      9        5.0220       0.0066        5.0220  4.3452
     10        5.0220       0.0066        5.0220  4.3482
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{10: 0.006622516556291391}
Val accuracy
{10: 0.006666666666666667}
pred time
{0.001: 0.3723461627960205}
OOS Val Accuracy
{10: 0.0}
OOS pred time
{10: 0.01131582260131836}
5
  epoch    train_lo

      6        [36m4.0473[0m       0.9219        4.1131  3.9236
      7        [36m4.0455[0m       0.9262        [35m4.1096[0m  3.8869
      8        [36m4.0406[0m       [32m0.9301[0m        [35m4.1069[0m  3.9216
      9        [36m4.0389[0m       0.9291        [35m4.1062[0m  3.9415
     10        [36m4.0375[0m       0.9275        4.1071  4.1294
     11        4.0379       0.9268        4.1085  4.0735
     12        4.0379       0.9285        4.1062  4.0020
     13        [36m4.0370[0m       0.9195        4.1130  3.8344
     14        4.0378       0.9252        4.1087  3.6418
     15        4.0371       0.9248        4.1071  3.7036
     16        [36m4.0368[0m       0.9245        4.1077  3.7419
     17        4.0368       0.9215        4.1112  3.9115
     18        4.0370       0.9255        [35m4.1061[0m  3.9666
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{10: 0.006622516556291391, 5: 0.006622516556291391, 1: 0.0066225165

     92        [36m4.0309[0m       0.9377        4.0965  4.4242
     93        [36m4.0309[0m       0.9377        4.0966  4.4377
     94        [36m4.0309[0m       0.9384        [35m4.0962[0m  4.3801
     95        [36m4.0308[0m       0.9371        4.0965  4.4224
     96        4.0308       0.9384        4.0966  4.4281
     97        [36m4.0308[0m       0.9381        4.0966  4.3827
     98        [36m4.0308[0m       0.9391        4.0964  4.4118
     99        4.0308       0.9387        4.0965  4.4504
    100        [36m4.0308[0m       0.9394        4.0964  4.3556
    101        [36m4.0308[0m       0.9374        4.0966  4.3739
    102        4.0308       0.9387        4.0967  4.5497
    103        [36m4.0308[0m       0.9387        4.0966  4.3953
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{10: 0.006622516556291391, 5: 0.006622516556291391, 1: 0.006622516556291391, 0.5: 0.012781456953642384, 0.1: 0.47258278145695365, 0.01: 0.9788

     82        [36m4.0771[0m       0.9328        [35m4.1486[0m  3.6110
     83        [36m4.0759[0m       0.9325        [35m4.1477[0m  3.6947
     84        [36m4.0747[0m       0.9328        [35m4.1468[0m  3.6522
     85        [36m4.0740[0m       0.9325        [35m4.1458[0m  3.6171
     86        [36m4.0729[0m       [32m0.9331[0m        [35m4.1449[0m  3.6124
     87        [36m4.0721[0m       0.9328        [35m4.1440[0m  3.5807
     88        [36m4.0705[0m       0.9328        [35m4.1431[0m  3.5693
     89        [36m4.0693[0m       0.9331        [35m4.1422[0m  3.5716
     90        [36m4.0690[0m       [32m0.9341[0m        [35m4.1413[0m  3.5493
     91        [36m4.0675[0m       0.9341        [35m4.1404[0m  3.5730
     92        [36m4.0665[0m       [32m0.9344[0m        [35m4.1396[0m  3.5952
     93        [36m4.0659[0m       0.9344        [35m4.1387[0m  3.6197
     94        [36m4.0647[0m       [32m0.9348[0m        [35m4.1379[

    191        [36m4.0349[0m       0.9424        [35m4.1061[0m  3.9134
    192        [36m4.0347[0m       0.9424        [35m4.1060[0m  4.0978
    193        [36m4.0346[0m       0.9421        [35m4.1059[0m  3.8796
    194        4.0347       0.9421        [35m4.1059[0m  3.5725
    195        [36m4.0345[0m       0.9421        [35m4.1058[0m  3.7760
    196        4.0347       0.9427        [35m4.1057[0m  4.0846
    197        [36m4.0345[0m       0.9424        [35m4.1056[0m  3.9865
    198        [36m4.0345[0m       0.9414        [35m4.1055[0m  3.7321
    199        [36m4.0343[0m       0.9414        [35m4.1055[0m  3.9641
    200        4.0343       0.9417        [35m4.1054[0m  3.7564
    201        [36m4.0342[0m       0.9417        [35m4.1054[0m  3.8173
    202        4.0342       0.9421        [35m4.1052[0m  3.8850
    203        [36m4.0342[0m       0.9414        [35m4.1051[0m  3.6633
    204        [36m4.0341[0m       0.9414        [35m4.104

Val accuracy
{10: 0.006666666666666667, 5: 0.006666666666666667, 1: 0.006666666666666667, 0.5: 0.013333333333333334, 0.1: 0.44733333333333336, 0.01: 0.874, 0.001: 0.8963333333333333, 0.0001: 0.8983333333333333, 1e-05: 0.211, 1e-06: 0.016}
pred time
{0.001: 0.2431046962738037}
OOS Val Accuracy
{10: 0.0, 5: 0.0, 1: 0.0, 0.5: 0.0, 0.1: 0.0, 0.01: 0.07, 0.001: 0.11, 0.0001: 0.14, 1e-05: 0.0, 1e-06: 0.0}
OOS pred time
{10: 0.01131582260131836, 5: 0.008625030517578125, 1: 0.01005697250366211, 0.5: 0.007462978363037109, 0.1: 0.010442256927490234, 0.01: 0.009956121444702148, 0.001: 0.012646913528442383, 0.0001: 0.18695378303527832, 1e-05: 0.008339881896972656, 1e-06: 0.008316993713378906}


Validation accuracy very similar for learning rates of 0.001 and 0.0001. Former chosen due to faster training time. Investigating effect of number of neurons in hidden layer.

In [45]:
lr = 0.001 #using best learning rate from earlier
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_list = [100, 200, 400, 800, 1600] #hidden layer sizes
for hidden_dim in hidden_list:    #looping through hidden layer sizes
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim, #setting hidden layer size
                output_dim=output_dim,
                dropout=0.5
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[hidden_dim] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[hidden_dim] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[hidden_dim] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[hidden_dim]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0166[0m       [32m0.6139[0m        [35m5.0153[0m  1.4640
      2        [36m5.0072[0m       0.5146        [35m4.9842[0m  1.4421
      3        [36m4.8927[0m       0.5685        [35m4.7778[0m  1.5117
      4        [36m4.6625[0m       [32m0.7056[0m        [35m4.5625[0m  1.4323
      5        [36m4.4843[0m       [32m0.7795[0m        [35m4.4313[0m  1.6715
      6        [36m4.3751[0m       [32m0.8275[0m        [35m4.3504[0m  1.6004
      7        [36m4.3020[0m       [32m0.8589[0m        [35m4.2952[0m  1.5017
      8        [36m4.2513[0m       [32m0.8818[0m        [35m4.2584[0m  1.4168
      9        [36m4.2166[0m       [32m0.8964[0m        [35m4.2296[0m  1.4802
     10        [36m4.1852[0m       [32m0.9046[0m        [35m4.2073[0m  1.4061
     11        [36m4.1655[0m       [32m0.9103[0m      

      5        [36m4.2802[0m       [32m0.8811[0m        [35m4.2753[0m  2.1377
      6        [36m4.2122[0m       [32m0.8967[0m        [35m4.2296[0m  2.2624
      7        [36m4.1692[0m       [32m0.9119[0m        [35m4.1997[0m  2.1465
      8        [36m4.1415[0m       [32m0.9205[0m        [35m4.1801[0m  2.1145
      9        [36m4.1214[0m       [32m0.9232[0m        [35m4.1670[0m  2.0629
     10        [36m4.1077[0m       [32m0.9235[0m        [35m4.1579[0m  2.1666
     11        [36m4.0958[0m       [32m0.9285[0m        [35m4.1506[0m  2.2052
     12        [36m4.0858[0m       [32m0.9305[0m        [35m4.1451[0m  2.0893
     13        [36m4.0781[0m       [32m0.9331[0m        [35m4.1394[0m  2.0739
     14        [36m4.0700[0m       [32m0.9354[0m        [35m4.1346[0m  2.0583
     15        [36m4.0650[0m       [32m0.9358[0m        [35m4.1309[0m  2.1139
     16        [36m4.0608[0m       [32m0.9374[0m        [35m4.1279[0m 

     31        [36m4.0340[0m       0.9417        [35m4.1044[0m  3.4775
     32        [36m4.0338[0m       0.9421        [35m4.1041[0m  3.4732
     33        [36m4.0335[0m       0.9411        [35m4.1035[0m  3.4961
     34        [36m4.0332[0m       0.9411        [35m4.1032[0m  3.4818
     35        [36m4.0331[0m       0.9424        [35m4.1026[0m  3.4845
     36        [36m4.0329[0m       0.9427        [35m4.1022[0m  3.4852
     37        [36m4.0329[0m       0.9427        [35m4.1022[0m  3.4870
     38        [36m4.0329[0m       0.9417        [35m4.1015[0m  3.4766
     39        [36m4.0328[0m       0.9414        [35m4.1014[0m  3.4807
     40        [36m4.0326[0m       0.9414        [35m4.1011[0m  3.4810
     41        [36m4.0326[0m       0.9414        [35m4.1009[0m  3.5063
     42        [36m4.0325[0m       0.9421        4.1010  3.4088
     43        [36m4.0324[0m       0.9417        [35m4.1007[0m  3.4946
     44        [36m4.0323[0m    

     39        [36m4.0317[0m       0.9414        [35m4.0995[0m  6.2374
     40        [36m4.0316[0m       0.9401        [35m4.0993[0m  6.2264
     41        [36m4.0316[0m       0.9411        [35m4.0989[0m  6.2515
     42        4.0316       0.9401        [35m4.0989[0m  6.2312
     43        [36m4.0315[0m       0.9407        [35m4.0988[0m  6.2284
     44        4.0315       0.9407        [35m4.0988[0m  6.2701
     45        [36m4.0315[0m       0.9411        [35m4.0983[0m  6.2650
     46        4.0315       0.9421        4.0983  6.2162
     47        [36m4.0314[0m       0.9421        [35m4.0982[0m  6.1493
     48        [36m4.0314[0m       0.9417        4.0983  6.1465
     49        [36m4.0314[0m       0.9411        [35m4.0980[0m  6.2331
     50        [36m4.0312[0m       0.9417        [35m4.0978[0m  6.2109
     51        [36m4.0312[0m       0.9424        [35m4.0976[0m  6.2606
     52        4.0313       0.9424        [35m4.0974[0m  6.2269
    

     76        [36m4.0305[0m       0.9404        4.0957  11.0205
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{100: 0.9843708609271523, 200: 0.9852980132450331, 400: 0.9864238410596027, 800: 0.9868211920529801, 1600: 0.9867549668874173}
Val accuracy
{100: 0.896, 200: 0.899, 400: 0.9, 800: 0.901, 1600: 0.9003333333333333}
pred time
{100: 0.21036410331726074, 200: 0.1925978660583496, 400: 0.31163692474365234, 800: 0.362407922744751, 1600: 0.5692059993743896}
OOS Val Accuracy
{100: 0.09, 200: 0.12, 400: 0.12, 800: 0.12, 1600: 0.12}
OOS pred time
{100: 0.005872011184692383, 200: 0.0051860809326171875, 400: 0.01047205924987793, 800: 0.011768817901611328, 1600: 0.018018007278442383}


Very small increase in validation accuracy with increasing hidden layer size up to 800 (around 0.5% from 89.6% for 100 neurons), but significantly increasing time (near double). Will investigate effect of additional hidden layer, using same layer size for both layers to limit computation.

In [46]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_list = [100, 200, 400, 800] #hidden layer sizes
for hidden_dim in hidden_list:    #looping through hidden layer size
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=0.5
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.hidden2 = nn.Linear(hidden_dim, hidden_dim) #additional hiden layer
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = torch.tanh(self.hidden2(X)) #additional layer
            X = self.dropout(X) #additional dropout layer
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[hidden_dim] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[hidden_dim] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[hidden_dim] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[hidden_dim]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0170[0m       [32m0.2563[0m        [35m5.0161[0m  1.5456
      2        [36m4.9989[0m       [32m0.2695[0m        [35m4.9425[0m  1.4847
      3        [36m4.8555[0m       [32m0.4331[0m        [35m4.7242[0m  1.4527
      4        [36m4.6858[0m       [32m0.5921[0m        [35m4.5562[0m  1.5185
      5        [36m4.5525[0m       [32m0.6838[0m        [35m4.4358[0m  1.4512
      6        [36m4.4529[0m       [32m0.7308[0m        [35m4.3619[0m  1.4932
      7        [36m4.3932[0m       [32m0.7579[0m        [35m4.3205[0m  1.4904
      8        [36m4.3447[0m       [32m0.7801[0m        [35m4.2895[0m  1.4620
      9        [36m4.3109[0m       [32m0.7917[0m        [35m4.2715[0m  1.4609
     10        [36m4.2875[0m       [32m0.7967[0m        [35m4.2599[0m  1.4564
     11        [36m4.2694[0m       [3

     37        [36m4.0410[0m       0.9179        4.1172  2.2781
     38        4.0411       0.9175        4.1171  2.2547
     39        [36m4.0410[0m       0.9169        4.1169  2.2403
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{100: 0.9678145695364239, 200: 0.9784768211920529}
Val accuracy
{100: 0.873, 200: 0.885}
pred time
{100: 0.16866612434387207, 200: 0.21779298782348633}
OOS Val Accuracy
{100: 0.07, 200: 0.08}
OOS pred time
{100: 0.004477977752685547, 200: 0.006827116012573242}
0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0109[0m       [32m0.1705[0m        [35m4.9560[0m  4.1834
      2        [36m4.6944[0m       [32m0.7275[0m        [35m4.4184[0m  4.0486
      3        [36m4.3032[0m       [32m0.8589[0m        [35m4.2301[0m  4.2061
      4        [36m4.1745[0m       [32m0.9017[0m        [35m4.1690[0m  4.0524
      5   

Worse accuracy than for one layer for all layer sizes, so will use a single layers. As 100 neurons in a single layer performed quite similarly to larger layers, will investigate effect of smaller hidden layer to increase speed.

In [47]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_list = [25, 50, 75] #hidden layer size
for hidden_dim in hidden_list:    #looping for hidden layer size
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,#setting hidden layer size
                output_dim=output_dim,
                dropout=0.5
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[hidden_dim] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[hidden_dim] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[hidden_dim] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[hidden_dim]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0170[0m       [32m0.1593[0m        [35m5.0167[0m  1.2534
      2        [36m5.0157[0m       [32m0.3493[0m        [35m5.0145[0m  1.0968
      3        [36m5.0111[0m       [32m0.3834[0m        [35m5.0061[0m  1.1054
      4        [36m4.9911[0m       0.3073        [35m4.9709[0m  1.0619
      5        [36m4.9445[0m       [32m0.3970[0m        [35m4.9091[0m  1.0228
      6        [36m4.8859[0m       [32m0.4404[0m        [35m4.8360[0m  1.0695
      7        [36m4.8279[0m       [32m0.4745[0m        [35m4.7697[0m  1.0470
      8        [36m4.7768[0m       [32m0.5109[0m        [35m4.7119[0m  1.0754
      9        [36m4.7325[0m       [32m0.5315[0m        [35m4.6623[0m  1.1561
     10        [36m4.6959[0m       [32m0.5487[0m        [35m4.6220[0m  1.0389
     11        [36m4.6648[0m       [32m0.5649

    102        [36m4.1657[0m       [32m0.8997[0m        [35m4.1473[0m  1.0417
    103        [36m4.1646[0m       0.8997        [35m4.1466[0m  1.0497
    104        [36m4.1627[0m       [32m0.9007[0m        [35m4.1463[0m  1.0408
    105        [36m4.1615[0m       0.8993        [35m4.1460[0m  1.0436
    106        4.1628       0.8990        [35m4.1457[0m  1.0390
    107        [36m4.1581[0m       0.9003        [35m4.1453[0m  1.0346
    108        [36m4.1578[0m       0.8987        [35m4.1450[0m  1.0380
    109        [36m4.1574[0m       0.9003        [35m4.1445[0m  1.0430
    110        4.1603       0.9000        [35m4.1443[0m  1.0448
    111        [36m4.1567[0m       0.8997        [35m4.1441[0m  1.0347
    112        [36m4.1556[0m       0.9007        [35m4.1436[0m  1.2328
    113        [36m4.1544[0m       0.9007        [35m4.1433[0m  1.0396
    114        4.1554       [32m0.9010[0m        [35m4.1430[0m  1.0346
    115        4.1554    

     19        [36m4.1946[0m       [32m0.8811[0m        [35m4.2033[0m  1.1530
     20        [36m4.1885[0m       [32m0.8834[0m        [35m4.1971[0m  1.1529
     21        [36m4.1798[0m       [32m0.8848[0m        [35m4.1925[0m  1.1491
     22        [36m4.1739[0m       [32m0.8854[0m        [35m4.1883[0m  1.1520
     23        [36m4.1681[0m       [32m0.8861[0m        [35m4.1843[0m  1.1820
     24        [36m4.1629[0m       0.8848        [35m4.1808[0m  1.1494
     25        [36m4.1569[0m       [32m0.8877[0m        [35m4.1769[0m  1.1473
     26        [36m4.1522[0m       [32m0.8930[0m        [35m4.1724[0m  1.1567
     27        [36m4.1456[0m       [32m0.8967[0m        [35m4.1675[0m  1.1580
     28        [36m4.1410[0m       [32m0.8974[0m        [35m4.1642[0m  1.1575
     29        [36m4.1365[0m       [32m0.8997[0m        [35m4.1614[0m  1.1503
     30        [36m4.1325[0m       [32m0.9023[0m        [35m4.1588[0m  1.1540
 

      9        [36m4.2895[0m       [32m0.8483[0m        [35m4.2847[0m  1.3191
     10        [36m4.2563[0m       [32m0.8639[0m        [35m4.2581[0m  1.3010
     11        [36m4.2287[0m       [32m0.8715[0m        [35m4.2385[0m  1.3066
     12        [36m4.2075[0m       [32m0.8791[0m        [35m4.2229[0m  1.2926
     13        [36m4.1916[0m       [32m0.8897[0m        [35m4.2095[0m  1.3463
     14        [36m4.1756[0m       [32m0.8944[0m        [35m4.1974[0m  1.3234
     15        [36m4.1623[0m       [32m0.9013[0m        [35m4.1871[0m  1.3014
     16        [36m4.1498[0m       [32m0.9060[0m        [35m4.1779[0m  1.3060
     17        [36m4.1394[0m       [32m0.9116[0m        [35m4.1710[0m  1.3210
     18        [36m4.1311[0m       [32m0.9136[0m        [35m4.1646[0m  1.3380
     19        [36m4.1220[0m       [32m0.9185[0m        [35m4.1586[0m  1.3046
     20        [36m4.1159[0m       [32m0.9199[0m        [35m4.1537[0m 

Definitely worse than 100 neurons. Would 151 hidden neurons be beneficial, given output size?

In [48]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_list = [151]
for hidden_dim in hidden_list:    
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=0.5
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = torch.tanh(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[hidden_dim] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[hidden_dim] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[hidden_dim] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[hidden_dim]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0164[0m       [32m0.6871[0m        [35m5.0142[0m  1.8697
      2        [36m4.9909[0m       0.4632        [35m4.9248[0m  1.8631
      3        [36m4.7729[0m       0.6576        [35m4.6273[0m  1.8291
      4        [36m4.5074[0m       [32m0.7858[0m        [35m4.4366[0m  1.8232
      5        [36m4.3543[0m       [32m0.8497[0m        [35m4.3283[0m  1.8285
      6        [36m4.2655[0m       [32m0.8811[0m        [35m4.2671[0m  1.8184
      7        [36m4.2122[0m       [32m0.8934[0m        [35m4.2314[0m  1.8413
      8        [36m4.1786[0m       [32m0.9030[0m        [35m4.2078[0m  1.8570
      9        [36m4.1546[0m       [32m0.9139[0m        [35m4.1886[0m  1.8207
     10        [36m4.1356[0m       [32m0.9189[0m        [35m4.1739[0m  1.8298
     11        [36m4.1188[0m       [32m0.9225[0m      

No. 800 neurons has given best accuracy accuracy at the moment. Can this be improved with different activation functions for the hidden layer?

In [49]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            dropout=0.5
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, X, **kwargs):
        X = F.relu(self.hidden(X)) #ReLU activation function for hidden layer
        X = self.dropout(X)
        X = F.softmax(self.output(X), dim=-1)
        return X

net = NeuralNetClassifier(
module=CLINCModule,
lr=lr,
criterion=torch.nn.CrossEntropyLoss,
max_epochs=1000,
optimizer=torch.optim.Adam,
callbacks=[EarlyStopping(patience=10)],
)

net.fit(train_x, train_y)
tlabels = net.predict(train_x)
tacc[hidden_dim] = accuracy_score(tlabels, train_y)
print('training accuracy')
print(tacc)
time0 = time.time()
labels = net.predict(val_x)
vacc[hidden_dim] = accuracy_score(labels, val_y)
time1 = time.time()
vtime[hidden_dim] = time1-time0
print('Val accuracy')
print(vacc)
print('pred time')
print(vtime)
time2 = time.time()
olabels = net.predict(val_oos_x)
oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
time3 = time.time()
otime[hidden_dim]=time3-time2
print('OOS Val Accuracy')
print(oacc)
print('OOS pred time')
print(otime)
print('ReLU')

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0130[0m       [32m0.2997[0m        [35m4.9847[0m  6.2996
      2        [36m4.7681[0m       [32m0.6013[0m        [35m4.5543[0m  6.2730
      3        [36m4.4382[0m       [32m0.7411[0m        [35m4.3775[0m  6.2243
      4        [36m4.3059[0m       [32m0.8113[0m        [35m4.2919[0m  6.2451
      5        [36m4.2359[0m       [32m0.8305[0m        [35m4.2508[0m  6.2227
      6        [36m4.1975[0m       [32m0.8447[0m        [35m4.2265[0m  6.2319
      7        [36m4.1729[0m       [32m0.8632[0m        [35m4.2091[0m  6.2383
      8        [36m4.1431[0m       [32m0.8987[0m        [35m4.1805[0m  6.1958
      9        [36m4.1115[0m       [32m0.9116[0m        [35m4.1579[0m  6.2770
     10        [36m4.0936[0m       [32m0.9156[0m        [35m4.1471[0m  6.1687
     11        [36m4.0848[0m       [32m0.91

Slight improvement with ReLU (0.3%). Will attempt other functions to see whether they outperform ReLU, otherwise keep. Sigmoid:

In [50]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            dropout=0.5
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, X, **kwargs):
        X = F.sigmoid(self.hidden(X)) #sigmoid activation function
        X = self.dropout(X)
        X = F.softmax(self.output(X), dim=-1)
        return X

net = NeuralNetClassifier(
module=CLINCModule,
lr=lr,
criterion=torch.nn.CrossEntropyLoss,
max_epochs=1000,
optimizer=torch.optim.Adam,
callbacks=[EarlyStopping(patience=10)],
)

net.fit(train_x, train_y)
tlabels = net.predict(train_x)
tacc[hidden_dim] = accuracy_score(tlabels, train_y)
print('training accuracy')
print(tacc)
time0 = time.time()
labels = net.predict(val_x)
vacc[hidden_dim] = accuracy_score(labels, val_y)
time1 = time.time()
vtime[hidden_dim] = time1-time0
print('Val accuracy')
print(vacc)
print('pred time')
print(vtime)
time2 = time.time()
olabels = net.predict(val_oos_x)
oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
time3 = time.time()
otime[hidden_dim]=time3-time2
print('OOS Val Accuracy')
print(oacc)
print('OOS pred time')
print(otime)
print('sigmoid')



  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0180[0m       [32m0.0066[0m        [35m5.0166[0m  6.3491
      2        [36m5.0165[0m       [32m0.0232[0m        [35m5.0146[0m  6.3888
      3        [36m5.0135[0m       [32m0.0344[0m        [35m5.0097[0m  6.4449
      4        [36m5.0064[0m       [32m0.0576[0m        [35m4.9979[0m  6.3237
      5        [36m4.9930[0m       [32m0.0679[0m        [35m4.9813[0m  6.2590
      6        [36m4.9763[0m       [32m0.0772[0m        [35m4.9647[0m  6.3531
      7        [36m4.9621[0m       0.0768        [35m4.9568[0m  6.2890
      8        [36m4.9560[0m       [32m0.0778[0m        [35m4.9533[0m  6.2982
      9        [36m4.9528[0m       [32m0.0808[0m        [35m4.9503[0m  6.2021
     10        [36m4.9486[0m       [32m0.0874[0m        [35m4.9457[0m  6.3159
     11        [36m4.9428[0m       [32m0.0954[0m   

     99        [36m4.4890[0m       [32m0.5397[0m        [35m4.5048[0m  6.1875
    100        [36m4.4811[0m       [32m0.5430[0m        [35m4.4979[0m  6.1579
    101        [36m4.4782[0m       [32m0.5440[0m        [35m4.4963[0m  6.2313
    102        [36m4.4745[0m       [32m0.5450[0m        [35m4.4946[0m  6.2708
    103        [36m4.4709[0m       [32m0.5536[0m        [35m4.4893[0m  6.1081
    104        [36m4.4665[0m       0.5490        4.4894  6.2632
    105        [36m4.4626[0m       [32m0.5570[0m        [35m4.4842[0m  6.3135
    106        [36m4.4574[0m       [32m0.5649[0m        [35m4.4805[0m  6.5914
    107        [36m4.4534[0m       [32m0.5679[0m        [35m4.4777[0m  6.9639
    108        [36m4.4498[0m       [32m0.5712[0m        [35m4.4723[0m  6.9868
    109        [36m4.4454[0m       [32m0.5758[0m        [35m4.4709[0m  6.9394
    110        [36m4.4409[0m       [32m0.5772[0m        [35m4.4665[0m  7.0651
    111   

    203        [36m4.2315[0m       0.7553        4.2829  6.2813
    204        [36m4.2279[0m       [32m0.7646[0m        [35m4.2717[0m  6.3001
    205        [36m4.2254[0m       [32m0.7675[0m        4.2733  6.3484
    206        [36m4.2227[0m       0.7662        4.2731  5.8018
    207        [36m4.2219[0m       [32m0.7709[0m        [35m4.2678[0m  6.4278
    208        [36m4.2206[0m       [32m0.7715[0m        4.2684  6.2424
    209        [36m4.2201[0m       0.7692        4.2688  8.5317
    210        [36m4.2193[0m       0.7705        4.2686  8.0056
    211        [36m4.2175[0m       [32m0.7742[0m        [35m4.2653[0m  6.9680
    212        [36m4.2154[0m       0.7735        4.2653  6.4788
    213        [36m4.2140[0m       [32m0.7755[0m        [35m4.2637[0m  6.5176
    214        4.2142       [32m0.7785[0m        [35m4.2607[0m  6.5550
    215        [36m4.2134[0m       0.7748        4.2623  6.6556
    216        [36m4.2131[0m       0.7728

    313        [36m4.0988[0m       [32m0.8742[0m        [35m4.1639[0m  6.7759
    314        4.0990       0.8705        4.1657  7.0871
    315        [36m4.0987[0m       0.8735        [35m4.1638[0m  6.6992
    316        4.0989       0.8689        4.1655  6.3662
    317        [36m4.0981[0m       0.8722        [35m4.1631[0m  6.3561
    318        4.0985       0.8712        4.1636  6.3542
    319        4.0986       [32m0.8748[0m        [35m4.1618[0m  6.3413
    320        [36m4.0980[0m       0.8712        [35m4.1617[0m  6.5516
    321        [36m4.0979[0m       0.8719        4.1627  6.5108
    322        4.0981       0.8722        [35m4.1611[0m  6.7265
    323        4.0980       0.8738        4.1612  6.6053
    324        4.0980       0.8705        4.1625  6.6099
    325        4.0981       [32m0.8768[0m        [35m4.1605[0m  6.6186
    326        4.0979       0.8725        4.1624  6.5841
    327        [36m4.0971[0m       0.8752        4.1613  6.7656
 

In [51]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            dropout=0.5
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, X, **kwargs):
        X = self.hidden(X) #no activation function (linear)
        X = self.dropout(X)
        X = F.softmax(self.output(X), dim=-1)
        return X

net = NeuralNetClassifier(
module=CLINCModule,
lr=lr,
criterion=torch.nn.CrossEntropyLoss,
max_epochs=1000,
optimizer=torch.optim.Adam,
callbacks=[EarlyStopping(patience=10)],
)

net.fit(train_x, train_y)
tlabels = net.predict(train_x)
tacc[hidden_dim] = accuracy_score(tlabels, train_y)
print('training accuracy')
print(tacc)
time0 = time.time()
labels = net.predict(val_x)
vacc[hidden_dim] = accuracy_score(labels, val_y)
time1 = time.time()
vtime[hidden_dim] = time1-time0
print('Val accuracy')
print(vacc)
print('pred time')
print(vtime)
time2 = time.time()
olabels = net.predict(val_oos_x)
oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
time3 = time.time()
otime[hidden_dim]=time3-time2
print('OOS Val Accuracy')
print(oacc)
print('OOS pred time')
print(otime)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0089[0m       [32m0.5252[0m        [35m4.9527[0m  6.3222
      2        [36m4.6487[0m       [32m0.8070[0m        [35m4.3973[0m  6.4461
      3        [36m4.2653[0m       [32m0.8957[0m        [35m4.2314[0m  6.8577
      4        [36m4.1554[0m       [32m0.9199[0m        [35m4.1781[0m  6.8629
      5        [36m4.1109[0m       [32m0.9288[0m        [35m4.1555[0m  6.5649
      6        [36m4.0846[0m       [32m0.9328[0m        [35m4.1418[0m  6.4810
      7        [36m4.0674[0m       [32m0.9394[0m        [35m4.1323[0m  6.7712
      8        [36m4.0572[0m       0.9394        [35m4.1263[0m  6.4031
      9        [36m4.0510[0m       [32m0.9414[0m        [35m4.1221[0m  6.3516
     10        [36m4.0465[0m       0.9411        [35m4.1194[0m  6.3315
     11        [36m4.0436[0m       [32m0.9427[0m        [35

ReLU found to be best. Re-iterating through variables previously optimised for tanh activation. Dropouts:

In [52]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800
dropout_list = [0.75, 0.5, 0.25, 0.1, 0.01] #adjusting dropout rate
for dropout in dropout_list:    #looping for dropouts
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=dropout #setting dropout rate
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = F.relu(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[dropout] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[dropout] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[dropout] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[dropout] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[dropout]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0158[0m       [32m0.4315[0m        [35m5.0101[0m  6.6061
      2        [36m4.9120[0m       [32m0.4679[0m        [35m4.7299[0m  6.5239
      3        [36m4.6053[0m       [32m0.6715[0m        [35m4.4840[0m  6.4711
      4        [36m4.4244[0m       [32m0.7596[0m        [35m4.3682[0m  6.2988
      5        [36m4.3288[0m       [32m0.7964[0m        [35m4.3018[0m  6.3068
      6        [36m4.2741[0m       [32m0.8225[0m        [35m4.2652[0m  6.3612
      7        [36m4.2322[0m       [32m0.8543[0m        [35m4.2316[0m  6.3762
      8        [36m4.1990[0m       [32m0.8689[0m        [35m4.2087[0m  6.3063
      9        [36m4.1730[0m       [32m0.8798[0m        [35m4.1914[0m  6.2668
     10        [36m4.1551[0m       [32m0.8987[0m        [35m4.1729[0m  6.2441
     11        [36m4.1317[0m       [3

      6        [36m4.1918[0m       [32m0.8583[0m        [35m4.2205[0m  6.7897
      7        [36m4.1625[0m       [32m0.8778[0m        [35m4.1984[0m  6.3732
      8        [36m4.1357[0m       [32m0.8947[0m        [35m4.1771[0m  6.6081
      9        [36m4.1153[0m       [32m0.9036[0m        [35m4.1650[0m  6.4892
     10        [36m4.0983[0m       [32m0.9096[0m        [35m4.1528[0m  6.6590
     11        [36m4.0899[0m       [32m0.9126[0m        [35m4.1469[0m  6.5947
     12        [36m4.0856[0m       0.9113        [35m4.1456[0m  6.4069
     13        [36m4.0804[0m       [32m0.9156[0m        [35m4.1404[0m  6.2320
     14        [36m4.0745[0m       0.9146        4.1415  6.1545
     15        [36m4.0693[0m       [32m0.9212[0m        [35m4.1359[0m  6.1887
     16        [36m4.0600[0m       [32m0.9262[0m        [35m4.1274[0m  6.1672
     17        [36m4.0554[0m       [32m0.9321[0m        [35m4.1217[0m  6.1663
     18        [36

     34        [36m4.0317[0m       0.9411        4.1002  6.3346
     35        [36m4.0317[0m       0.9421        [35m4.0996[0m  6.3608
     36        [36m4.0315[0m       0.9407        4.0997  6.2042
     37        [36m4.0315[0m       0.9417        [35m4.0991[0m  6.3066
     38        [36m4.0314[0m       0.9427        [35m4.0987[0m  6.3781
     39        [36m4.0313[0m       0.9417        4.0987  6.3579
     40        [36m4.0313[0m       0.9414        [35m4.0982[0m  6.4128
     41        4.0313       0.9427        4.0983  6.2995
     42        [36m4.0312[0m       [32m0.9434[0m        [35m4.0981[0m  6.4869
     43        [36m4.0311[0m       [32m0.9444[0m        [35m4.0975[0m  6.6551
     44        [36m4.0311[0m       0.9430        [35m4.0972[0m  6.7829
     45        4.0311       0.9417        4.0973  6.8611
     46        [36m4.0310[0m       0.9430        [35m4.0971[0m  6.5274
     47        [36m4.0309[0m       0.9434        4.0973  6.2378
    

     70        4.0305       0.9407        4.0958  6.2372
     71        4.0306       0.9417        4.0955  6.2266
     72        [36m4.0304[0m       0.9424        4.0956  6.2432
     73        [36m4.0304[0m       0.9411        4.0959  6.2296
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{0.75: 0.9866887417218543, 0.5: 0.9866887417218543, 0.25: 0.9873509933774834, 0.1: 0.9871523178807947}
Val accuracy
{0.75: 0.9066666666666666, 0.5: 0.9053333333333333, 0.25: 0.9036666666666666, 0.1: 0.9056666666666666}
pred time
{0.75: 0.49721789360046387, 0.5: 0.42409491539001465, 0.25: 0.37950801849365234, 0.1: 0.3557312488555908}
OOS Val Accuracy
{0.75: 0.18, 0.5: 0.2, 0.25: 0.27, 0.1: 0.27}
OOS pred time
{0.75: 0.014636039733886719, 0.5: 0.014017105102539062, 0.25: 0.012434005737304688, 0.1: 0.012111186981201172}
0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0026

In [53]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800
dropout_list = [1, 0.9, 0.8, 0] #some further values for dropout
for dropout in dropout_list:    
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=dropout
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = F.relu(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[dropout] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[dropout] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[dropout] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[dropout] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[dropout]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0173[0m       [32m0.0070[0m        [35m5.0173[0m  6.2591
      2        [36m5.0173[0m       0.0060        5.0173  6.2635
      3        [36m5.0173[0m       [32m0.0073[0m        5.0173  6.2483
      4        [36m5.0173[0m       0.0063        5.0173  6.2667
      5        5.0173       0.0066        5.0173  6.2426
      6        5.0173       0.0066        5.0173  6.2644
      7        5.0173       0.0066        5.0173  6.2378
      8        5.0173       0.0066        5.0173  6.2465
      9        [36m5.0173[0m       0.0066        [35m5.0173[0m  6.2433
     10        [36m5.0173[0m       0.0066        [35m5.0173[0m  6.2304
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{1: 0.006622516556291391}
Val accuracy
{1: 0.006666666666666667}
pred time
{1: 0.37184715270996094}
OOS Val Accuracy
{1: 0.0}
O

     97        4.0419       0.9417        [35m4.0944[0m  6.5697
     98        4.0419       0.9421        [35m4.0941[0m  6.7414
     99        [36m4.0407[0m       0.9437        [35m4.0941[0m  6.6712
    100        [36m4.0404[0m       [32m0.9440[0m        4.0942  6.7380
    101        4.0408       0.9434        4.0941  6.5589
    102        4.0407       [32m0.9444[0m        4.0941  6.6832
    103        [36m4.0402[0m       0.9427        4.0941  6.6383
    104        [36m4.0396[0m       0.9427        [35m4.0938[0m  6.6133
    105        [36m4.0395[0m       0.9404        4.0940  6.6393
    106        4.0399       0.9414        [35m4.0932[0m  6.7592
    107        4.0396       0.9414        4.0935  6.7308
    108        4.0402       0.9421        [35m4.0930[0m  6.6406
    109        4.0396       0.9417        4.0936  6.9146
    110        4.0397       0.9440        4.0936  6.6158
    111        [36m4.0394[0m       0.9414        4.0938  6.5966
    112        [36

Val accuracy
{1: 0.006666666666666667, 0.9: 0.9053333333333333, 0.8: 0.9046666666666666}
pred time
{1: 0.37184715270996094, 0.9: 0.3441929817199707, 0.8: 0.36145877838134766}
OOS Val Accuracy
{1: 0.0, 0.9: 0.21, 0.8: 0.2}
OOS pred time
{1: 0.01224207878112793, 0.9: 0.01036524772644043, 0.8: 0.01233983039855957}
0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0019[0m       [32m0.3301[0m        [35m4.8913[0m  6.1668
      2        [36m4.6103[0m       [32m0.6781[0m        [35m4.4432[0m  6.1750
      3        [36m4.3458[0m       [32m0.7884[0m        [35m4.3179[0m  6.2022
      4        [36m4.2375[0m       [32m0.8222[0m        [35m4.2582[0m  6.2215
      5        [36m4.1896[0m       [32m0.8384[0m        [35m4.2339[0m  6.2444
      6        [36m4.1660[0m       [32m0.8606[0m        [35m4.2146[0m  6.2118
      7        [36m4.1396[0m       [32m0.8825[0m        [3

    104        [36m4.0302[0m       0.9421        [35m4.0930[0m  6.1516
    105        [36m4.0302[0m       0.9421        [35m4.0930[0m  6.1369
    106        [36m4.0302[0m       0.9421        [35m4.0929[0m  6.2087
    107        [36m4.0302[0m       0.9424        [35m4.0929[0m  6.1217
    108        [36m4.0302[0m       0.9424        [35m4.0928[0m  6.1294
    109        [36m4.0302[0m       0.9427        [35m4.0928[0m  6.2150
    110        [36m4.0302[0m       0.9424        [35m4.0927[0m  6.1270
    111        [36m4.0302[0m       0.9424        [35m4.0927[0m  6.1530
    112        [36m4.0302[0m       0.9417        [35m4.0926[0m  6.1249
    113        [36m4.0302[0m       0.9417        [35m4.0926[0m  6.1259
    114        [36m4.0302[0m       0.9414        [35m4.0925[0m  6.1312
    115        [36m4.0302[0m       0.9417        [35m4.0925[0m  6.1283
    116        [36m4.0302[0m       0.9414        [35m4.0925[0m  6.1311
    117        [36m4.030

0.75 found to be most accurate, but not much in it. Will adjust learning rate adjustment by factor of 10 either way:

In [54]:
learning_rates = [0.01, 0.0001]
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800
dropout = 0.75
for lr in learning_rates:    
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=dropout
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = F.relu(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[lr] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[lr] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[lr] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[lr] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[lr]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.01
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m4.7507[0m       [32m0.7474[0m        [35m4.3177[0m  6.5751
      2        [36m4.2561[0m       [32m0.8609[0m        [35m4.1816[0m  6.1611
      3        [36m4.1629[0m       [32m0.8831[0m        [35m4.1560[0m  6.1635
      4        [36m4.1261[0m       [32m0.9063[0m        [35m4.1288[0m  6.2263
      5        [36m4.1001[0m       [32m0.9225[0m        [35m4.1133[0m  6.1903
      6        [36m4.0816[0m       [32m0.9242[0m        [35m4.1082[0m  6.2384
      7        [36m4.0693[0m       [32m0.9248[0m        [35m4.1080[0m  6.2755
      8        [36m4.0661[0m       [32m0.9275[0m        [35m4.1060[0m  6.1246
      9        [36m4.0610[0m       [32m0.9281[0m        [35m4.1045[0m  6.2033
     10        [36m4.0602[0m       [32m0.9298[0m        [35m4.1043[0m  6.2742
     11        [36m4.0562[0m       0.92

     69        [36m4.2469[0m       [32m0.8238[0m        [35m4.2732[0m  6.1097
     70        [36m4.2434[0m       [32m0.8255[0m        [35m4.2695[0m  6.1337
     71        [36m4.2416[0m       [32m0.8265[0m        [35m4.2666[0m  6.1335
     72        [36m4.2362[0m       0.8265        [35m4.2639[0m  6.2552
     73        [36m4.2341[0m       [32m0.8281[0m        [35m4.2616[0m  6.0734
     74        [36m4.2316[0m       0.8281        [35m4.2592[0m  6.1118
     75        [36m4.2277[0m       [32m0.8291[0m        [35m4.2569[0m  6.2110
     76        [36m4.2262[0m       [32m0.8305[0m        [35m4.2544[0m  6.1312
     77        [36m4.2221[0m       [32m0.8354[0m        [35m4.2517[0m  6.1991
     78        [36m4.2196[0m       [32m0.8364[0m        [35m4.2496[0m  6.1359
     79        [36m4.2167[0m       [32m0.8368[0m        [35m4.2477[0m  6.1426
     80        [36m4.2151[0m       [32m0.8394[0m        [35m4.2455[0m  6.1577
     81   

    172        [36m4.0836[0m       0.9185        [35m4.1401[0m  6.2262
    173        [36m4.0832[0m       0.9179        [35m4.1396[0m  6.3218
    174        [36m4.0822[0m       0.9189        [35m4.1393[0m  6.2505
    175        [36m4.0819[0m       0.9185        [35m4.1390[0m  6.2359
    176        [36m4.0811[0m       [32m0.9195[0m        [35m4.1386[0m  6.2313
    177        [36m4.0809[0m       0.9192        [35m4.1382[0m  6.2133
    178        4.0811       [32m0.9199[0m        [35m4.1378[0m  6.2251
    179        [36m4.0793[0m       0.9195        [35m4.1374[0m  6.5190
    180        [36m4.0792[0m       0.9192        [35m4.1371[0m  6.4926
    181        [36m4.0792[0m       [32m0.9205[0m        [35m4.1367[0m  6.3302
    182        [36m4.0784[0m       0.9202        [35m4.1364[0m  6.3319
    183        4.0785       0.9199        [35m4.1362[0m  6.3877
    184        [36m4.0779[0m       0.9205        [35m4.1358[0m  6.3466
    185        

    284        4.0504       0.9318        4.1130  6.7190
    285        [36m4.0496[0m       0.9315        [35m4.1128[0m  6.5241
    286        4.0500       0.9311        [35m4.1125[0m  6.7529
    287        4.0497       0.9308        [35m4.1124[0m  6.4807
    288        4.0497       0.9308        [35m4.1123[0m  6.5640
    289        4.0497       0.9311        [35m4.1122[0m  6.5109
    290        4.0498       0.9308        4.1122  6.5659
    291        4.0497       0.9305        [35m4.1120[0m  6.4801
    292        4.0497       0.9315        [35m4.1118[0m  6.5687
    293        [36m4.0492[0m       0.9318        [35m4.1117[0m  6.4580
    294        4.0494       [32m0.9325[0m        [35m4.1115[0m  6.5823
    295        [36m4.0492[0m       0.9325        4.1116  6.5719
    296        [36m4.0490[0m       0.9325        [35m4.1114[0m  6.5568
    297        4.0491       [32m0.9328[0m        4.1114  6.5844
    298        [36m4.0488[0m       0.9321        [35m4.

learning rate of 0.001 remains most accurate. Attempting SGD as optimizer again to see if that can improve on where we are:

In [56]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800
dropout = 0.75
momentum_list = [10, 1, 0.1, 0.01, 0.001, 0.0001] #assorted momentums for this learning rate
for p in momentum_list:    
    print(lr)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=dropout
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = F.relu(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.SGD, #using SGD as optimizer
    optimizer__momentum = p, #looping through momentums
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[p] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[p] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[p] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[p] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[p]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.0066[0m           nan  5.0247
      2           nan       0.0066           nan  5.0387
      3           nan       0.0066           nan  4.9342
      4           nan       0.0066           nan  4.8973
      5           nan       0.0066           nan  4.9366
      6           nan       0.0066           nan  4.9376
      7           nan       0.0066           nan  4.9249
      8           nan       0.0066           nan  4.9357
      9           nan       0.0066           nan  5.0245
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{10: 0.006622516556291391}
Val accuracy
{10: 0.006666666666666667}
pred time
{10: 0.35844969749450684}
OOS Val Accuracy
{10: 0.0}
OOS pred time
{10: 0.012405633926391602}
0.001
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ----

Not converging. Adam retained. Investigating hidden layer size again to confirm 800 optimal, including much larger hidden layer:

In [57]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim_list = [50, 100, 200, 400, 800, 1600, 3200, 6400, 12800] #hidden layer size
dropout = 0.75
for hidden_dim in hidden_dim_list:    #looping for hidden layer size
    print(hidden_dim)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim, #setting hidden layer size
                output_dim=output_dim,
                dropout=dropout
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = F.relu(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam, #Adam again - momentum has been removed accordingly.
    callbacks=[EarlyStopping(patience=10)],
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[hidden_dim] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[hidden_dim] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[hidden_dim] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[hidden_dim]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

50
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0171[0m       [32m0.1003[0m        [35m5.0168[0m  1.2308
      2        [36m5.0161[0m       [32m0.3182[0m        [35m5.0151[0m  1.1453
      3        [36m5.0126[0m       0.2010        [35m5.0086[0m  1.1499
      4        [36m4.9988[0m       0.2308        [35m4.9795[0m  1.1425
      5        [36m4.9725[0m       0.2950        [35m4.9310[0m  1.1498
      6        [36m4.9466[0m       [32m0.3500[0m        [35m4.8811[0m  1.2321
      7        [36m4.9217[0m       [32m0.3828[0m        [35m4.8349[0m  1.1786
      8        [36m4.8983[0m       [32m0.4262[0m        [35m4.7910[0m  1.2794
      9        [36m4.8830[0m       [32m0.4583[0m        [35m4.7538[0m  1.3499
     10        [36m4.8646[0m       [32m0.4917[0m        [35m4.7219[0m  1.1764
     11        [36m4.8473[0m       [32m0.5169[0m        [35m4.690

    102        [36m4.5083[0m       0.8411        [35m4.2172[0m  1.2439
    103        4.5121       [32m0.8421[0m        [35m4.2159[0m  1.2251
    104        4.5134       0.8417        [35m4.2147[0m  1.2163
    105        4.5107       [32m0.8424[0m        [35m4.2137[0m  1.2485
    106        4.5149       [32m0.8430[0m        [35m4.2122[0m  1.2390
    107        4.5093       [32m0.8434[0m        [35m4.2115[0m  1.2381
    108        4.5112       [32m0.8437[0m        [35m4.2107[0m  1.2478
    109        [36m4.5036[0m       0.8427        [35m4.2100[0m  1.3385
    110        4.5125       0.8434        [35m4.2099[0m  1.3001
    111        [36m4.5014[0m       [32m0.8440[0m        [35m4.2091[0m  1.2332
    112        4.5058       0.8437        [35m4.2084[0m  1.2357
    113        [36m4.4964[0m       [32m0.8467[0m        [35m4.2074[0m  1.2409
    114        4.5010       0.8457        [35m4.2069[0m  1.2552
    115        4.4974       0.8467        4

    220        4.4140       0.8629        4.1789  1.2701
    221        4.4147       0.8632        4.1789  1.2620
    222        4.4166       0.8613        4.1791  1.2601
    223        4.4100       0.8619        4.1793  1.2705
    224        4.4191       0.8609        4.1795  1.2652
    225        4.4098       0.8609        4.1793  1.2639
    226        4.4099       0.8623        4.1793  1.2722
    227        4.4113       0.8619        4.1794  1.2683
    228        4.4084       0.8606        4.1791  1.2691
Stopping since valid_loss has not improved in the last 10 epochs.
training accuracy
{50: 0.9144370860927152}
Val accuracy
{50: 0.8266666666666667}
pred time
{50: 0.1441209316253662}
OOS Val Accuracy
{50: 0.04}
OOS pred time
{50: 0.0038161277770996094}
100
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0170[0m       [32m0.2689[0m        [35m5.0165[0m  1.4337
      2        [36m5.0149[0m   

     93        [36m4.1908[0m       [32m0.8834[0m        [35m4.1580[0m  1.5022
     94        4.1923       [32m0.8848[0m        [35m4.1573[0m  1.4859
     95        4.1918       0.8848        [35m4.1567[0m  1.4935
     96        [36m4.1887[0m       0.8834        [35m4.1564[0m  1.4932
     97        [36m4.1886[0m       0.8848        [35m4.1557[0m  1.4913
     98        4.1898       0.8838        4.1558  1.4883
     99        [36m4.1853[0m       0.8828        [35m4.1551[0m  1.4912
    100        [36m4.1853[0m       0.8821        [35m4.1550[0m  1.5107
    101        [36m4.1830[0m       0.8828        [35m4.1544[0m  1.4851
    102        [36m4.1819[0m       0.8844        [35m4.1538[0m  1.6649
    103        [36m4.1819[0m       0.8848        [35m4.1537[0m  1.5060
    104        [36m4.1774[0m       0.8844        [35m4.1529[0m  1.5476
    105        [36m4.1771[0m       [32m0.8861[0m        [35m4.1510[0m  1.4912
    106        4.1784       [32m0

     44        [36m4.1112[0m       [32m0.9113[0m        [35m4.1337[0m  2.2582
     45        [36m4.1104[0m       0.9113        [35m4.1329[0m  2.1436
     46        [36m4.1095[0m       [32m0.9139[0m        [35m4.1322[0m  2.1699
     47        [36m4.1066[0m       0.9139        [35m4.1319[0m  2.2201
     48        [36m4.1041[0m       [32m0.9159[0m        [35m4.1303[0m  2.2653
     49        [36m4.1006[0m       0.9139        [35m4.1296[0m  2.2250
     50        [36m4.0974[0m       0.9129        [35m4.1283[0m  2.1840
     51        4.0977       [32m0.9162[0m        [35m4.1273[0m  2.2352
     52        [36m4.0953[0m       [32m0.9189[0m        [35m4.1247[0m  2.3422
     53        [36m4.0921[0m       [32m0.9209[0m        [35m4.1226[0m  2.5611
     54        [36m4.0895[0m       [32m0.9222[0m        [35m4.1217[0m  2.3258
     55        [36m4.0878[0m       0.9209        [35m4.1214[0m  2.2840
     56        4.0884       0.9205        [35

     18        [36m4.1284[0m       [32m0.9109[0m        [35m4.1496[0m  3.5247
     19        [36m4.1203[0m       [32m0.9129[0m        [35m4.1448[0m  3.5568
     20        [36m4.1127[0m       [32m0.9136[0m        [35m4.1421[0m  3.5397
     21        [36m4.1086[0m       0.9136        [35m4.1397[0m  3.5517
     22        [36m4.1033[0m       [32m0.9199[0m        [35m4.1355[0m  3.5539
     23        [36m4.0992[0m       [32m0.9222[0m        [35m4.1330[0m  3.5776
     24        [36m4.0933[0m       0.9205        [35m4.1318[0m  3.5672
     25        [36m4.0897[0m       0.9212        [35m4.1299[0m  3.5515
     26        [36m4.0876[0m       [32m0.9235[0m        [35m4.1279[0m  3.6154
     27        [36m4.0837[0m       0.9232        [35m4.1272[0m  3.7022
     28        [36m4.0824[0m       0.9212        [35m4.1265[0m  3.8041
     29        [36m4.0798[0m       0.9219        [35m4.1250[0m  3.8737
     30        [36m4.0753[0m       [32m0.92

     19        [36m4.0673[0m       [32m0.9311[0m        [35m4.1209[0m  6.3732
     20        [36m4.0629[0m       [32m0.9351[0m        [35m4.1187[0m  6.2594
     21        [36m4.0581[0m       0.9348        [35m4.1159[0m  6.2065
     22        [36m4.0552[0m       [32m0.9391[0m        [35m4.1132[0m  6.2833
     23        [36m4.0514[0m       [32m0.9397[0m        [35m4.1109[0m  6.2887
     24        [36m4.0489[0m       [32m0.9427[0m        [35m4.1089[0m  6.4198
     25        [36m4.0472[0m       0.9421        [35m4.1084[0m  6.4481
     26        [36m4.0460[0m       0.9427        [35m4.1067[0m  6.3913
     27        [36m4.0442[0m       0.9421        [35m4.1063[0m  6.2536
     28        [36m4.0434[0m       [32m0.9447[0m        [35m4.1044[0m  6.3409
     29        [36m4.0422[0m       0.9437        [35m4.1039[0m  6.3926
     30        [36m4.0407[0m       0.9417        [35m4.1034[0m  6.3065
     31        4.0407       0.9427        [35

     42        4.0324       0.9470        [35m4.0943[0m  11.1228
     43        [36m4.0320[0m       0.9464        4.0947  11.3458
     44        4.0323       0.9444        4.0948  11.1179
     45        4.0321       0.9450        4.0948  11.1263
     46        [36m4.0320[0m       0.9454        [35m4.0941[0m  11.1258
     47        4.0320       0.9440        4.0942  11.1458
     48        [36m4.0320[0m       0.9467        [35m4.0931[0m  11.2284
     49        [36m4.0318[0m       [32m0.9480[0m        [35m4.0929[0m  11.1924
     50        [36m4.0318[0m       0.9457        4.0934  11.1937
     51        [36m4.0316[0m       0.9460        4.0933  11.0919
     52        4.0317       0.9457        4.0935  11.1520
     53        4.0319       0.9460        4.0937  11.1545
     54        [36m4.0315[0m       0.9454        4.0938  11.1866
     55        4.0317       0.9450        4.0943  11.2692
     56        4.0315       0.9470        4.0938  11.3958
     57        [36m4.

      8        [36m4.0458[0m       [32m0.9381[0m        [35m4.1128[0m  38.6286
      9        [36m4.0406[0m       [32m0.9397[0m        [35m4.1117[0m  36.8542
     10        [36m4.0383[0m       [32m0.9401[0m        [35m4.1078[0m  36.0486
     11        [36m4.0364[0m       [32m0.9421[0m        [35m4.1055[0m  36.7903
     12        [36m4.0356[0m       0.9417        [35m4.1045[0m  39.0225
     13        [36m4.0348[0m       [32m0.9424[0m        [35m4.1027[0m  39.1421
     14        [36m4.0341[0m       [32m0.9437[0m        [35m4.1021[0m  39.2116
     15        [36m4.0338[0m       [32m0.9447[0m        [35m4.1012[0m  40.0323
     16        [36m4.0336[0m       0.9440        4.1015  40.0047
     17        [36m4.0331[0m       0.9414        4.1028  39.6398
     18        [36m4.0330[0m       0.9424        [35m4.1004[0m  39.2430
     19        [36m4.0327[0m       0.9444        [35m4.0996[0m  37.7536
     20        [36m4.0325[0m       0.9421

Decrease in performance after 800. Experimenting with additional hidden layer:

In [58]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim_list = [100,200,400,800,1600] #size for both hidden layers
dropout = 0.75
for hidden_dim in hidden_dim_list:        
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim,
                output_dim=output_dim,
                dropout=dropout
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim) 
            self.hidden2 = nn.Linear(hidden_dim, hidden_dim) #hidden layer 2


            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = F.relu(self.hidden(X))
            X = self.dropout(X)
            X = F.relu(self.hidden2(X)) #hidden layer 2
            X = self.dropout(X) #dropout 2
            X = F.softmax(self.output(X), dim=-1)
            return X

    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam,
    callbacks=[EarlyStopping(patience=10)],
    )

    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[hidden_dim] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[hidden_dim] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[hidden_dim] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[hidden_dim]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0173[0m       [32m0.0325[0m        [35m5.0172[0m  1.5052
      2        [36m5.0170[0m       [32m0.0500[0m        [35m5.0165[0m  1.4924
      3        [36m5.0140[0m       0.0301        [35m5.0076[0m  1.5714
      4        [36m5.0031[0m       [32m0.0788[0m        [35m4.9890[0m  1.5027
      5        [36m4.9908[0m       [32m0.1093[0m        [35m4.9624[0m  1.5579
      6        [36m4.9788[0m       [32m0.1215[0m        [35m4.9366[0m  1.5185
      7        [36m4.9692[0m       [32m0.1401[0m        [35m4.9184[0m  1.4710
      8        [36m4.9601[0m       [32m0.1629[0m        [35m4.9013[0m  1.5335
      9        [36m4.9523[0m       [32m0.1904[0m        [35m4.8836[0m  1.5409
     10        [36m4.9459[0m       [32m0.2175[0m        [35m4.8679[0m  1.5013
     11        [36m4.9371[0m       [32m0.2401[0m   

    105        4.6772       0.7139        [35m4.3236[0m  1.7308
    106        [36m4.6695[0m       [32m0.7179[0m        [35m4.3224[0m  1.9601
    107        4.6753       0.7169        [35m4.3221[0m  1.6860
    108        4.6763       0.7169        [35m4.3213[0m  1.6749
    109        4.6738       [32m0.7192[0m        [35m4.3196[0m  1.6412
    110        4.6723       [32m0.7202[0m        [35m4.3196[0m  1.6298
    111        [36m4.6664[0m       [32m0.7205[0m        [35m4.3185[0m  1.6467
    112        4.6708       [32m0.7209[0m        [35m4.3181[0m  1.6816
    113        4.6725       [32m0.7222[0m        [35m4.3176[0m  1.6435
    114        [36m4.6575[0m       0.7209        [35m4.3176[0m  1.6305
    115        4.6641       0.7205        [35m4.3168[0m  1.6281
    116        4.6676       0.7219        [35m4.3160[0m  1.6121
    117        4.6675       [32m0.7225[0m        [35m4.3157[0m  1.6401
    118        [36m4.6572[0m       0.7225        

     10        [36m4.7947[0m       [32m0.4666[0m        [35m4.6304[0m  2.1802
     11        [36m4.7689[0m       [32m0.5089[0m        [35m4.5895[0m  2.1720
     12        [36m4.7459[0m       [32m0.5477[0m        [35m4.5494[0m  2.1667
     13        [36m4.7224[0m       [32m0.5636[0m        [35m4.5180[0m  2.1509
     14        [36m4.7036[0m       [32m0.5851[0m        [35m4.4930[0m  2.1955
     15        [36m4.6820[0m       [32m0.5911[0m        [35m4.4761[0m  2.2031
     16        [36m4.6670[0m       [32m0.6020[0m        [35m4.4577[0m  2.1542
     17        [36m4.6522[0m       [32m0.6162[0m        [35m4.4432[0m  2.1363
     18        [36m4.6326[0m       [32m0.6225[0m        [35m4.4326[0m  2.1456
     19        [36m4.6232[0m       [32m0.6301[0m        [35m4.4216[0m  2.1372
     20        [36m4.6095[0m       [32m0.6447[0m        [35m4.4095[0m  2.1892
     21        [36m4.6032[0m       [32m0.6510[0m        [35m4.4026[0m 

    118        4.3206       0.8126        4.2198  2.3738
    119        [36m4.3150[0m       0.8132        [35m4.2187[0m  2.4346
    120        [36m4.3137[0m       0.8132        4.2190  2.4416
    121        [36m4.3123[0m       [32m0.8142[0m        [35m4.2181[0m  2.4259
    122        4.3136       0.8126        4.2186  2.4349
    123        4.3180       0.8136        [35m4.2176[0m  2.4401
    124        4.3161       [32m0.8146[0m        [35m4.2174[0m  2.4586
    125        [36m4.3119[0m       0.8142        4.2178  2.4255
    126        [36m4.3087[0m       0.8129        4.2183  2.4132
    127        4.3109       0.8132        4.2182  2.4471
    128        4.3115       [32m0.8152[0m        [35m4.2170[0m  2.3804
    129        4.3088       0.8146        [35m4.2162[0m  2.4103
    130        4.3095       0.8152        [35m4.2162[0m  2.3767
    131        [36m4.3065[0m       0.8139        4.2170  2.4148
    132        [36m4.3048[0m       [32m0.8162[0m      

     66        [36m4.1296[0m       0.8874        4.1427  4.4876
     67        [36m4.1291[0m       0.8897        [35m4.1420[0m  4.4988
     68        4.1309       [32m0.8907[0m        [35m4.1407[0m  4.4705
     69        [36m4.1260[0m       [32m0.8914[0m        [35m4.1406[0m  4.5457
     70        [36m4.1257[0m       [32m0.8921[0m        [35m4.1404[0m  4.5151
     71        [36m4.1220[0m       0.8914        [35m4.1399[0m  4.5490
     72        4.1228       0.8897        4.1410  4.5413
     73        4.1224       0.8921        4.1406  4.5797
     74        [36m4.1203[0m       0.8921        [35m4.1393[0m  4.5253
     75        [36m4.1178[0m       [32m0.8924[0m        [35m4.1390[0m  4.4683
     76        4.1205       0.8914        4.1402  4.5429
     77        4.1192       [32m0.8930[0m        [35m4.1389[0m  4.5854
     78        4.1185       0.8911        4.1397  4.5606
     79        [36m4.1174[0m       0.8927        [35m4.1385[0m  4.5744
    

     49        [36m4.0585[0m       0.9252        4.1062  7.9090
     50        [36m4.0574[0m       0.9262        [35m4.1043[0m  7.7395
     51        [36m4.0565[0m       [32m0.9275[0m        [35m4.1028[0m  7.8799
     52        4.0567       [32m0.9288[0m        4.1033  7.9706
     53        [36m4.0553[0m       0.9275        4.1035  7.9312
     54        4.0570       0.9281        4.1033  8.0179
     55        [36m4.0540[0m       0.9288        [35m4.1016[0m  7.9543
     56        4.0542       0.9275        4.1037  7.9625
     57        4.0544       0.9272        4.1043  8.0101
     58        [36m4.0521[0m       0.9281        4.1027  7.9799
     59        [36m4.0519[0m       [32m0.9315[0m        [35m4.0996[0m  7.9870
     60        [36m4.0517[0m       0.9311        [35m4.0996[0m  8.0527
     61        [36m4.0503[0m       [32m0.9318[0m        [35m4.0978[0m  8.0677
     62        [36m4.0497[0m       0.9301        4.0993  8.0485
     63        [36m4.

With no hidden layers?

In [59]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
dropout = 0.75
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            dropout=dropout
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.output = nn.Linear(input_dim, output_dim)

    def forward(self, X, **kwargs):
        X = F.relu(X)
        X = self.dropout(X) # hidden layer removed
        X = F.softmax(self.output(X), dim=-1)
        return X

net = NeuralNetClassifier(
module=CLINCModule,
lr=lr,
criterion=torch.nn.CrossEntropyLoss,
max_epochs=1000,
optimizer=torch.optim.Adam,
callbacks=[EarlyStopping(patience=10)],
)

net.fit(train_x, train_y)
tlabels = net.predict(train_x)
tacc[hidden_dim] = accuracy_score(tlabels, train_y)
print('training accuracy')
print(tacc)
time0 = time.time()
labels = net.predict(val_x)
vacc[hidden_dim] = accuracy_score(labels, val_y)
time1 = time.time()
vtime[hidden_dim] = time1-time0
print('Val accuracy')
print(vacc)
print('pred time')
print(vtime)
time2 = time.time()
olabels = net.predict(val_oos_x)
oacc[hidden_dim] = accuracy_score(olabels, val_oos_y)
time3 = time.time()
otime[hidden_dim]=time3-time2
print('OOS Val Accuracy')
print(oacc)
print('OOS pred time')
print(otime)
    

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0170[0m       [32m0.6474[0m        [35m5.0167[0m  1.9125
      2        [36m5.0163[0m       [32m0.7811[0m        [35m5.0160[0m  2.0325
      3        [36m5.0153[0m       [32m0.8086[0m        [35m5.0151[0m  2.0285
      4        [36m5.0140[0m       [32m0.8209[0m        [35m5.0140[0m  1.9413
      5        [36m5.0122[0m       0.8132        [35m5.0127[0m  1.9885
      6        [36m5.0097[0m       0.8017        [35m5.0111[0m  1.9310
      7        [36m5.0061[0m       0.7884        [35m5.0092[0m  2.0014
      8        [36m5.0019[0m       0.7709        [35m5.0070[0m  1.9692
      9        [36m4.9966[0m       0.7623        [35m5.0045[0m  1.9227
     10        [36m4.9915[0m       0.7487        [35m5.0018[0m  1.9665
     11        [36m4.9847[0m       0.7377        [35m4.9988[0m  1.9575
     12        [36m4.976

    105        4.6747       0.9119        [35m4.5822[0m  1.9365
    106        [36m4.6713[0m       0.9123        [35m4.5792[0m  1.9616
    107        4.6737       0.9119        [35m4.5762[0m  1.9758
    108        [36m4.6696[0m       [32m0.9129[0m        [35m4.5732[0m  1.9211
    109        [36m4.6639[0m       [32m0.9132[0m        [35m4.5702[0m  2.0008
    110        4.6680       0.9119        [35m4.5672[0m  1.9327
    111        4.6708       [32m0.9139[0m        [35m4.5643[0m  2.0023
    112        4.6641       [32m0.9152[0m        [35m4.5614[0m  1.9845
    113        4.6700       [32m0.9159[0m        [35m4.5586[0m  1.9386
    114        4.6723       0.9159        [35m4.5559[0m  2.0182
    115        4.6693       [32m0.9166[0m        [35m4.5530[0m  1.8936
    116        [36m4.6619[0m       [32m0.9172[0m        [35m4.5503[0m  1.9841
    117        4.6667       [32m0.9175[0m        [35m4.5476[0m  1.9503
    118        4.6642       [32m0

    223        4.6185       0.9232        [35m4.3681[0m  1.9079
    224        4.6150       0.9232        [35m4.3671[0m  1.9789
    225        [36m4.6104[0m       0.9228        [35m4.3660[0m  1.8879
    226        4.6162       0.9225        [35m4.3651[0m  1.9451
    227        4.6165       0.9228        [35m4.3641[0m  1.9179
    228        4.6155       0.9225        [35m4.3631[0m  1.8972
    229        4.6110       0.9219        [35m4.3621[0m  1.9866
    230        [36m4.6103[0m       0.9232        [35m4.3610[0m  1.8940
    231        [36m4.6074[0m       0.9228        [35m4.3600[0m  1.9889
    232        4.6135       0.9225        [35m4.3591[0m  1.9388
    233        4.6092       0.9232        [35m4.3581[0m  1.9166
    234        4.6081       0.9222        [35m4.3573[0m  1.9517
    235        4.6082       0.9222        [35m4.3564[0m  1.9081
    236        4.6099       0.9209        [35m4.3555[0m  1.9884
    237        4.6119       0.9215        [35m4.

    345        4.5854       0.9238        [35m4.2866[0m  1.9363
    346        4.5838       0.9238        [35m4.2862[0m  1.9697
    347        4.5884       0.9235        [35m4.2856[0m  1.9723
    348        4.5909       0.9238        [35m4.2852[0m  1.9383
    349        4.5937       0.9235        [35m4.2847[0m  1.9347
    350        4.5853       0.9238        [35m4.2844[0m  1.9175
    351        4.5903       0.9235        [35m4.2840[0m  1.9637
    352        4.5903       0.9238        [35m4.2836[0m  1.9461
    353        4.5904       0.9235        [35m4.2832[0m  1.9950
    354        [36m4.5827[0m       0.9238        [35m4.2828[0m  2.0293
    355        [36m4.5806[0m       0.9248        [35m4.2824[0m  2.0513
    356        4.5840       0.9225        [35m4.2820[0m  2.0689
    357        4.5809       0.9235        [35m4.2815[0m  1.9933
    358        4.5831       0.9235        [35m4.2812[0m  1.9473
    359        4.5836       0.9235        [35m4.2808[0m 

    469        4.5709       0.9209        [35m4.2469[0m  1.9316
    470        4.5750       0.9215        [35m4.2466[0m  1.9224
    471        4.5736       0.9222        [35m4.2462[0m  1.9296
    472        4.5755       0.9222        [35m4.2460[0m  1.8886
    473        4.5749       0.9219        [35m4.2456[0m  1.9248
    474        4.5776       0.9209        [35m4.2453[0m  1.9415
    475        4.5702       0.9209        [35m4.2451[0m  2.0847
    476        4.5767       0.9209        [35m4.2448[0m  2.0191
    477        4.5789       0.9212        [35m4.2445[0m  1.9355
    478        4.5743       0.9205        [35m4.2442[0m  1.9756
    479        4.5747       0.9195        [35m4.2441[0m  1.9774
    480        4.5710       0.9199        [35m4.2439[0m  2.0626
    481        4.5778       0.9202        [35m4.2438[0m  1.9492
    482        4.5738       0.9195        [35m4.2436[0m  2.0930
    483        4.5783       0.9205        [35m4.2434[0m  1.9277
    484   

    593        4.5689       0.9189        [35m4.2234[0m  1.9511
    594        4.5673       0.9189        [35m4.2233[0m  1.9108
    595        4.5769       0.9195        [35m4.2231[0m  2.0045
    596        4.5762       0.9189        [35m4.2230[0m  1.9334
    597        4.5674       0.9185        [35m4.2228[0m  2.0055
    598        4.5733       0.9185        [35m4.2225[0m  1.9529
    599        [36m4.5585[0m       0.9179        [35m4.2223[0m  2.0073
    600        4.5653       0.9166        [35m4.2221[0m  1.9913
    601        4.5687       0.9169        [35m4.2220[0m  2.0223
    602        4.5665       0.9175        [35m4.2218[0m  1.9056
    603        4.5732       0.9172        [35m4.2216[0m  1.9439
    604        4.5699       0.9175        [35m4.2216[0m  1.9438
    605        4.5672       0.9166        [35m4.2215[0m  1.9547
    606        4.5715       0.9172        [35m4.2213[0m  1.9337
    607        4.5672       0.9179        [35m4.2212[0m  1.9276
 

    717        4.5630       0.9202        [35m4.2073[0m  1.9157
    718        4.5662       0.9199        [35m4.2072[0m  1.9357
    719        4.5771       0.9205        [35m4.2072[0m  1.9320
    720        4.5615       0.9205        [35m4.2071[0m  1.9665
    721        4.5601       0.9205        [35m4.2070[0m  1.9173
    722        4.5703       0.9195        [35m4.2069[0m  1.9356
    723        4.5638       0.9195        [35m4.2068[0m  1.9260
    724        4.5655       0.9195        [35m4.2068[0m  1.9174
    725        4.5653       0.9199        [35m4.2067[0m  1.9570
    726        4.5655       0.9195        [35m4.2066[0m  1.9219
    727        4.5654       0.9192        [35m4.2065[0m  1.9460
    728        4.5654       0.9192        [35m4.2065[0m  1.9477
    729        4.5600       0.9195        [35m4.2064[0m  1.9246
    730        4.5613       0.9192        [35m4.2063[0m  1.9345
    731        4.5616       0.9189        [35m4.2061[0m  1.9013
    732   

    842        4.5644       0.9185        [35m4.1967[0m  1.9283
    843        4.5698       0.9192        [35m4.1965[0m  1.9424
    844        4.5634       0.9185        [35m4.1964[0m  1.9413
    845        4.5605       0.9182        [35m4.1963[0m  1.9491
    846        4.5558       0.9179        [35m4.1962[0m  1.9696
    847        4.5581       0.9182        [35m4.1961[0m  1.9489
    848        4.5578       0.9175        4.1961  1.9454
    849        4.5670       0.9172        [35m4.1961[0m  1.9902
    850        4.5646       0.9182        [35m4.1961[0m  1.9548
    851        4.5562       0.9185        [35m4.1959[0m  1.9737
    852        4.5599       0.9192        4.1959  1.9577
    853        4.5538       0.9199        [35m4.1958[0m  1.9402
    854        4.5672       0.9195        [35m4.1958[0m  1.9305
    855        4.5567       0.9189        [35m4.1957[0m  1.9533
    856        4.5625       0.9195        [35m4.1956[0m  1.9467
    857        4.5611       

Adjusting for patience:

In [61]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800 #hidden layer size
dropout = 0.75
patience_list = [5, 10, 20, 50]
for patience in patience_list:    #looping for patience
    print(hidden_dim)
    class CLINCModule(nn.Module):
        def __init__(
                self,
                input_dim=vocab_dim,
                hidden_dim=hidden_dim, #setting hidden layer size
                output_dim=output_dim,
                dropout=dropout
        ):
            super(CLINCModule, self).__init__()
            self.dropout = nn.Dropout(dropout)

            self.hidden = nn.Linear(input_dim, hidden_dim)
            self.output = nn.Linear(hidden_dim, output_dim)

        def forward(self, X, **kwargs):
            X = F.relu(self.hidden(X))
            X = self.dropout(X)
            X = F.softmax(self.output(X), dim=-1)
            return X
   
    net = NeuralNetClassifier(
    module=CLINCModule,
    lr=lr,
    criterion=torch.nn.CrossEntropyLoss,
    max_epochs=1000,
    optimizer=torch.optim.Adam, 
    callbacks=[EarlyStopping(patience=patience)], #setting patience 
    )
    
    net.fit(train_x, train_y)
    tlabels = net.predict(train_x)
    tacc[patience] = accuracy_score(tlabels, train_y)
    print('training accuracy')
    print(tacc)
    time0 = time.time()
    labels = net.predict(val_x)
    vacc[patience] = accuracy_score(labels, val_y)
    time1 = time.time()
    vtime[patience] = time1-time0
    print('Val accuracy')
    print(vacc)
    print('pred time')
    print(vtime)
    time2 = time.time()
    olabels = net.predict(val_oos_x)
    oacc[patience] = accuracy_score(olabels, val_oos_y)
    time3 = time.time()
    otime[patience]=time3-time2
    print('OOS Val Accuracy')
    print(oacc)
    print('OOS pred time')
    print(otime)
    

800
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0158[0m       [32m0.5245[0m        [35m5.0103[0m  6.6359
      2        [36m4.9139[0m       0.4758        [35m4.7317[0m  6.6599
      3        [36m4.5996[0m       [32m0.6685[0m        [35m4.4816[0m  6.7153
      4        [36m4.4274[0m       [32m0.7589[0m        [35m4.3687[0m  6.4396
      5        [36m4.3246[0m       [32m0.8063[0m        [35m4.2951[0m  6.4540
      6        [36m4.2656[0m       [32m0.8384[0m        [35m4.2550[0m  6.3557
      7        [36m4.2222[0m       [32m0.8609[0m        [35m4.2262[0m  6.3473
      8        [36m4.1939[0m       [32m0.8685[0m        [35m4.2085[0m  6.4061
      9        [36m4.1745[0m       [32m0.8874[0m        [35m4.1913[0m  6.2543
     10        [36m4.1525[0m       [32m0.9043[0m        [35m4.1731[0m  6.3500
     11        [36m4.1296[0m       [32m0.9156[0

     45        [36m4.0349[0m       0.9434        4.0971  6.3108
     46        [36m4.0348[0m       0.9447        [35m4.0961[0m  6.2760
     47        [36m4.0345[0m       0.9460        [35m4.0955[0m  6.3619
     48        [36m4.0343[0m       0.9454        [35m4.0954[0m  6.3100
     49        4.0346       0.9460        4.0955  6.3138
     50        [36m4.0338[0m       0.9457        [35m4.0954[0m  6.3144
     51        4.0341       0.9447        4.0955  6.4639
     52        [36m4.0335[0m       0.9430        [35m4.0953[0m  6.4119
     53        [36m4.0335[0m       0.9464        [35m4.0941[0m  6.3548
     54        4.0335       0.9454        [35m4.0940[0m  6.3701
     55        4.0336       0.9460        [35m4.0937[0m  6.3463
     56        [36m4.0331[0m       0.9464        [35m4.0937[0m  6.3704
     57        4.0333       0.9450        4.0941  6.3888
     58        [36m4.0330[0m       0.9447        4.0937  6.3430
     59        [36m4.0328[0m       0.9

     80        4.0317       0.9444        4.0926  6.3438
     81        4.0316       0.9450        4.0922  6.3974
     82        [36m4.0315[0m       0.9427        4.0926  6.3608
     83        [36m4.0314[0m       0.9444        4.0922  6.4344
     84        4.0315       0.9440        4.0921  6.4697
     85        4.0315       0.9440        4.0924  6.4249
     86        4.0314       0.9440        4.0920  6.4089
     87        [36m4.0313[0m       0.9467        4.0917  6.3463
     88        [36m4.0310[0m       0.9460        [35m4.0913[0m  6.4175
     89        4.0313       0.9444        4.0920  6.3947
     90        4.0311       0.9437        4.0922  6.4943
     91        4.0312       0.9444        [35m4.0913[0m  6.4630
Stopping since valid_loss has not improved in the last 20 epochs.
training accuracy
{5: 0.9870860927152317, 10: 0.9866887417218543, 20: 0.9874834437086093}
Val accuracy
{5: 0.9066666666666666, 10: 0.909, 20: 0.9066666666666666}
pred time
{5: 0.3598189353942871, 

     98        4.0313       0.9424        4.0942  6.4377
     99        4.0315       0.9454        4.0926  6.4622
    100        4.0313       0.9417        4.0944  6.4508
    101        4.0313       0.9407        4.0935  6.4371
    102        [36m4.0312[0m       0.9434        4.0934  6.3857
    103        [36m4.0312[0m       0.9447        4.0931  6.4095
    104        4.0312       0.9404        4.0944  6.4063
    105        [36m4.0311[0m       0.9417        4.0934  6.6744
    106        4.0311       0.9414        4.0933  6.4796
    107        [36m4.0311[0m       0.9421        4.0931  6.4586
    108        4.0312       0.9427        4.0921  6.4424
    109        4.0311       0.9421        4.0923  6.4655
    110        [36m4.0309[0m       0.9434        4.0928  6.4930
    111        4.0311       0.9401        4.0935  6.4812
    112        4.0311       0.9427        4.0924  6.4479
    113        4.0310       0.9430        4.0920  6.4530
    114        4.0310       0.9424        4

Best value for patience of 10 Epochs

No early stopping:

In [63]:
lr = 0.001
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800 
dropout = 0.75

print(hidden_dim)
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim,
            hidden_dim=hidden_dim, #setting hidden layer size
            output_dim=output_dim,
            dropout=dropout
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, X, **kwargs):
        X = F.relu(self.hidden(X))
        X = self.dropout(X)
        X = F.softmax(self.output(X), dim=-1)
        return X

net = NeuralNetClassifier(
module=CLINCModule,
lr=lr,
criterion=torch.nn.CrossEntropyLoss,
max_epochs=1000,
optimizer=torch.optim.Adam, #Early Stopping removed
)

net.fit(train_x, train_y)
tlabels = net.predict(train_x)
tacc[patience] = accuracy_score(tlabels, train_y)
print('training accuracy')
print(tacc)
time0 = time.time()
labels = net.predict(val_x)
vacc[patience] = accuracy_score(labels, val_y)
time1 = time.time()
vtime[patience] = time1-time0
print('Val accuracy')
print(vacc)
print('pred time')
print(vtime)
time2 = time.time()
olabels = net.predict(val_oos_x)
oacc[patience] = accuracy_score(olabels, val_oos_y)
time3 = time.time()
otime[patience]=time3-time2
print('OOS Val Accuracy')
print(oacc)
print('OOS pred time')
print(otime)


800
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0158[0m       [32m0.5123[0m        [35m5.0102[0m  6.3291
      2        [36m4.9124[0m       0.4752        [35m4.7288[0m  6.2838
      3        [36m4.5918[0m       [32m0.6871[0m        [35m4.4679[0m  6.2390
      4        [36m4.4122[0m       [32m0.7709[0m        [35m4.3588[0m  6.2377
      5        [36m4.3182[0m       [32m0.8123[0m        [35m4.2924[0m  6.2310
      6        [36m4.2627[0m       [32m0.8258[0m        [35m4.2583[0m  6.2400
      7        [36m4.2282[0m       [32m0.8474[0m        [35m4.2322[0m  6.2204
      8        [36m4.2028[0m       [32m0.8579[0m        [35m4.2155[0m  6.1701
      9        [36m4.1816[0m       [32m0.8705[0m        [35m4.2000[0m  6.1621
     10        [36m4.1614[0m       [32m0.8950[0m        [35m4.1813[0m  6.1194
     11        [36m4.1372[0m       [32m0.9076[0

    120        4.0307       0.9427        4.0917  6.3932
    121        4.0307       0.9424        4.0915  6.4292
    122        [36m4.0305[0m       0.9417        4.0921  6.4065
    123        [36m4.0305[0m       0.9430        4.0922  6.4458
    124        4.0306       0.9437        4.0916  6.3946
    125        4.0305       0.9424        4.0917  6.4726
    126        4.0305       0.9404        4.0925  6.3902
    127        [36m4.0305[0m       0.9424        4.0917  6.4056
    128        4.0306       0.9411        4.0923  6.3937
    129        4.0306       0.9434        4.0919  6.5274
    130        4.0305       0.9430        4.0917  6.4062
    131        [36m4.0305[0m       0.9401        4.0918  6.3949
    132        [36m4.0304[0m       0.9424        4.0911  6.4151
    133        [36m4.0304[0m       0.9421        4.0914  6.4482
    134        4.0304       0.9437        4.0910  6.4010
    135        4.0305       0.9427        4.0915  6.4399
    136        4.0306       0.9411

    260        4.0301       0.9384        4.0924  6.4510
    261        4.0302       0.9387        4.0916  6.4632
    262        4.0301       0.9397        4.0914  6.4550
    263        4.0301       0.9407        4.0916  6.5228
    264        4.0302       0.9397        4.0923  6.4714
    265        4.0301       0.9404        4.0922  6.4615
    266        [36m4.0301[0m       0.9391        4.0930  6.4507
    267        [36m4.0300[0m       0.9404        4.0922  6.4881
    268        4.0301       0.9394        4.0923  6.4511
    269        4.0301       0.9394        4.0925  6.4433
    270        4.0301       0.9387        4.0933  6.4383
    271        4.0301       0.9377        4.0940  6.5294
    272        [36m4.0300[0m       0.9377        4.0937  6.5002
    273        [36m4.0300[0m       0.9384        4.0930  6.4760
    274        4.0300       0.9361        4.0937  6.5042
    275        4.0301       0.9368        4.0935  6.4472
    276        4.0300       0.9374        4.0935  6.

    401        4.0298       0.9384        4.0917  6.8595
    402        4.0299       0.9387        4.0922  6.6630
    403        4.0298       0.9391        4.0918  6.7649
    404        4.0298       0.9381        4.0918  6.6705
    405        4.0298       0.9387        4.0921  6.6471
    406        4.0299       0.9391        4.0917  6.8459
    407        4.0298       0.9384        4.0920  6.6308
    408        4.0299       0.9381        4.0923  6.7126
    409        4.0298       0.9391        4.0919  6.7349
    410        4.0298       0.9377        4.0920  6.6377
    411        4.0298       0.9374        4.0922  6.7885
    412        4.0299       0.9377        4.0923  6.6540
    413        4.0299       0.9381        4.0922  6.6350
    414        4.0298       0.9387        4.0915  6.8223
    415        4.0298       0.9381        4.0915  6.6780
    416        4.0298       0.9384        4.0913  6.7458
    417        4.0298       0.9391        4.0909  6.6639
    418        4.0298       0.9

    543        4.0298       0.9391        4.0922  6.6652
    544        4.0298       0.9387        4.0920  6.6838
    545        4.0298       0.9397        4.0916  6.6728
    546        4.0298       0.9391        4.0916  6.5831
    547        4.0298       0.9397        4.0915  6.9081
    548        4.0298       0.9394        4.0919  6.6190
    549        4.0298       0.9387        4.0922  6.7840
    550        4.0298       0.9397        4.0916  10.2805
    551        4.0298       0.9397        4.0917  286.7433
    552        4.0298       0.9401        4.0908  7.8314
    553        4.0298       0.9391        4.0913  7.2209
    554        4.0298       0.9401        4.0912  7.0530
    555        4.0298       0.9391        4.0912  7.1388
    556        4.0298       0.9387        4.0914  6.8863
    557        4.0298       0.9387        4.0917  6.7048
    558        4.0298       0.9381        4.0920  6.7783
    559        4.0298       0.9404        4.0919  6.6144
    560        4.0298       

    686        4.0298       0.9368        4.0922  12.8274
    687        4.0298       0.9371        4.0924  22.0418
    688        4.0298       0.9361        4.0926  293.2432
    689        4.0298       0.9361        4.0924  11.0351
    690        4.0298       0.9368        4.0926  22.1673
    691        4.0298       0.9371        4.0925  296.3504
    692        4.0298       0.9377        4.0926  7.8375
    693        4.0298       0.9371        4.0922  22.1364
    694        4.0298       0.9374        4.0919  298.4188
    695        4.0298       0.9374        4.0918  6.4436
    696        4.0298       0.9377        4.0918  21.6877
    697        4.0298       0.9368        4.0921  302.1354
    698        4.0298       0.9377        4.0915  6.4095
    699        4.0298       0.9384        4.0918  18.2301
    700        4.0298       0.9377        4.0916  22.3095
    701        4.0298       0.9374        4.0915  6.4356
    702        4.0298       0.9377        4.0914  14.3832
    703       

    827        4.0298       0.9397        4.0907  15.3676
    828        4.0298       0.9397        4.0907  22.9042
    829        4.0298       0.9397        4.0906  290.2831
    830        4.0298       0.9401        4.0908  15.3119
    831        4.0298       0.9397        4.0905  22.9366
    832        4.0298       0.9397        4.0905  291.9086
    833        4.0298       0.9394        4.0904  13.4076
    834        4.0298       0.9394        4.0903  22.9931
    835        4.0298       0.9404        4.0899  293.0152
    836        4.0298       0.9401        4.0904  12.2199
    837        4.0298       0.9391        4.0905  22.9903
    838        4.0298       0.9394        4.0906  293.9956
    839        4.0298       0.9381        4.0906  11.1796
    840        4.0298       0.9387        4.0910  22.8685
    841        4.0298       0.9381        4.0911  296.1385
    842        4.0298       0.9397        4.0911  9.1696
    843        4.0298       0.9394        4.0909  22.9860
    844   

    969        4.0298       0.9394        4.0906  7.9113
    970        4.0298       0.9387        4.0910  7.2865
    971        4.0298       0.9384        4.0908  7.0860
    972        4.0298       0.9377        4.0908  6.9257
    973        4.0298       0.9391        4.0906  6.8468
    974        4.0298       0.9404        4.0902  6.8487
    975        4.0298       0.9404        4.0903  6.9166
    976        4.0298       0.9407        4.0902  6.8827
    977        4.0298       0.9417        4.0900  6.9436
    978        4.0298       0.9401        4.0901  6.9277
    979        4.0298       0.9401        4.0900  6.8692
    980        4.0298       0.9397        4.0905  7.0795
    981        4.0298       0.9397        4.0906  6.9153
    982        4.0298       0.9391        4.0907  7.0382
    983        4.0298       0.9391        4.0908  7.0542
    984        4.0298       0.9401        4.0907  6.9938
    985        4.0298       0.9394        4.0907  7.0106
    986        4.0298       0.9

This is worse. Retraining model using optimal values

In [64]:
lr = 0.001 # best learning rate
tacc={}
vacc = {}
vtime = {}
oacc = {}
otime = {}
hidden_dim = 800 #hidden layer size
dropout = 0.75 #best dropout rate

print(hidden_dim)
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim,
            hidden_dim=hidden_dim, 
            output_dim=output_dim,
            dropout=dropout
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.hidden = nn.Linear(input_dim, hidden_dim) #one hidden layer only
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, X, **kwargs):
        X = F.relu(self.hidden(X)) #best activation function for hidden layer
        X = self.dropout(X)
        X = F.softmax(self.output(X), dim=-1)
        return X

net = NeuralNetClassifier(
module=CLINCModule,
lr=lr,
criterion=torch.nn.CrossEntropyLoss,
max_epochs=1000,
optimizer=torch.optim.Adam, #best optimizer.
callbacks=[EarlyStopping(patience=10)], #best patience
)

net.fit(train_x, train_y)
tlabels = net.predict(train_x)
tacc[patience] = accuracy_score(tlabels, train_y)
print('training accuracy')
print(tacc)
time0 = time.time()
labels = net.predict(val_x)
vacc[patience] = accuracy_score(labels, val_y)
time1 = time.time()
vtime[patience] = time1-time0
print('Val accuracy')
print(vacc)
print('pred time')
print(vtime)
time2 = time.time()
olabels = net.predict(val_oos_x)
oacc[patience] = accuracy_score(olabels, val_oos_y)
time3 = time.time()
otime[patience]=time3-time2
print('OOS Val Accuracy')
print(oacc)
print('OOS pred time')
print(otime)



800
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0158[0m       [32m0.5401[0m        [35m5.0104[0m  6.2687
      2        [36m4.9119[0m       0.4685        [35m4.7284[0m  6.2613
      3        [36m4.5989[0m       [32m0.6712[0m        [35m4.4784[0m  6.2625
      4        [36m4.4203[0m       [32m0.7672[0m        [35m4.3602[0m  6.2490
      5        [36m4.3205[0m       [32m0.8142[0m        [35m4.2892[0m  6.2845
      6        [36m4.2618[0m       [32m0.8348[0m        [35m4.2541[0m  6.2805
      7        [36m4.2274[0m       [32m0.8454[0m        [35m4.2330[0m  6.3284
      8        [36m4.2029[0m       [32m0.8573[0m        [35m4.2160[0m  6.2509
      9        [36m4.1778[0m       [32m0.8801[0m        [35m4.1933[0m  6.2657
     10        [36m4.1532[0m       [32m0.9020[0m        [35m4.1732[0m  6.1212
     11        [36m4.1310[0m       [32m0.9106[0

In [66]:
#exporting best model
import pickle
with open('okmlp.pkl', 'wb') as f:
    pickle.dump(net, f)

Timing for best model:

In [67]:
lr = 0.001
hidden_dim = 800 #hidden layer size
dropout = 0.75

print(hidden_dim)
class CLINCModule(nn.Module):
    def __init__(
            self,
            input_dim=vocab_dim,
            hidden_dim=hidden_dim, #setting hidden layer size
            output_dim=output_dim,
            dropout=dropout
    ):
        super(CLINCModule, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, X, **kwargs):
        X = F.relu(self.hidden(X))
        X = self.dropout(X)
        X = F.softmax(self.output(X), dim=-1)
        return X

net = NeuralNetClassifier(
module=CLINCModule,
lr=lr,
criterion=torch.nn.CrossEntropyLoss,
max_epochs=1000,
optimizer=torch.optim.Adam, #Adam again - momentum has been removed accordingly.
callbacks=[EarlyStopping(patience=10)],
)
timet0 = time.time()
for i in range(10): #taking average over 10 full optimizations
    net.fit(train_x, train_y)
timet1 = time.time()
t_time = (timet1-timet0)/10
print('training time(average of 10 iterations)')
print(t_time)

time0 = time.time()
for i in range(100): #taking average over 100 predictions
    labels = net.predict(val_x)
time1 = time.time()
vtime = (time1-time0)/100
print('validation prediction time (average of 100 iterations)')
print(vtime)


time2 = time.time()
for i in range(100): #taking average over 100 predictions
    olabels = net.predict(val_oos_x)
time3 = time.time()
otime[patience]=(time3-time2)/100
print('OOS prediction time (average of 100 iterations)')
print(otime)

800
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0158[0m       [32m0.5652[0m        [35m5.0103[0m  7.3780
      2        [36m4.9108[0m       0.4795        [35m4.7254[0m  6.3650
      3        [36m4.5992[0m       [32m0.6844[0m        [35m4.4755[0m  6.2726
      4        [36m4.4187[0m       [32m0.7679[0m        [35m4.3620[0m  6.3744
      5        [36m4.3210[0m       [32m0.8043[0m        [35m4.2964[0m  6.6267
      6        [36m4.2667[0m       [32m0.8325[0m        [35m4.2560[0m  6.4749
      7        [36m4.2296[0m       [32m0.8497[0m        [35m4.2316[0m  6.4373
      8        [36m4.1983[0m       [32m0.8626[0m        [35m4.2106[0m  6.5754
      9        [36m4.1758[0m       [32m0.8838[0m        [35m4.1920[0m  6.5171
     10        [36m4.1522[0m       [32m0.8983[0m        [35m4.1736[0m  6.4708
     11        [36m4.1334[0m       [32m0.9063[0

     14        [36m4.0999[0m       [32m0.9156[0m        [35m4.1411[0m  6.2991
     15        [36m4.0943[0m       [32m0.9166[0m        [35m4.1380[0m  6.3786
     16        [36m4.0865[0m       [32m0.9255[0m        [35m4.1306[0m  6.6013
     17        [36m4.0777[0m       [32m0.9291[0m        [35m4.1266[0m  6.6279
     18        [36m4.0717[0m       [32m0.9321[0m        [35m4.1234[0m  6.4790
     19        [36m4.0659[0m       [32m0.9348[0m        [35m4.1206[0m  6.4075
     20        [36m4.0618[0m       [32m0.9354[0m        [35m4.1172[0m  6.3652
     21        [36m4.0571[0m       [32m0.9394[0m        [35m4.1144[0m  6.4991
     22        [36m4.0532[0m       [32m0.9397[0m        [35m4.1117[0m  6.5207
     23        [36m4.0513[0m       [32m0.9427[0m        [35m4.1093[0m  6.3909
     24        [36m4.0488[0m       0.9417        [35m4.1077[0m  6.3628
     25        [36m4.0477[0m       0.9427        [35m4.1068[0m  6.3544
     26   

     25        [36m4.0466[0m       0.9407        [35m4.1066[0m  6.6347
     26        [36m4.0455[0m       0.9417        [35m4.1054[0m  6.7291
     27        [36m4.0439[0m       0.9417        [35m4.1048[0m  6.6612
     28        [36m4.0428[0m       [32m0.9430[0m        [35m4.1045[0m  6.5828
     29        [36m4.0421[0m       [32m0.9447[0m        [35m4.1029[0m  6.5609
     30        [36m4.0412[0m       0.9440        [35m4.1025[0m  6.4501
     31        [36m4.0402[0m       0.9427        [35m4.1017[0m  6.7104
     32        [36m4.0399[0m       0.9421        4.1022  6.7797
     33        [36m4.0387[0m       0.9447        [35m4.1008[0m  6.9062
     34        [36m4.0386[0m       0.9447        [35m4.1006[0m  6.5777
     35        [36m4.0383[0m       [32m0.9467[0m        [35m4.0993[0m  6.5613
     36        [36m4.0375[0m       0.9447        4.0995  6.5703
     37        [36m4.0370[0m       0.9444        4.0996  6.5061
     38        4.0372    

     59        4.0328       0.9457        4.0943  6.4243
     60        4.0328       0.9454        [35m4.0938[0m  6.6775
     61        [36m4.0326[0m       0.9460        4.0941  6.5448
     62        [36m4.0325[0m       0.9470        [35m4.0938[0m  6.3740
     63        4.0327       0.9460        [35m4.0935[0m  6.6394
     64        [36m4.0322[0m       0.9467        [35m4.0923[0m  6.6024
     65        4.0323       0.9450        4.0930  6.6287
     66        4.0323       0.9464        4.0927  6.6050
     67        4.0324       0.9450        4.0928  6.6565
     68        [36m4.0320[0m       0.9457        4.0926  6.6289
     69        4.0321       0.9467        4.0927  6.5386
     70        4.0320       0.9450        4.0929  6.6314
     71        [36m4.0319[0m       0.9480        [35m4.0923[0m  6.6163
     72        4.0323       0.9467        4.0929  6.5793
     73        [36m4.0319[0m       0.9444        4.0930  6.5792
Stopping since valid_loss has not improved in 

      2        [36m4.9137[0m       0.4828        [35m4.7283[0m  6.6732
      3        [36m4.6009[0m       [32m0.6864[0m        [35m4.4767[0m  6.6676
      4        [36m4.4176[0m       [32m0.7685[0m        [35m4.3615[0m  6.3482
      5        [36m4.3219[0m       [32m0.8172[0m        [35m4.2922[0m  6.3929
      6        [36m4.2599[0m       [32m0.8407[0m        [35m4.2485[0m  6.3426
      7        [36m4.2198[0m       [32m0.8546[0m        [35m4.2256[0m  6.2989
      8        [36m4.1964[0m       [32m0.8652[0m        [35m4.2086[0m  6.3067
      9        [36m4.1725[0m       [32m0.8864[0m        [35m4.1915[0m  6.2652
     10        [36m4.1518[0m       [32m0.8983[0m        [35m4.1724[0m  6.3402
     11        [36m4.1307[0m       [32m0.9066[0m        [35m4.1590[0m  6.2250
     12        [36m4.1188[0m       [32m0.9116[0m        [35m4.1507[0m  6.3543
     13        [36m4.1072[0m       [32m0.9136[0m        [35m4.1453[0m  6.3472
 

     22        [36m4.0518[0m       [32m0.9411[0m        [35m4.1113[0m  6.6125
     23        [36m4.0503[0m       [32m0.9427[0m        [35m4.1098[0m  6.4819
     24        [36m4.0491[0m       0.9404        [35m4.1082[0m  6.4985
     25        [36m4.0471[0m       0.9427        [35m4.1070[0m  6.4314
     26        [36m4.0452[0m       0.9414        [35m4.1059[0m  6.3382
     27        [36m4.0439[0m       [32m0.9434[0m        [35m4.1047[0m  6.4079
     28        [36m4.0425[0m       0.9430        [35m4.1040[0m  6.5185
     29        4.0427       0.9421        [35m4.1038[0m  6.4130
     30        [36m4.0407[0m       [32m0.9437[0m        [35m4.1031[0m  6.5538
     31        [36m4.0404[0m       0.9424        [35m4.1027[0m  6.4830
     32        [36m4.0398[0m       0.9424        [35m4.1027[0m  6.5192
     33        [36m4.0391[0m       0.9430        [35m4.1018[0m  6.6374
     34        [36m4.0387[0m       0.9407        4.1023  6.5581
     35

     62        4.0327       0.9440        4.0945  6.4249
     63        [36m4.0323[0m       0.9450        4.0936  6.4443
     64        [36m4.0323[0m       0.9460        [35m4.0930[0m  6.4440
     65        [36m4.0321[0m       0.9457        4.0934  6.5259
     66        4.0324       0.9450        [35m4.0929[0m  6.5915
     67        4.0322       0.9444        4.0934  6.5167
     68        4.0322       0.9437        4.0942  6.5535
     69        [36m4.0321[0m       0.9444        4.0929  6.6276
     70        [36m4.0320[0m       0.9450        4.0930  6.4476
     71        [36m4.0319[0m       0.9440        [35m4.0928[0m  6.4105
     72        4.0321       0.9447        [35m4.0923[0m  6.4377
     73        4.0319       0.9447        4.0938  6.4220
     74        [36m4.0318[0m       0.9447        4.0929  6.4105
     75        4.0319       0.9457        [35m4.0920[0m  6.4444
     76        [36m4.0317[0m       0.9454        4.0922  6.4296
     77        [36m4.0314[

     79        4.0317       0.9454        4.0921  6.6292
     80        [36m4.0315[0m       0.9434        4.0922  6.6057
     81        [36m4.0315[0m       0.9450        4.0920  6.4910
     82        4.0317       0.9447        4.0927  6.5474
     83        4.0315       0.9454        4.0910  6.5867
Stopping since valid_loss has not improved in the last 10 epochs.
Re-initializing module.
Re-initializing optimizer.
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m5.0158[0m       [32m0.4874[0m        [35m5.0102[0m  6.5338
      2        [36m4.9122[0m       0.4864        [35m4.7246[0m  6.6421
      3        [36m4.5979[0m       [32m0.6659[0m        [35m4.4811[0m  6.6101
      4        [36m4.4219[0m       [32m0.7603[0m        [35m4.3626[0m  6.7448
      5        [36m4.3218[0m       [32m0.8142[0m        [35m4.2924[0m  6.5438
      6        [36m4.2583[0m       [32m0.8447[0m   