In [22]:
import re
import math
import time
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import nltk
from nltk.tokenize import word_tokenize

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt

In [82]:
# Have a look at the data

train_path = "a3_data/wsd_train.txt"
test_path = "a3_data/wsd_test_blind.txt"

# Print the first record (lower-cased) of the training file and then of the
# blind test file, so the tab-separated layout of both can be compared by eye.
for sample_path in (train_path, test_path):
    with open(sample_path, encoding="utf-8") as f:
        first_record = next(f, None)
        if first_record is not None:
            print(first_record.lower())

keep%2:42:07::	keep.v	15	action by the committee in pursuance of its mandate , the committee will continue to keep under review the situation relating to the question of palestine and participate in relevant meetings of the general assembly and the security council . the committee will also continue to monitor the situation on the ground and draw the attention of the international community to urgent developments in the occupied palestinian territory , including east jerusalem , requiring international action .

?	physical.a	58	iaea pointed out that training and education were fundamental to the agency 's approach to enhancing physical protection systems in states . training courses , workshops and seminars that had been held on six continents had raised awareness and had provided hands-on experience of various subjects including the physical protection of research facilities , the practical operation of physical protection systems , and the engineering safety aspects of physical prote

# Load and preprocess the data

In [3]:
def load_data(file_path):
    """Read a tab-separated WSD file into a DataFrame.

    Every non-blank line must hold four tab-separated fields: sense key,
    lemma, position and text. The whole line is lower-cased before it is
    split, and the text is split on whitespace into a list of tokens.
    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of producing a row with empty fields.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded input file.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``Sense_key``, ``Lemma``,
        ``Position`` (kept as a string) and ``Text`` (list of tokens).

    Raises
    ------
    ValueError
        If a non-blank line has fewer than four tab-separated fields.
    """
    sense_list = []
    lemma_list = []
    position_list = []
    text_list = []

    with open(file_path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):

            if not line.strip():
                continue

            # maxsplit=3 keeps any further tabs inside the text field, where
            # the whitespace split below treats them like spaces.
            fields = line.lower().rstrip("\n").split("\t", 3)
            if len(fields) != 4:
                raise ValueError(
                    "%s, line %d: expected 4 tab-separated fields, found %d"
                    % (file_path, line_no, len(fields))
                )

            sense_key, lemma, position, text = fields

            sense_list.append(sense_key)
            lemma_list.append(lemma)
            position_list.append(position)
            text_list.append(text.split())

    df = pd.DataFrame(sense_list, columns=["Sense_key"])
    df["Lemma"] = lemma_list
    df["Position"] = position_list
    df["Text"] = text_list

    return df

In [104]:
class Preprocessing:
    """Turn the records of one lemma into padded index sequences and one-hot targets.

    The methods are meant to be called in this order: ``load_data``,
    ``build_vocabulary``, ``word_to_idx``, ``find_seq_len`` (optional, it
    overrides the ``seq_len`` given to the constructor), ``padding_sentences``,
    ``onehot_encode`` and ``split_data``. Each step stores its result on the
    instance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``Sense_key``, ``Lemma`` and ``Text``
        (``Text`` holding one list of tokens per row).
    num_words : int
        Maximum size of the vocabulary (most frequent tokens are kept).
    seq_len : int
        Length every index sequence is padded or truncated to.
    """

    def __init__(self, df, num_words, seq_len):
        self.data = df
        self.num_words = num_words
        self.seq_len = seq_len

        self.vocabulary = None
        self.x_tokenized = None
        self.x_embedded = None
        self.x_padded = None
        self.x_raw = None

        self.lemma = None
        self.n_outputs = None
        self.le = None
        self.y = None
        self.y_onehot = None

        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

    def load_data(self):
        """Split the frame into token lists (x) and integer-encoded sense keys (y)."""
        df = self.data
        self.x_raw = df.Text.values
        self.lemma = df.Lemma.iloc[0]
        self.n_outputs = len(df.Sense_key.unique())

        le = preprocessing.LabelEncoder()
        self.y = le.fit_transform(np.asarray(df.Sense_key.values))
        self.le = le

    def build_vocabulary(self):
        """Map the ``num_words`` most frequent tokens to the indices 1..num_words.

        Index 0 is left free because it is used as the padding index.
        """
        fdist = nltk.FreqDist()

        for sentence in self.x_raw:
            for word in sentence:
                fdist[word] += 1

        common_words = fdist.most_common(self.num_words)
        self.vocabulary = {word: idx + 1 for idx, (word, _count) in enumerate(common_words)}

    def word_to_idx(self):
        """Replace every token by its vocabulary index; tokens outside the vocabulary are dropped."""
        vocabulary = self.vocabulary
        self.x_tokenized = [
            [vocabulary[word] for word in sentence if word in vocabulary]
            for sentence in self.x_raw
        ]

    def find_seq_len(self):
        """Set ``seq_len`` to the token count of the longest raw text (0 if there is none)."""
        self.seq_len = max((len(item) for item in self.x_raw), default=0)

    def padding_sentences(self):
        """Bring every index sequence to exactly ``seq_len`` entries.

        Shorter sequences are right-padded with index 0 and longer ones are
        truncated, so the result is always a rectangular array. The lists in
        ``x_tokenized`` are left untouched.
        """
        pad_idx = 0
        padded = []

        for sentence in self.x_tokenized:
            fixed = list(sentence[:self.seq_len])
            fixed.extend([pad_idx] * (self.seq_len - len(fixed)))
            padded.append(fixed)

        self.x_padded = np.array(padded)

    def onehot_encode(self):
        """Create the one-hot representation of the integer targets in ``y``.

        The result has one row per sample and ``n_outputs`` columns, with a
        single 1.0 in the column given by the sample's encoded label.
        """
        labels = np.asarray(self.y, dtype=int)
        onehot = np.zeros((len(labels), self.n_outputs))
        onehot[np.arange(len(labels)), labels] = 1
        self.y_onehot = onehot

    def split_data(self, test_size=0.25, random_state=None):
        """Split padded inputs and one-hot targets into a training and a held-out part.

        Parameters
        ----------
        test_size : float, optional
            Share of the samples put into the held-out part (default 0.25).
        random_state : int or None, optional
            Seed forwarded to ``train_test_split``; pass an int for a
            reproducible split (default None, i.e. a different split per call).
        """
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x_padded, self.y_onehot, test_size=test_size, random_state=random_state
        )

## Check that it works

In [5]:
# Parse the training file into one DataFrame row per record.
df = load_data(train_path)

In [113]:
sense_list = ["positive%3:00:01::", "positive%5:00:00:advantageous:00"]

# Take an explicit copy: the filtered frame gets new columns below, and
# assigning columns on a slice of `df` triggers pandas' SettingWithCopyWarning.
df_pos = df[df.Lemma == "positive.a"].copy()
#df_pos = df_pos.loc[df_pos.Sense_key.isin(sense_list)]

# One 0/1 indicator column per distinct sense key of the lemma.
for i, val in enumerate(df_pos.Sense_key.unique()):

    col_name = "Onehot_" + str(i)
    df_pos[col_name] = (df_pos["Sense_key"] == val).astype(int)

df_pos.head()

In [114]:
len(df_pos)

1216

In [115]:
# Run the full preprocessing pipeline on the "positive.a" records.
# The calls are order-dependent: each step reads attributes set by the previous one.
data_pos = Preprocessing(df_pos, num_words = 6000, seq_len = 100)
data_pos.load_data()
data_pos.build_vocabulary()
data_pos.word_to_idx()
# find_seq_len() replaces the seq_len = 100 given above with the longest text length.
data_pos.find_seq_len()
data_pos.padding_sentences()
data_pos.onehot_encode()
data_pos.split_data()

In [117]:
# Token -> float32 vector lookup built from the GloVe text file: the first
# whitespace-separated field of a line is the token, the rest are its coefficients.
embeddings_dict = {}

with open("glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        embeddings_dict[values[0]] = np.asarray(values[1:], "float32")

In [120]:
embeddings_dict["the"]

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [124]:
data_pos.x_raw[0]

['thirdly',
 ',',
 'there',
 'are',
 'situations',
 'where',
 'national',
 'or',
 'international',
 'developments',
 'create',
 'an',
 'environment',
 'with',
 'the',
 'potential',
 'to',
 'significantly',
 'affect',
 ',',
 'whether',
 'positively',
 'or',
 'negatively',
 ',',
 'durability',
 'and',
 'sustainability',
 'of',
 'long-term',
 'solutions',
 ',',
 'including',
 'large-scale',
 'returns',
 '.',
 'in',
 'such',
 'circumstances',
 ',',
 'the',
 'purpose',
 'of',
 'the',
 'representative',
 "'s",
 'mission',
 'is',
 'to',
 'build',
 'upon',
 'positive',
 'developments',
 'as',
 'much',
 'as',
 'possible',
 'so',
 'as',
 ',',
 'to',
 'contribute',
 'to',
 'conditions',
 'on',
 'the',
 'ground',
 'which',
 'heighten',
 'the',
 'likelihood',
 'of',
 'sustainable',
 'progress',
 '.',
 'at',
 'the',
 'same',
 'time',
 ',',
 'the',
 'representative',
 'seeks',
 'to',
 'identify',
 'risks',
 'and',
 'steer',
 'actors',
 'away',
 'from',
 'courses',
 'of',
 'action',
 'that',
 'risk',


In [126]:
embeddings_dict[data_pos.x_raw[0][0]]

array([ 0.42115 ,  0.037718, -0.49961 , -0.48811 ,  0.21755 ,  0.45833 ,
        0.84379 , -0.029328, -0.22836 , -0.50353 ,  0.29288 ,  0.88652 ,
        0.075803, -0.89269 , -0.077111,  0.64782 ,  0.1009  , -0.2286  ,
        1.2372  ,  0.29613 ,  0.49105 ,  0.3105  ,  0.37639 , -0.10743 ,
        0.99254 ,  0.53388 ,  0.33742 , -0.72163 ,  0.6276  ,  0.17441 ,
       -0.070688,  0.63826 , -0.096589, -0.24625 ,  0.22688 , -0.25842 ,
       -0.19026 ,  0.47874 , -0.23385 ,  0.3112  , -0.24794 , -0.64864 ,
        0.7182  , -0.46691 , -0.080093, -0.16302 ,  0.078378, -0.48693 ,
        0.29361 ,  0.59439 ], dtype=float32)

In [127]:
data_pos.x_padded.shape

(1216, 236)

# Embedding the data

Are stopwords able to change the sense of a word? I think so!

- standing in line - waiting for something
- standing in a line - they're just standing 

Based on this, I will not remove stopwords. I will also leave in punctuation, but it seems like a good idea to lowercase the entire text. We're not doing NER, and I don't want Line and line to end up having two meanings - the position alone should clarify the sense. CBoW seems like a terrible choice in this setting - the word senses will almost certainly get lost. Try representation with pre-trained GloVe vectors.

https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db

idea: only embed the sentence containing the word in question (maybe later)
Use the word position in an attention model, or to determine the weights in a CNN/RNN (I think that is what an attention model does).
 
represent sentence/doc
one-hot encode labels

prediction: something with a softmax layer

CNNs seem promising, as they can model interactions between words (exactly what we want). They also have a local structure, which is great. (can steal code from demo notebook if I want to use this)

In [11]:
#train_df = load_data(train_path)
#train_df.head()

In [12]:
#test_df = load_data(test_path)
#test_df.head()

start out simple! ignore position, see it as a document classification problem

## Start with just one lemma 

Will onehot encode the sense key. This makes the most sense.

want a training accuracy score for each network

Could learn representation as I go, but there's not a lot of examples per unique sense_key, in some cases...
Also, this is a pain. Since the WSD texts appear to be generic enough, pretrained GloVe vectors should be ok. 
Do I use these as an initial guess or what? Also, GloVe only encodes one word at a time - so do I apply a context window myself as well?

For a 50-dimensional embedding of a 100-word document, we get a $100 \times 50$ matrix (one row per word). It seems to make sense to run a CNN over this!

output layer size should depend on the number of distinct senses for each lemma, so this is a lemma-by-lemma approach

Alternatively, sum all the word vectors to create one representation for the entire document, then feed it into a deep neural net with input size 50. However, this is a crude choice (summing discards word order) and an RNN is better: it can carry feedback through time if we input one word at a time.

# Classify using a CNN

## Parameters

In [87]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class Parameters:
    """Hyper-parameters shared by preprocessing, the model and the training loop."""

    # Preprocessing parameters
    num_words: int = 8000

    # Model parameters
    embedding_size: int = 64
    out_size: int = 32
    stride: int = 2

    # Training parameters
    epochs: int = 10
    batch_size: int = 12
    learning_rate: float = 0.001

    # Runtime parameters - will be different for each lemma
    n_outputs: Optional[int] = None
    seq_len: Optional[int] = None

    # Training parameter: size of the early-stopping window, in epochs.
    # It needs the annotation to be a real dataclass field (settable through
    # the constructor, shown in repr). It is declared last so that positional
    # construction with the fields above keeps its meaning.
    early_stopping_win: int = 5

## TextClassifier

In [14]:
class TextClassifier(nn.ModuleList):
    """CNN text classifier with four parallel convolution/max-pooling branches.

    Input is a batch of token-index sequences of length ``params.seq_len``;
    output is a ``(batch, params.n_outputs)`` tensor of log-probabilities.

    ``params`` must provide ``seq_len``, ``num_words``, ``embedding_size``,
    ``out_size``, ``stride`` and ``n_outputs``.

    NOTE(review): the Conv1d layers receive ``seq_len`` as their number of
    input channels, and the embedding output is ``(batch, seq_len,
    embedding_size)``. The filters therefore slide along the embedding
    dimension, not across word positions. This is kept as is because the
    layer shapes and ``in_features_fc`` depend on it.
    """

    def __init__(self, params):
        super(TextClassifier, self).__init__()

        # Parameters regarding text preprocessing
        self.seq_len = params.seq_len
        self.num_words = params.num_words
        self.embedding_size = params.embedding_size

        # Dropout definition
        self.dropout = nn.Dropout(0.25)

        # Kernel size of each of the four branches
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5

        # Number of output channels of each convolution
        self.out_size = params.out_size
        # Stride shared by all convolutions and poolings
        self.stride = params.stride

        # Embedding layer; index 0 is the padding index, hence num_words + 1 rows
        self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)

        # Convolution layers definition
        self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)

        # Max pooling layers definition
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)

        # Fully connected layer definition
        self.fc = nn.Linear(self.in_features_fc(), params.n_outputs)

        # Log-softmax output layer (pairs with nn.NLLLoss)
        self.log_softmax = nn.LogSoftmax(dim = 1)

    def in_features_fc(self):
        '''Calculates the number of features fed to the fully connected layer.

        For every branch, with padding 0 and dilation 1:

        Convolved_Features = floor((embedding_size - (kernel - 1) - 1) / stride) + 1
        Pooled_Features = floor((Convolved_Features - (kernel - 1) - 1) / stride) + 1

        The pooled lengths of the four branches are summed and multiplied by
        the number of output channels.

        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        '''
        pooled_total = 0
        for kernel in (self.kernel_1, self.kernel_2, self.kernel_3, self.kernel_4):
            out_conv = (self.embedding_size - (kernel - 1) - 1) // self.stride + 1
            out_pool = (out_conv - (kernel - 1) - 1) // self.stride + 1
            pooled_total += out_pool

        # Size of the "flattened" vector (input for the fully connected layer)
        return pooled_total * self.out_size

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, n_outputs)`` for index sequences ``x``."""

        # Sequence of tokens is filtered through an embedding layer
        x = self.embedding(x)

        # Every branch applies convolution -> ReLU -> max pooling to the same input
        branches = (
            (self.conv_1, self.pool_1),
            (self.conv_2, self.pool_2),
            (self.conv_3, self.pool_3),
            (self.conv_4, self.pool_4),
        )
        pooled = [pool(torch.relu(conv(x))) for conv, pool in branches]

        # The outputs of the branches are concatenated and flattened per sample
        union = torch.cat(pooled, 2)
        union = union.reshape(union.size(0), -1)

        # Dropout regularises the features, not the class scores: zeroing
        # logits right before the softmax would distort the training loss.
        union = self.dropout(union)

        # The "flattened" vector is passed through a fully connected layer
        out = self.fc(union)

        # No squeeze here: the output is already (batch, n_outputs), and a
        # squeeze would drop the class axis for a lemma with a single sense.
        return self.log_softmax(out)

Learned about the softmax outputs and loss function from here: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html

## Run

In [111]:
class DatasetMapper(Dataset):
    """Dataset that pairs the i-th entry of ``x`` with the i-th entry of ``y``."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The number of samples is taken from the inputs alone.
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        sample = (self.x[idx], self.y[idx])
        return sample

class Run:
    '''Training, evaluation and metrics calculation'''

    @staticmethod
    def train(model, data, params):
        '''Train ``model`` and return its latest training and held-out accuracy.

        Parameters
        ----------
        model : torch.nn.Module
            Maps a batch of index sequences to log-probabilities.
        data : dict
            Holds 'x_train', 'y_train', 'x_test' and 'y_test'; the y entries
            are one-hot encoded.
        params : object
            Provides ``batch_size``, ``learning_rate``, ``epochs`` and
            ``early_stopping_win``.

        Returns
        -------
        (float, float)
            Training and held-out accuracy of the last epoch that ran
            ((0.0, 0.0) if ``params.epochs`` is 0).

        Training stops early once the held-out loss has not improved on its
        best value for ``params.early_stopping_win`` consecutive epochs.
        '''

        # Initialize dataset mappers
        train = DatasetMapper(data['x_train'], data['y_train'])
        test = DatasetMapper(data['x_test'], data['y_test'])

        # Initialize loaders. The training loader is not shuffled, so the
        # collected predictions line up with data['y_train'].
        loader_train = DataLoader(train, batch_size=params.batch_size)
        loader_test = DataLoader(test, batch_size=params.batch_size)

        # Define loss function and optimizer
        loss_function = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

        # Early-stopping state
        best_loss = float('inf')
        epochs_without_improvement = 0

        train_accuracy = test_accuracy = 0.0

        # Starts training phase
        for epoch in range(params.epochs):
            # Set model in training mode
            model.train()
            predictions = []
            batch_losses = []

            # Starts batch training
            for x_batch, y_batch in loader_train:

                # Feed the model
                y_pred = model(x_batch.long())

                # NLLLoss expects class indices: take the position of the 1
                # in every one-hot row.
                y_true = y_batch.argmax(dim=1)
                loss = loss_function(y_pred, y_true)

                # Clean gradients
                optimizer.zero_grad()

                # Gradients calculation
                loss.backward()

                # Gradients update
                optimizer.step()

                # Save predictions
                predictions += list(y_pred.detach().numpy())
                batch_losses.append(loss.item())

            # Evaluation phase
            test_predictions = Run.evaluation(model, loader_test)

            # Metrics are refreshed every epoch so the returned values always
            # describe the last epoch that ran; they are printed every 5th.
            train_accuracy = Run.calculate_accuracy(data['y_train'], predictions)
            test_accuracy = Run.calculate_accuracy(data['y_test'], test_predictions)

            if epoch % 5 == 0:
                epoch_loss = float(np.mean(batch_losses)) if batch_losses else float('nan')
                print("Epoch: %d, loss: %.4f, Train accuracy: %.4f, Test accuracy: %.4f" % \
                      (epoch, epoch_loss, train_accuracy, test_accuracy))

            # Early stopping check on the held-out loss
            held_out_loss = Run._mean_nll(data['y_test'], test_predictions)
            if held_out_loss < best_loss:
                best_loss = held_out_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= params.early_stopping_win:
                    break

        return train_accuracy, test_accuracy

    @staticmethod
    def _mean_nll(onehot_targets, log_probs):
        '''Mean negative log-likelihood of one-hot targets under predicted log-probabilities.'''
        if len(log_probs) == 0:
            return float('nan')

        targets = np.asarray(onehot_targets, dtype=float)
        scores = np.asarray(log_probs, dtype=float).reshape(len(targets), -1)
        # np.where (not a product) so that -inf scores of other classes do not turn into nan
        picked = np.where(targets == 1, scores, 0.0)
        return float(-picked.sum(axis=1).mean())

    @staticmethod
    def evaluation(model, loader_test):
        '''Return the model outputs for every sample of ``loader_test`` as a list of arrays.'''

        # Set the model in evaluation mode
        model.eval()
        predictions = []

        # Start evaluation phase
        with torch.no_grad():
            for x_batch, y_batch in loader_test:
                y_pred = model(x_batch.long())
                predictions += list(y_pred.detach().numpy())
        return predictions

    @staticmethod
    def calculate_accuracy(grand_truth, predictions):
        '''Share of samples whose true class holds the maximum predicted score.

        ``grand_truth`` contains one-hot rows, ``predictions`` one score
        vector per sample. Returns 0.0 for empty input.
        '''
        total = len(grand_truth)
        if total == 0:
            return 0.0

        correct = 0
        for true, pred in zip(grand_truth, predictions):
            true = np.asarray(true)
            pred = np.asarray(pred)
            # A sample counts as correct when the entry marked 1 in the
            # target carries the highest score (ties included).
            if np.any((pred == pred.max()) & (true == 1)):
                correct += 1

        # Return accuracy
        return correct / total
    
    
    #def prediction(model, )
    

## Controller

In [109]:
class Controller(Parameters):
    """Preprocess the records of one lemma, train a TextClassifier on them and keep the scores.

    After construction the instance exposes ``lemma``, ``train_accuracy``,
    ``test_accuracy``, ``data`` (the train/held-out split), ``model`` and
    ``params`` (the Parameters instance used for this lemma).
    """

    def __init__(self, df):

        self.lemma = None
        self.train_accuracy = None
        self.test_accuracy = None

        # Preprocessing pipeline
        self.data, seq_len, n_outputs, lemma = self.prepare_data(df, Parameters.num_words, Parameters.seq_len)

        # The per-lemma values live on their own Parameters instance, so the
        # shared class-level defaults are not overwritten by every lemma.
        self.params = Parameters(seq_len=seq_len, n_outputs=n_outputs)
        # Mirrored on the controller so controller.seq_len / controller.n_outputs still resolve
        self.seq_len = seq_len
        self.n_outputs = n_outputs

        # Initialize the model
        self.model = TextClassifier(self.params)

        # Training - Evaluation pipeline
        train_accuracy, test_accuracy = Run.train(self.model, self.data, self.params)

        # Save accuracies
        self.lemma = lemma
        self.train_accuracy = train_accuracy
        self.test_accuracy = test_accuracy

    @staticmethod
    def prepare_data(df, num_words, seq_len):
        """Run the preprocessing pipeline on ``df``.

        Returns a 4-tuple: a dict with 'x_train', 'y_train', 'x_test' and
        'y_test', the sequence length found in the data, the number of
        distinct sense keys, and the lemma of the first row.
        """
        pr = Preprocessing(df, num_words, seq_len)
        pr.load_data()
        pr.build_vocabulary()
        pr.word_to_idx()
        pr.find_seq_len()
        pr.padding_sentences()
        pr.onehot_encode()
        pr.split_data()

        return {'x_train': pr.x_train, 'y_train': pr.y_train, 'x_test': pr.x_test, 'y_test': pr.y_test}, pr.seq_len, pr.n_outputs, pr.lemma

    @staticmethod
    def prepare_test_data(df, num_words, seq_len):
        """Placeholder for preprocessing the blind test data; not implemented.

        The previous body was an unfinished stub that failed with an
        AttributeError. Encoding blind data needs the vocabulary and sequence
        length of the training run, which ``prepare_data`` does not return.
        """
        raise NotImplementedError(
            "prepare_test_data is not implemented: it needs the training "
            "vocabulary and sequence length"
        )
    
#if __name__ == '__main__':
#    controller = Controller(df_pos)

## Run the code

In [17]:
# Load the training records (same call as in the earlier cell) and preview the first rows.
df = load_data(train_path)
df.head()

Unnamed: 0,Sense_key,Lemma,Position,Text
0,keep%2:42:07::,keep.v,15,"[action, by, the, committee, in, pursuance, of..."
1,national%3:01:00::,national.a,25,"[a, guard, of, honour, stood, in, formation, i..."
2,build%2:31:03::,build.v,38,"[the, principle, that, statistics, should, be,..."
3,place%1:04:00::,place.n,36,"[again, ,, he, appealed, for, additional, supp..."
4,position%1:04:01::,position.n,76,"[also, ,, the, iaea, has, the, lowest, number,..."


In [83]:
# Load the blind test records with the same parser and preview the first rows.
test_df = load_data(test_path)
test_df.head()

Unnamed: 0,Sense_key,Lemma,Position,Text
0,?,physical.a,58,"[iaea, pointed, out, that, training, and, educ..."
1,?,see.v,8,"[aid, official, development, assistance, (, od..."
2,?,line.n,39,"[she, would, appreciate, receiving, informatio..."
3,?,keep.v,42,"[we, look, forward, to, its, eventual, assessm..."
4,?,national.a,57,"[in, his, report, to, the, general, assembly, ..."


In [86]:
# Keep only the blind test rows of the lemma "positive.a" and preview them.
test_short = test_df[test_df.Lemma == "positive.a"]
test_short.head()

Unnamed: 0,Sense_key,Lemma,Position,Text
50,?,positive.a,48,"[foreign, direct, investment, is, the, largest..."
68,?,positive.a,36,"[if, there, is, evidence, that, the, vessel, h..."
185,?,positive.a,66,"[promotion, of, the, international, human, rig..."
406,?,positive.a,27,"[no, further, confrontation, took, place, betw..."
513,?,positive.a,18,"[from, september, 2002, to, july, 2003, ,, the..."


In [112]:
# Train and evaluate on a single lemma, then print a framed summary line.
df_short = df[df.Lemma == "positive.a"]
test_short = test_df[test_df.Lemma == "positive.a"]
controller = Controller(df_short)

rule = '-'*60
print(rule)
print("Lemma: %s, Final training accuracy: %.4f, Final test accuracy: %.4f" % \
              (controller.lemma, controller.train_accuracy, controller.test_accuracy))
print(rule)

Epoch: 0, loss: 1.4820, Train accuracy: 0.3191, Test accuracy: 0.3783


KeyboardInterrupt: 

In [90]:
# Run the preprocessing pipeline on the blind test rows of "positive.a".
# NOTE(review): build_vocabulary() derives a fresh vocabulary from these test
# texts, so the indices presumably do not match the ones a model trained on the
# training rows has seen - confirm before feeding x_padded to a trained model.
# NOTE(review): the previewed test rows all carry the sense key "?", so the
# encoded labels, the one-hot targets and the split look meaningless here.
tmp = Preprocessing(test_short, 8000, 100)
tmp.load_data()
tmp.build_vocabulary()
tmp.word_to_idx()
# find_seq_len() replaces the seq_len of 100 given above with the longest text length.
tmp.find_seq_len()
tmp.padding_sentences()
tmp.onehot_encode()
tmp.split_data()

In [95]:
tmp.x_padded[0]

array([ 316,  317,   79,   13,    1, 1153, 1154,    2,  502,  421,   10,
         80,   31,    3, 1155,   16, 1156,  503,    4,  629,  504,   48,
        318,   14,  229,  273,  163,   15,    5,   14,  230,  505,  204,
          6,   80,   31,    3,   25, 1762,    7,    1,  630,  832,    3,
         22, 1763, 1764,    6,    9, 1765,   15,  316,  317,   79,  204,
          6,   80,   31,   33,  833,   27,   40, 1766,  370,    7,  205,
          3, 1157,   16,  834,    8, 1158,   23,    1,   94, 1767,  503,
         22,   35,  631,    4,  231,    8, 1159,    4,  116, 1154,    2,
        421,   10,    8,   95,    2,  632,   80,   31,    5,  633,  634,
          4,    1, 1160,  835,    1, 1153,  142,  506,    2,  503,    3,
         81,  422, 1768, 1161,    1,  143,  503,   18,    8,  836,    2,
       1769,  371, 1162,    3,   27, 1770,   28,   29,    5,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Loop over all lemmas

In [99]:
# Train one model per lemma and collect the final accuracies.
lemma_vec = []
train_accuracy_vec = []
test_accuracy_vec = []

start_time = time.time()

for lemma in df.Lemma.unique():

    df_short = df[df.Lemma == lemma]
    controller = Controller(df_short)

    # Read the scores from the controller: bare `train_accuracy` and
    # `test_accuracy` are not defined in this cell and would raise a
    # NameError on a fresh kernel (or reuse stale values from another cell).
    lemma_vec.append(controller.lemma)
    train_accuracy_vec.append(controller.train_accuracy)
    test_accuracy_vec.append(controller.test_accuracy)

    print('-'*60)
    print("Lemma: %s, Final training accuracy: %.4f, Final test accuracy: %.4f" % \
                  (controller.lemma, controller.train_accuracy, controller.test_accuracy))
    print('-'*60)

elapsed_time = time.time() - start_time
print("Elapsed time: ", elapsed_time)

Epoch: 1, loss: 2.0071, Train accuracy: 0.3739, Test accuracy: 0.4581
Epoch: 2, loss: 1.4639, Train accuracy: 0.5942, Test accuracy: 0.4900


KeyboardInterrupt: 