In [1]:
import re
import math
import time
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import nltk
from nltk.tokenize import word_tokenize

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt

# Have a look at the data

In [2]:
# Have a look at the data

train_path = "a3_data/wsd_train.txt"
test_path = "a3_data/wsd_test_blind.txt"

with open(train_path, encoding = "utf-8") as f:
    for d, line in enumerate(f):
        print(line.lower())
        break
        
with open(test_path, encoding = "utf-8") as f:
    for d, line in enumerate(f):
        print(line.lower())
        break

keep%2:42:07::	keep.v	15	action by the committee in pursuance of its mandate , the committee will continue to keep under review the situation relating to the question of palestine and participate in relevant meetings of the general assembly and the security council . the committee will also continue to monitor the situation on the ground and draw the attention of the international community to urgent developments in the occupied palestinian territory , including east jerusalem , requiring international action .

?	physical.a	58	iaea pointed out that training and education were fundamental to the agency 's approach to enhancing physical protection systems in states . training courses , workshops and seminars that had been held on six continents had raised awareness and had provided hands-on experience of various subjects including the physical protection of research facilities , the practical operation of physical protection systems , and the engineering safety aspects of physical prote

# Load and preprocess the data

In [3]:
def load_data(file_path):
    
    sense_list = []
    lemma_list = []
    position_list = []
    text_list = []

    with open(file_path, encoding = "utf-8") as f:
        for d, line in enumerate(f):

            line = line.lower()

            ix = line.find("\t")
            sense_key = line[0:ix]
            line = line[ix+1:]

            ix = line.find("\t")
            lemma = line[0:ix]
            line = line[ix+1:]

            ix = line.find("\t")
            position = line[0:ix]
            text = line[ix+1:].split()

            sense_list.append(sense_key)
            lemma_list.append(lemma)
            position_list.append(position)
            text_list.append(text)
    
    df = pd.DataFrame(sense_list, columns = ["Sense_key"])
    df["Lemma"] = lemma_list
    df["Position"] = position_list
    df["Text"] = text_list

    del sense_list, lemma_list, position_list
    
    return df

In [62]:
class Preprocessing:
    def __init__(self, df, num_words, seq_len):
        self.data = df
        self.num_words = num_words
        self.seq_len = seq_len  
        
        self.vocabulary = None
        self.x_tokenized = None
        self.x_embedded = None
        self.x_padded = None
        self.x_raw = None
        
        self.lemma = None
        self.n_outputs = None
        self.le = None
        self.y = None
        self.y_onehot = None
        
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None  
    
    def load_data(self):
        # split into sentences (x) and sense key (y)
        df = self.data
        self.x_raw = df.Text.values
        self.lemma = df.Lemma.iloc[0]
        self.n_outputs = len(df.Sense_key.unique())
        
        labels = np.asarray(df.Sense_key.values)
        le = preprocessing.LabelEncoder()
        self.y = le.fit_transform(labels)
        self.le = le
        
    def build_vocabulary(self):
        # Builds the vocabulary 
        self.vocabulary = dict()
        fdist = nltk.FreqDist()

        for sentence in self.x_raw:
            for word in sentence:
                fdist[word] += 1

        common_words = fdist.most_common(self.num_words)

        for idx, word in enumerate(common_words):
            self.vocabulary[word[0]] = (idx+1)
            
    def word_to_idx(self):
        # By using the dictionary each token is transformed into its index based representation
        self.x_tokenized = list() 

        for sentence in self.x_raw:
            temp_sentence = list()
            for word in sentence:
                if word in self.vocabulary.keys():
                    temp_sentence.append(self.vocabulary[word])
            self.x_tokenized.append(temp_sentence)
        
    def find_seq_len(self):
        # Find length of the longest line in the data
        max_len = 0
        for item in self.x_raw:
    
            if len(item) > max_len:
                max_len = len(item)
        
        self.seq_len = max_len
    
    def padding_sentences(self):
        # Each sentence which does not fulfill the required length is padded with the index 0
        pad_idx = 0
        self.x_padded = list()

        for sentence in self.x_tokenized:
            while len(sentence) < self.seq_len:
                sentence.insert(len(sentence), pad_idx)

            self.x_padded.append(sentence)
            
        self.x_padded = np.array(self.x_padded) 
        
    def onehot_encode(self):
        # Create a onehot encoded representation of the targets
        self.y_onehot = list()
        y_idx = self.le.inverse_transform(self.y)
        
        for i in range(len(self.y)):
            
            tmp = np.zeros(self.n_outputs)
        
            for k in range(self.n_outputs):
                if self.data.Sense_key.iloc[i] == y_idx[i]:
                    tmp[self.y[i]] = 1
                    
            self.y_onehot.append(tmp)
            
        self.y_onehot = np.array(self.y_onehot)
            
    def split_data(self):
        self.x_train, self.x_test, self.y_train, self.y_test = \
        train_test_split(self.x_padded, self.y_onehot, test_size=0.25, random_state=None)

## Check that it works

In [5]:
df = load_data(train_path)
df_short = df[df.Lemma == "positive.a"]

In [6]:
data_pos = Preprocessing(df_short, num_words = 6000, seq_len = 200)
data_pos.load_data()
data_pos.build_vocabulary()
data_pos.word_to_idx()
data_pos.find_seq_len()
data_pos.padding_sentences()
data_pos.onehot_encode()
data_pos.split_data()

## GloVe

In [10]:
embeddings_dict = {}

with open("glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [11]:
embeddings_dict["romania"]

array([ 0.1232   ,  0.36199  ,  0.13819  ,  0.1923   , -0.93796  ,
        0.70297  ,  0.57263  ,  0.91297  , -0.69626  , -0.054828 ,
        1.2394   , -0.87465  ,  0.91791  , -0.28632  ,  0.71912  ,
       -0.23525  ,  0.075219 , -0.14894  ,  0.41694  ,  1.0402   ,
       -0.36619  , -1.3843   , -0.06398  ,  0.38334  ,  0.50793  ,
       -1.3401   ,  0.81819  , -0.084923 , -0.83609  , -0.68585  ,
        1.8433   ,  0.8701   , -0.13934  , -0.2725   , -1.367    ,
        0.22925  , -0.041979 ,  0.80299  ,  0.038621 , -0.38195  ,
        0.0072519,  0.20913  ,  1.0864   , -2.0325   , -0.46558  ,
        0.52418  , -0.40482  ,  0.10702  ,  0.021184 , -1.1139   ],
      dtype=float32)

In [12]:
data_pos.x_tokenized[0]

[2989,
 2,
 54,
 21,
 750,
 112,
 59,
 27,
 44,
 334,
 989,
 34,
 203,
 14,
 1,
 503,
 6,
 1063,
 1064,
 2,
 335,
 1557,
 27,
 2184,
 2,
 5048,
 4,
 1223,
 3,
 1224,
 1429,
 2,
 81,
 5049,
 652,
 5,
 7,
 58,
 1430,
 2,
 1,
 866,
 3,
 1,
 369,
 40,
 276,
 16,
 6,
 1225,
 569,
 9,
 334,
 17,
 484,
 17,
 225,
 199,
 17,
 2,
 6,
 717,
 6,
 258,
 12,
 1,
 867,
 30,
 2990,
 1,
 2991,
 3,
 343,
 370,
 5,
 24,
 1,
 269,
 204,
 2,
 1,
 369,
 3715,
 6,
 1313,
 1558,
 4,
 3716,
 2495,
 2496,
 22,
 1719,
 3,
 239,
 11,
 318,
 718,
 8,
 2497,
 96,
 12,
 1,
 64,
 3,
 1,
 2498,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0

In [13]:
embeddings_dict[data_pos.x_raw[0][0]]

array([ 0.42115 ,  0.037718, -0.49961 , -0.48811 ,  0.21755 ,  0.45833 ,
        0.84379 , -0.029328, -0.22836 , -0.50353 ,  0.29288 ,  0.88652 ,
        0.075803, -0.89269 , -0.077111,  0.64782 ,  0.1009  , -0.2286  ,
        1.2372  ,  0.29613 ,  0.49105 ,  0.3105  ,  0.37639 , -0.10743 ,
        0.99254 ,  0.53388 ,  0.33742 , -0.72163 ,  0.6276  ,  0.17441 ,
       -0.070688,  0.63826 , -0.096589, -0.24625 ,  0.22688 , -0.25842 ,
       -0.19026 ,  0.47874 , -0.23385 ,  0.3112  , -0.24794 , -0.64864 ,
        0.7182  , -0.46691 , -0.080093, -0.16302 ,  0.078378, -0.48693 ,
        0.29361 ,  0.59439 ], dtype=float32)

## Embedding the data (not used)

Are stopwords able to change the sense of a word? I think so!

- standing in line - waiting for something
- standing in a line - they're just standing 

Based on this, I will not remove stopwords. I will also leave in punctuation, but it seems like a good idea to lowercase the entire text. We're not doing NER, and I don't want Line and line to end up having two meanings - the position alone should clarify the sense. CBoW seems like a terrible choice in this setting - the word senses will almost certainly get lost. Try representation with pre-trained GloVe vectors.

https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db

idea: only embed the sentence containing the word in question (maybe later)
use word position in an attention model, or for determining weights in a CNN/RNN (think that is an attention model)
 
represent sentence/doc
one-hot encode labels

prediction: something with a softmax layer

CNNs seem promising, as they can model interactions between words (exactly what we want). They also have a local structure, which is great. (can steal code from demo notebook if I want to use this)

In [14]:
#train_df = load_data(train_path)
#train_df.head()

In [15]:
#test_df = load_data(test_path)
#test_df.head()

start out simple! ignore position, see it as a document classification problem

Will onehot encode the sense key. This makes the most sense.

want a training accuracy score for each network

Could learn representation as I go, but there's not a lot of examples per unique sense_key, in some cases...
Also, this is a pain. Since the WSD texts appear to be generic enough, pretrained GloVe vectors should be ok. 
Do I use these as an initial guess or what? Also, GloVe only encodes one word at a time - so do I apply a context window myself as well?

for a 50-dim embedding of a 100-word document, we get a $100*50$ matrix. Seems to make sense to run a CNN over this! 

output layer size should depend on the number of distinct senses for each lemma, so this is a lemma-by-lemma approach

or try summing up all the vectors to create one representation for the entire document, then input it into a deep neural net of size 50. however this is silly and a RNN is better, can then have feedback in time if we input one word at a time.

# Classify using a CNN

## Parameters

In [147]:
from dataclasses import dataclass

@dataclass
class Parameters:

    # Preprocessing parameeters
    num_words: int = 8000

    # Model parameters
    embedding_size: int = 64
    out_size: int = 32
    stride: int = 2

    # Training parameters
    epochs: int = 2
    batch_size: int = 12
    learning_rate: float = 0.001
    early_stopping_win = 5
        
    # Runtime parameters - will be different for each lemma
    n_outputs: int = None
    seq_len: int = None

## TextClassifier

In [44]:
class TextClassifier(nn.ModuleList):

    def __init__(self, params):
        super(TextClassifier, self).__init__()

        # Parameters regarding text preprocessing
        self.seq_len = params.seq_len
        self.num_words = params.num_words
        self.embedding_size = params.embedding_size

        # Dropout definition
        self.dropout = nn.Dropout(0.25)

        # CNN parameters definition
        # Kernel sizes
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5

        # Output size for each convolution
        self.out_size = params.out_size
        # Number of strides for each convolution
        self.stride = params.stride

        # Embedding layer definition
        self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)

        # Convolution layers definition
        self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)

        # Max pooling layers definition
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)

        # Fully connected layer definition
        self.fc = nn.Linear(self.in_features_fc(), params.n_outputs)
        
        # Softmax output layer definition
        self.log_softmax = nn.LogSoftmax(dim = 1)

    def in_features_fc(self):
        '''Calculates the number of output features after Convolution + Max pooling

        Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1

        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        '''
        
        # Calculate size of convolved/pooled features for convolution_1/max_pooling_1 features
        out_conv_1 = ((self.embedding_size - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_conv_1 = math.floor(out_conv_1)
        out_pool_1 = ((out_conv_1 - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_pool_1 = math.floor(out_pool_1)

        # Calculate size of convolved/pooled features for convolution_2/max_pooling_2 features
        out_conv_2 = ((self.embedding_size - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_conv_2 = math.floor(out_conv_2)
        out_pool_2 = ((out_conv_2 - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_pool_2 = math.floor(out_pool_2)

        # Calculate size of convolved/pooled features for convolution_3/max_pooling_3 features
        out_conv_3 = ((self.embedding_size - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_conv_3 = math.floor(out_conv_3)
        out_pool_3 = ((out_conv_3 - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_pool_3 = math.floor(out_pool_3)

        # Calculate size of convolved/pooled features for convolution_4/max_pooling_4 features
        out_conv_4 = ((self.embedding_size - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_conv_4 = math.floor(out_conv_4)
        out_pool_4 = ((out_conv_4 - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_pool_4 = math.floor(out_pool_4)

        # Returns "flattened" vector (input for fully connected layer)
        return (out_pool_1 + out_pool_2 + out_pool_3 + out_pool_4) * self.out_size
    

    def forward(self, x):

        # Sequence of tokens is filtered through an embedding layer
        x = self.embedding(x)

        # Convolution layer 1 is applied
        x1 = self.conv_1(x)
        x1 = torch.relu(x1)
        x1 = self.pool_1(x1)

        # Convolution layer 2 is applied
        x2 = self.conv_2(x)
        x2 = torch.relu((x2))
        x2 = self.pool_2(x2)

        # Convolution layer 3 is applied
        x3 = self.conv_3(x)
        x3 = torch.relu(x3)
        x3 = self.pool_3(x3)

        # Convolution layer 4 is applied
        x4 = self.conv_4(x)
        x4 = torch.relu(x4)
        x4 = self.pool_4(x4)

        # The output of each convolutional layer is concatenated into a unique vector
        union = torch.cat((x1, x2, x3, x4), 2)
        union = union.reshape(union.size(0), -1)
        
        # The "flattened" vector is passed through a fully connected layer
        out = self.fc(union)
        # Dropout is applied
        out = self.dropout(out)
        out = self.log_softmax(out)

        # Use this, or there's a dim-0 error when a batch contains only one value
        if len(out) > 1:
            return out.squeeze()
        else:
            return out

Learned about the softmax outputs and loss function from here: https://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html

## Run

In [201]:
class DatasetMapper(Dataset):

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

class Run:
    '''Training, evaluation and metrics calculation'''

    @staticmethod
    def train(model, data, params):

        # Initialize dataset maper
        train = DatasetMapper(data['x_train'], data['y_train'])
        test = DatasetMapper(data['x_test'], data['y_test'])

        # Initialize loaders
        loader_train = DataLoader(train, batch_size=params.batch_size)
        loader_test = DataLoader(test, batch_size=params.batch_size)

        # Define loss function and optimizer
        loss_function = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)
        
        # Define vector for early stopping
        prev_loss = np.zeros(params.early_stopping_win)

        # Starts training phase
        for epoch in range(params.epochs):
            # Set model in training model
            model.train()
            predictions = []
            # Starts batch training
            for x_batch, y_batch in loader_train:

                y_batch = y_batch.type(torch.FloatTensor)

                # Feed the model
                y_pred = model(x_batch.long())
                          
                # Transform back from onehot encoded targets
                y_true = np.zeros(y_batch.shape[0])
                
                for i in range(y_batch.shape[0]):
                    for j in range(y_batch.shape[1]):
                        if y_batch[i,j] == 1:
                            y_true[i] = j
            
                y_true = torch.from_numpy(y_true).long()

                # Loss calculation
                loss = loss_function(y_pred, y_true)

                # Clean gradientes
                optimizer.zero_grad()

                # Gradients calculation
                loss.backward()

                # Gradients update
                optimizer.step()

                # Save predictions
                predictions += list(y_pred.detach().numpy())
                
            # Evaluation phase
            test_predictions = Run.evaluation(model, loader_test)
            
            # Metrics calculation
            train_accuracy = Run.calculate_accuracy(data['y_train'], predictions)
            test_accuracy = Run.calculate_accuracy(data['y_test'], test_predictions)
            
            if epoch % 5 == 0:
                print("Epoch: %d, loss: %.4f, Train accuracy: %.4f, Test accuracy: %.4f" % \
                      (epoch, loss.item(), train_accuracy, test_accuracy))
            
            # Early stopping check
            if epoch > 10:
                if loss.item() < min(prev_loss):
                    prev_loss = prev_loss[1:]
                    prev_loss = np.append(prev_loss, loss.item())
                else:
                    break
                
        return train_accuracy, test_accuracy

    @staticmethod
    def evaluation(model, loader_test):

        # Set the model in evaluation mode
        model.eval()
        predictions = []

        # Start evaluation phase
        with torch.no_grad():
            for x_batch, y_batch in loader_test:
                y_pred = model(x_batch.long())
                predictions += list(y_pred.detach().numpy())
        return predictions
        
    @staticmethod
    def calculate_accuracy(grand_truth, predictions):
        # Metrics calculation
        correct = 0
        
        for true, pred in zip(grand_truth, predictions):
    
            for i, element in enumerate(pred):
                if element == max(pred) and true[i] == 1:
                    correct += 1
                else:
                    pass
            
        # Return accuracy
        return (correct) / len(grand_truth)
    
    @staticmethod
    def prediction(model, data, le, params):
        
        # Initialize loader
        loader = DataLoader(data, batch_size=params.batch_size, shuffle=False)
        
        model.eval()
        predictions = []
        
        with torch.no_grad():
            for batch in loader:
                #print("In prediction, batch: ", batch.shape)
                pred = model(batch.long())
                predictions += list(pred.detach().numpy())
                
        sense_pred = []        
        for line in predictions:
            for i, val in enumerate(line):
                if val == max(line):
                    sense_pred.append(i)
                    
        sense_pred = le.inverse_transform(sense_pred)
        
        # Return the predicted senses
        return sense_pred
                

## Controller

In [212]:
class Controller(Parameters):

    def __init__(self, df, validation_df):
        
        self.lemma = None
        self.train_accuracy = None
        self.test_accuracy = None
        
        # Preprocessing pipeline
        self.data, lemma, n_outputs, le, vocabulary, seq_len = self.prepare_data(df, Parameters.num_words, Parameters.seq_len)
        
        self.le = le
        self.lemma = lemma
        self.vocabulary = vocabulary
        Parameters.seq_len = seq_len
        Parameters.n_outputs = n_outputs  

        # Initialize the model
        self.model = TextClassifier(Parameters)

        # Training - Evaluation pipeline
        train_accuracy, test_accuracy = Run().train(self.model, self.data, Parameters)

        # Save accuracies
        self.train_accuracy = train_accuracy
        self.test_accuracy = test_accuracy
        
        # Make predictions on valdiation dataset
        self.validation_data = self.prepare_validation_data(validation_df, self.vocabulary, Parameters.seq_len)
        #self.sense_pred = Run().prediction(self.model, self.validation_data, self.le, Parameters)
 
    @staticmethod
    def prepare_data(df, num_words, seq_len):
        # Preprocessing pipeline
        pr = Preprocessing(df, num_words, seq_len)
        pr.load_data()
        pr.build_vocabulary()
        pr.word_to_idx()
        pr.find_seq_len()
        pr.padding_sentences()
        pr.onehot_encode()
        pr.split_data()

        return ({'x_train': pr.x_train, 'y_train': pr.y_train, 'x_test': pr.x_test, 'y_test': pr.y_test}, \
                pr.lemma, pr.n_outputs, pr.le, pr.vocabulary, pr.seq_len)
   
    @staticmethod
    def prepare_validation_data(df, vocabulary, seq_len):
        
        num_words = len(vocabulary)

        pr = Preprocessing(test_short, num_words, seq_len)
        pr.load_data()
        pr.vocabulary = vocabulary
        pr.word_to_idx()
        pr.seq_len = seq_len
        pr.padding_sentences()

        return pr.x_padded

    # if __name__ == '__main__':
    #    controller = Controller(df_pos)

In [167]:
# doesn't work for 'hold.v', 'common.a'

In [214]:
test_lemma = 'hold.v'

df_short = df[df.Lemma == test_lemma]
test_short = test_df[test_df.Lemma == test_lemma]
controller = Controller(df_short, test_short)

print('-'*60)
print("Lemma: %s, Final training accuracy: %.4f, Final test accuracy: %.4f" % \
              (controller.lemma, controller.train_accuracy, controller.test_accuracy))
print('-'*60)

Epoch: 0, loss: 2.2451, Train accuracy: 0.1587, Test accuracy: 0.1790
------------------------------------------------------------
Lemma: hold.v, Final training accuracy: 0.4650, Final test accuracy: 0.2417
------------------------------------------------------------


In [215]:
sense_pred = Run().prediction(controller.model, controller.validation_data, controller.le, Parameters)
sense_pred

AttributeError: 'list' object has no attribute 'long'

<__main__.DatasetMapper at 0x20bd63e3e08>

In [250]:
le = controller.le
data = controller.validation_data
model = controller.model    

loader = DataLoader(data, batch_size=Parameters.batch_size, shuffle=False)
  
model.eval()
predictions = []

with torch.no_grad():
    for batch in loader:
        print("In prediction, batch: ", len(batch))
        pred = model(batch.long())
        predictions += list(pred.detach().numpy())
        
sense_pred = []        
for line in predictions:
    for i, val in enumerate(line):
        if val == max(line):
            sense_pred.append(i)

sense_pred = le.inverse_transform(sense_pred)

In prediction, batch:  244


AttributeError: 'list' object has no attribute 'long'

In [269]:
len(batch)

244

In [270]:
len(batch[0])

12

In [268]:
if isinstance(batch, list):
    tensor_batch = []

    for line in batch:
        tmp = line.numpy()
        tensor_batch.append(tmp)

    tensor_batch = torch.from_numpy(np.array(tensor_batch)).long()
    
tensor_batch.shape

torch.Size([244, 12])

In [267]:
pred = model(tensor_batch)

RuntimeError: Given groups=1, weight of size [32, 244, 2], expected input[244, 12, 64] to have 244 channels, but got 12 channels instead

## Run the code

In [21]:
df = load_data(train_path)
test_df = load_data(test_path)

Unnamed: 0,Sense_key,Lemma,Position,Text
0,?,physical.a,58,"[iaea, pointed, out, that, training, and, educ..."
1,?,see.v,8,"[aid, official, development, assistance, (, od..."
2,?,line.n,39,"[she, would, appreciate, receiving, informatio..."
3,?,keep.v,42,"[we, look, forward, to, its, eventual, assessm..."
4,?,national.a,57,"[in, his, report, to, the, general, assembly, ..."


In [189]:
df[df.Lemma == test_lemma].Sense_key.value_counts()

hold%2:36:00::    475
hold%2:40:00::    442
hold%2:31:01::    442
hold%2:35:03::    436
hold%2:40:04::    431
hold%2:32:11::    267
hold%2:42:00::    250
hold%2:31:10::    191
hold%2:40:02::     91
hold%2:35:00::     55
hold%2:41:15::     46
Name: Sense_key, dtype: int64

### Loop over all lemmas

In [181]:
lemma_vec = []
train_accuracy_vec = []
test_accuracy_vec = []
predicted_df = test_df.copy()

start_time = time.time()

for lemma in df.Lemma.unique()[20:]:
    
    df_short = df[df.Lemma == lemma]
    test_short = test_df[test_df.Lemma == lemma]
    controller = Controller(df_short, test_short)
    
    print('-'*60)
    print("Lemma: %s, Final training accuracy: %.4f, Final test accuracy: %.4f" % \
                  (controller.lemma, controller.train_accuracy, controller.test_accuracy))
    print('-'*60)
    
    # Append accuracies for each lemma
    lemma_vec.append(controller.lemma)
    train_accuracy_vec.append(controller.train_accuracy)
    test_accuracy_vec.append(controller.test_accuracy)
    
    # Make predictions
    predictions = controller.sense_pred
    for k, idx in enumerate(test_short.index):
        predicted_df.iloc[idx].Sense_key = predictions[k]
    
elapsed_time = time.time() - start_time
print("Elapsed time: ", elapsed_time)  

Epoch: 0, loss: 2.1964, Train accuracy: 0.2385, Test accuracy: 0.2767
------------------------------------------------------------
Lemma: find.v, Final training accuracy: 0.2385, Final test accuracy: 0.2767
------------------------------------------------------------
Epoch: 0, loss: 2.0176, Train accuracy: 0.2480, Test accuracy: 0.2351
------------------------------------------------------------
Lemma: life.n, Final training accuracy: 0.2480, Final test accuracy: 0.2351
------------------------------------------------------------
Epoch: 0, loss: 1.7722, Train accuracy: 0.2848, Test accuracy: 0.2934
------------------------------------------------------------
Lemma: order.n, Final training accuracy: 0.2848, Final test accuracy: 0.2934
------------------------------------------------------------
Epoch: 0, loss: 2.1438, Train accuracy: 0.2000, Test accuracy: 0.2131
------------------------------------------------------------
Lemma: bring.v, Final training accuracy: 0.2000, Final test accu