In [261]:
import torch
from torch import nn

import nltk
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [93]:
train_path = "a3_data/wsd_train.txt"
test_path = "a3_data/wsd_test_blind.txt"

with open(train_path, encoding = "utf-8") as f:
    for d, line in enumerate(f):
        print(line.lower())
        break
        
with open(test_path, encoding = "utf-8") as f:
    for d, line in enumerate(f):
        print(line.lower())
        break

keep%2:42:07::	keep.v	15	action by the committee in pursuance of its mandate , the committee will continue to keep under review the situation relating to the question of palestine and participate in relevant meetings of the general assembly and the security council . the committee will also continue to monitor the situation on the ground and draw the attention of the international community to urgent developments in the occupied palestinian territory , including east jerusalem , requiring international action .

?	physical.a	58	iaea pointed out that training and education were fundamental to the agency 's approach to enhancing physical protection systems in states . training courses , workshops and seminars that had been held on six continents had raised awareness and had provided hands-on experience of various subjects including the physical protection of research facilities , the practical operation of physical protection systems , and the engineering safety aspects of physical prote

# Load the data

In [208]:
def load_data(file_path):
    
    sense_list = []
    lemma_list = []
    position_list = []
    text_list = []

    with open(file_path, encoding = "utf-8") as f:
        for d, line in enumerate(f):

            line = line.lower()

            ix = line.find("\t")
            sense_key = line[0:ix]
            line = line[ix+1:]

            ix = line.find("\t")
            lemma = line[0:ix]
            line = line[ix+1:]

            ix = line.find("\t")
            position = line[0:ix]
            text = line[ix+1:].split()

            #if d == 0:
            #    print("sense_key ", sense_key)
            #    print("lemma ", lemma)
            #    print("position ", position)
            #    print("text ", text)

            sense_list.append(sense_key)
            lemma_list.append(lemma)
            position_list.append(position)
            text_list.append(text)

            #if d == 10000:
            #    break

    #print(d)
    
    df = pd.DataFrame(sense_list, columns = ["Sense_key"])
    df["Lemma"] = lemma_list
    df["Position"] = position_list
    df["Text"] = text_list

    del sense_list, lemma_list, position_list
    
    return df

In [270]:
class Preprocessing:
    def __init__(self, df):
        self.data = df
        self.lemma = None
        
        self.line_length = None    
        self.vocabulary = None
        
        self.x_tokenized = None
        self.x_embedded = None
        self.x_padded = None
        self.x_raw = None
        self.y = None
        
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
    
    def load_data(self):
        # split into sentences (x) and sense key (y)
        df = self.data
        self.x_raw = df.Text.values
        self.y = df.Sense_key.values
        self.lemma = df.Lemma.iloc[0]
        
    def build_vocabulary(self):
        # Builds the vocabulary 
        self.vocabulary = dict()
        fdist = nltk.FreqDist()

        for sentence in self.x_raw:
            for word in sentence:
                fdist[word] += 1

        common_words = fdist.most_common()

        for idx, word in enumerate(common_words):
            self.vocabulary[word[0]] = (idx+1)
            
    def word_to_idx(self):
        # By using the dictionary (vocabulary), it is transformed
        # each token into its index based representatio
        self.x_tokenized = list() 

        for sentence in self.x_raw:
            temp_sentence = list()
            for word in sentence:
                if word in self.vocabulary.keys():
                    temp_sentence.append(self.vocabulary[word])
            self.x_tokenized.append(temp_sentence)
        
    def find_line_length(self):
        
        max_len = 0
        for item in self.x_raw:
    
            if len(item) > max_len:
                max_len = len(item)
        
        self.line_length = max_len
    
    def padding_sentences(self):
        # Each sentence which does not fulfill the required length is padded with the index 0
        pad_idx = 0
        self.x_padded = list()

        for sentence in self.x_tokenized:
            while len(sentence) < self.line_length:
                sentence.insert(len(sentence), pad_idx)

            self.x_padded.append(sentence)
            
        self.x_padded = np.array(self.x_padded)
    
    def split_data(self):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_padded, self.y, test_size=0.25, random_state=42)

In [271]:
df_pos = df[df.Lemma == "positive.a"]

data_pos = Preprocessing(df_pos)
data_pos.load_data()
data_pos.build_vocabulary()
data_pos.word_to_idx()
data_pos.find_line_length()
data_pos.padding_sentences()
data_pos.split_data()

In [272]:
data_pos.x_padded[0]

array([2990,    3,   55,   22,  751,  113,   60,   28,   45,  335,  990,
         35,  204,   15,    2,  504,    7, 1064, 1065,    3,  336, 1558,
         28, 2185,    3, 5049,    5, 1224,    4, 1225, 1430,    3,   82,
       5050,  653,    6,    8,   59, 1431,    3,    2,  867,    4,    2,
        370,   41,  277,   17,    7, 1226,  570,   10,  335,   18,  485,
         18,  226,  200,   18,    3,    7,  718,    7,  259,   13,    2,
        868,   31, 2991,    2, 2992,    4,  344,  371,    6,   25,    2,
        270,  205,    3,    2,  370, 3716,    7, 1314, 1559,    5, 3717,
       2496, 2497,   23, 1720,    4,  240,   12,  319,  719,    9, 2498,
         97,   13,    2,   65,    4,    2, 2499,    6,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,   

# Embedding the data

Are stopwords able to change the sense of a word? I think so!

- standing in line - waiting for something
- standing in a line - they're just standing 

Based on this, I will not remove stopwords. I will also leave in punctuation, but it seems like a good idea to lowercase the entire text. We're not doing NER, and I don't want Line and line to end up having two meanings - the position alone should clarify the sense. CBoW seems like a terrible choice in this setting - the word senses will almost certainly get lost. Try representation with pre-trained GloVe vectors.

https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db

idea: only embed the sentence containing the word in question (maybe later)
use word position in an attention model, or for determining weights in a CNN/RNN (think that is an attention model)
 
represent sentence/doc
one-hot encode labels

prediction: something with a softmax layer

CNNs seem promising, as they can model interactions between words (exactly what we want). They also have a local structure, which is great. (can steal code from demo notebook if I want to use this)

In [209]:
train_df = load_data(train_path)
train_df.head()

Unnamed: 0,Sense_key,Lemma,Position,Text
0,keep%2:42:07::,keep.v,15,"[action, by, the, committee, in, pursuance, of..."
1,national%3:01:00::,national.a,25,"[a, guard, of, honour, stood, in, formation, i..."
2,build%2:31:03::,build.v,38,"[the, principle, that, statistics, should, be,..."
3,place%1:04:00::,place.n,36,"[again, ,, he, appealed, for, additional, supp..."
4,position%1:04:01::,position.n,76,"[also, ,, the, iaea, has, the, lowest, number,..."


In [210]:
test_df = load_data(test_path)
test_df.head()

Unnamed: 0,Sense_key,Lemma,Position,Text
0,?,physical.a,58,"[iaea, pointed, out, that, training, and, educ..."
1,?,see.v,8,"[aid, official, development, assistance, (, od..."
2,?,line.n,39,"[she, would, appreciate, receiving, informatio..."
3,?,keep.v,42,"[we, look, forward, to, its, eventual, assessm..."
4,?,national.a,57,"[in, his, report, to, the, general, assembly, ..."


start out simple! ignore position, see it as a document classification problem

In [117]:
from collections import Counter

def count_word_frequencies(YOUR_FILE, ENCODING):
    
    freqs = Counter()
    with open(YOUR_FILE, encoding = ENCODING) as f:
        for line in f:
            tokens = line.lower().split()
            for token in tokens:
                freqs[token] += 1
                
    return freqs

In [118]:
def map_to_int(docs:list()) -> (list(), int, Counter()):
    
    '''
    Function from assignment 2.
    
    Create bag of words from cleaned and smaller corpus.
    Associate each word in bag with an unique integer,
    ranging from 0 (most common word) to length of bag of words.
    Map each token in docs to the respective int. Return this list of list of ints.
    '''
    
    freqs = Counter()
    for doc in docs:
        for token in doc:
            freqs[token] += 1
    most_common = freqs.most_common()
    
    token_to_int = []
    for i in range(len(most_common)):
        token_to_int.append(most_common[i][0])

    # Get pairs of elements    
    mapping = zip(token_to_int, range(0,len(token_to_int)))
    
    # Make pairs into a dictionary
    vocab = dict(mapping)
        
    # Match token to int
    docs_int = list()
    
    for doc in docs:
        docs_int.append(list([vocab.get(x) for x in doc]))
        
    return docs_int, len(vocab), vocab

In [133]:
texts_int, vocab_size, vocab = map_to_int(text_list)
df["Text_int"] = texts_int

In [138]:
df.Lemma.value_counts()

see.v             6538
line.n            5545
keep.v            5393
follow.v          3297
hold.v            3126
serve.v           3030
force.n           2753
lead.v            2549
build.v           2495
bring.v           2494
extend.v          2452
find.v            2399
case.n            2356
position.n        2221
security.n        2164
national.a        2146
life.n            2141
time.n            2121
professional.a    2004
order.n           2004
regular.a         1974
place.n           1931
point.n           1913
physical.a        1895
common.a          1744
bad.a             1732
critical.a        1567
major.a           1507
active.a          1342
positive.a        1216
Name: Lemma, dtype: int64

## Start with just one lemma - positive

In [199]:
df_pos = df[df.Lemma == "positive.a"]
df_pos.shape

(1216, 5)

In [200]:
df_pos.Sense_key.value_counts()

positive%3:00:01::                  431
positive%5:00:00:advantageous:00    292
positive%5:00:00:plus:00            240
positive%3:00:04::                  196
positive%5:00:00:formal:01           57
Name: Sense_key, dtype: int64

In [201]:
df_pos.head()

Unnamed: 0,Sense_key,Lemma,Position,Text,Text_int
33,positive%5:00:00:advantageous:00,positive.a,51,"[thirdly, ,, there, are, situations, where, na...","[3659, 2, 78, 22, 902, 165, 51, 20, 28, 835, 1..."
185,positive%5:00:00:advantageous:00,positive.a,26,"[the, executive, director, made, an, oral, pre...","[0, 600, 1045, 94, 29, 3441, 2193, 10, 152, 85..."
332,positive%5:00:00:plus:00,positive.a,18,"[private, financial, flows, since, the, asian,...","[332, 148, 2130, 200, 0, 2018, 148, 843, 2, 33..."
355,positive%3:00:01::,positive.a,39,"[she, emphasized, the, importance, of, governm...","[321, 1529, 0, 346, 1, 64, 924, 6, 7519, 24, 3..."
399,positive%5:00:00:plus:00,positive.a,49,"[central, banks, were, compelled, to, manage, ...","[463, 2097, 50, 5364, 4, 2223, 1414, 1675, 225..."


Will onehot encode the sense key. This makes the most sense.

In [194]:
for i, val in enumerate(df_pos.Sense_key.unique()):

    col_name = "Onehot_sense_" + str(i)
    df_pos[col_name] = df.loc[:, "Sense_key"] == val
    df_pos[col_name] = df_pos[col_name].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [195]:
df_pos.head(2)

Unnamed: 0,Sense_key,Lemma,Position,Text,Text_int,Onehot_sense_0,Onehot_sense_1,Onehot_sense_2,Onehot_sense_3,Onehot_sense_4
33,positive%5:00:00:advantageous:00,positive.a,51,"[thirdly, ,, there, are, situations, where, na...","[3659, 2, 78, 22, 902, 165, 51, 20, 28, 835, 1...",1,0,0,0,0
185,positive%5:00:00:advantageous:00,positive.a,26,"[the, executive, director, made, an, oral, pre...","[0, 600, 1045, 94, 29, 3441, 2193, 10, 152, 85...",1,0,0,0,0


In [196]:
max_len = 0

for item in df_pos.Text_int:
    
    if len(item) > max_len:
        max_len = len(item)
        
max_len

236

In [197]:
for i in range(len(df_pos.Text_int)):
    
    diff = max_len - len(df_pos.Text_int.iloc[i]) 
    
    vec = df_pos.Text_int.iloc[i]
    
    
    
    df_pos.Text_int.iloc
    
    print(diff)
    
    break

124


In [198]:
max_len - len(df_pos.Text_int.iloc[0])

vec = df_pos.Text_int.iloc[0]

append = -1*np.ones(diff)

vec

[3659,
 2,
 78,
 22,
 902,
 165,
 51,
 20,
 28,
 835,
 1068,
 29,
 399,
 11,
 0,
 652,
 4,
 1728,
 1743,
 2,
 296,
 4643,
 20,
 5354,
 2,
 16505,
 3,
 2599,
 1,
 1189,
 1429,
 2,
 67,
 4229,
 3953,
 5,
 6,
 53,
 780,
 2,
 0,
 638,
 1,
 0,
 276,
 33,
 233,
 13,
 4,
 427,
 272,
 340,
 835,
 12,
 614,
 12,
 258,
 166,
 12,
 2,
 4,
 807,
 4,
 308,
 10,
 0,
 1070,
 24,
 11879,
 0,
 6276,
 1,
 396,
 273,
 5,
 23,
 0,
 230,
 90,
 2,
 0,
 276,
 2926,
 4,
 1125,
 1646,
 3,
 10258,
 1182,
 1808,
 31,
 1979,
 1,
 134,
 8,
 619,
 544,
 7,
 5977,
 439,
 10,
 0,
 49,
 1,
 0,
 1388,
 5,
 -1,
 -1,
 -1,
 -1,
 array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1.

In [202]:
# want a training accuracy score for each network

Could learn representation as I go, but there's not a lot of examples per unique sense_key, in some cases...
Also, this is a pain. Since the WSD texts appear to be generic enough, pretrained GloVe vectors should be ok. 
Do I use these as an initial guess or what? Also, GloVe only encodes one word at a time - so do I apply a context window myself as well?

for a 50-dim embedding of a 100-word document, we get a $100*50$ matrix. Seems to make sense to run a CNN over this! 

output layer size should depend on the number of distinct senses for each lemma, so this is a lemma-by-lemma approach

or try summing up all the vectors to create one representation for the entire document, then input it into a deep neural net of size 50. however this is silly and a RNN is better, can then have feedback in time if we input one word at a time.

# Classify using a CNN

In [None]:
# Taken from https://towardsdatascience.com/text-classification-with-cnns-in-pytorch-1113df31e79f

class TextClassifier(nn.ModuleList):

    def __init__(self, params):
        super(TextClassifier, self).__init__()

        # Parameters regarding text preprocessing
        self.seq_len = params.seq_len
        self.num_words = params.num_words
        self.embedding_size = params.embedding_size

        # Dropout definition
        self.dropout = nn.Dropout(0.25)

        # CNN parameters definition
        # Kernel sizes
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5

        # Output size for each convolution
        self.out_size = params.out_size
        # Number of strides for each convolution
        self.stride = params.stride

        # Embedding layer definition
        self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)

        # Convolution layers definition
        self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)

        # Max pooling layers definition
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)

        # Fully connected layer definition
        self.fc = nn.Linear(self.in_features_fc(), 1)