## Set up


In [1]:
# Imports

import nltk
nltk.download("punkt")

# !pip install gensim
import gensim

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs
import os

import tensorboard as tb
import tensorflow as tf

import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from tqdm import tqdm
import random

writer = SummaryWriter('runs/word2vec')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2021-02-25 12:43:30--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-02-25 12:43:30--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-02-25 12:43:30--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [2]:
# Setting random seed and device
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

if not os.path.exists('/content/drive/MyDrive/ICL AI/NLP/NLP_cw'):
    os.makedirs('/content/drive/MyDrive/ICL AI/NLP/NLP_cw')

root_path = '/content/drive/MyDrive/ICL AI/NLP/NLP_cw/'

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

## Hyperparameters and data loading


In [3]:
# Load data
train_df = pd.read_csv(f'{root_path}data/task-1/train.csv')
test_df = pd.read_csv(f'{root_path}data/task-1/dev.csv')

In [4]:
# Number of epochs
epochs = 10

# Proportion of training data for train compared to dev
train_proportion = 0.8

## Preprocessing


### Making vocabulary


In [5]:
def create_vocab(data):
    """
    Creating a corpus of all the tokens used
    """
    tokenized_corpus = [] # Let us put the tokenized corpus in a list
    punctuation_exclusion = [",", "\'", "\"", ".", "‘", "’"] 

    for sentence in data:


        # for token in sentence.split(' '): # simplest split is

        #     from nltk.tokenize import RegexpTokenizer
        
        tokenized_sentence = nltk.tokenize.word_tokenize(sentence.lower())


        cleaned = [token for token in tokenized_sentence if token not in punctuation_exclusion]

        tokenized_corpus.append(cleaned)
    # Create single list of all vocabulary
    vocabulary = []  # Let us put all the tokens (mostly words) appearing in the vocabulary in a list

    for sentence in tokenized_corpus:

        for token in sentence:

            if token not in vocabulary:

                if True:
                    vocabulary.append(token)

    return vocabulary, tokenized_corpus

### Padding

In [6]:
def collate_fn_padd(batch):
    '''
    We add padding to our minibatches and create tensors for our model
    '''

    batch_labels = [l for f, l in batch]
    batch_features = [f for f, l in batch]

    batch_features_len = [len(f) for f, l in batch]

    seq_tensor = torch.zeros((len(batch), max(batch_features_len))).long()

    for idx, (seq, seqlen) in enumerate(zip(batch_features, batch_features_len)):
        seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

    batch_labels = torch.FloatTensor(batch_labels)

    return seq_tensor, batch_labels

class Task1Dataset(Dataset):

    def __init__(self, train_data, labels):
        self.x_train = train_data
        self.y_train = labels

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, item):
        return self.x_train[item], self.y_train[item]

## Model


### BiLSTM

In [7]:
class BiLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2label = nn.Linear(hidden_dim * 2, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device), \
               torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device)

    def forward(self, sentence):
        embedded = self.embedding(sentence)
        embedded = embedded.permute(1, 0, 2)

        lstm_out, self.hidden = self.lstm(
            embedded.view(len(embedded), self.batch_size, self.embedding_dim), self.hidden)

        out = self.hidden2label(lstm_out[-1])
        return out

### Custom NN

#### GRU

In [None]:
class GRU_Net(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super(GRU_Net, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2label = nn.Linear(hidden_dim * 2, 1)
        self.hidden = self.init_hidden()

  def init_hidden(self):
      # Before we've done anything, we dont have any hidden state.
      # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
      # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
      return torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device), \
              torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device)

  def forward(self, sentence):
      embedded = self.embedding(sentence)
      embedded = embedded.permute(1, 0, 2)

      gru_out, self.hidden = self.gru(
          embedded.view(len(embedded), 
                        self.batch_size, 
                        self.embedding_dim), 
                        self.hidden)

      out = self.hidden2label(gru_out[-1])
      return out
      
      

#### Pytorch dense NN

In [8]:
class FFN(nn.Module):
    def __init__(self, embedding_dim, vocab_size, batch_size, max_feature):
        super(FFN, self).__init__()
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.model = nn.Sequential(
            nn.Linear(embedding_dim*max_feature, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
      # input shape: [batch size, sentence length]
      input_len = x.size()[1]

      # print("input size:", x.size()) 
      # print("input: ", x)

      # embedding output shape: [batch size, sentence length, embedding dim]
      embedded = self.embedding(x).view((self.batch_size, -1))

      # print("embedded size: ", embedded.size())
      # print("embedding dim: ", self.embedding_dim)
      # embedded = embedded.permute(1, 0, 2)
      # print("permuted embed size: ",embedded.size())

      out = self.model(embedded)
      return out

#### Keras dense NN

In [None]:
def create_model(embedding_weights):
  model = Sequential()
  e = Embedding(vocab_size, 100, weights=[embedding_weights], input_length=4, trainable=False)
  model.add(e)
  model.add(Flatten())
  model.add(Dense(1, activation='sigmoid'))
  # compile the model
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
  # summarize the model
  print(model.summary())
  # fit the model
  model.fit(padded_docs, labels, epochs=50, verbose=0)

#### Pytorch tutorial
for understanding how this network works and why mine doesnt


In [91]:
import torch.nn.functional as F
import torch.optim as optim
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        print("embeds size: ", embeds.size())
        out = F.relu(self.linear1(embeds))
        print("first layer size: ", embeds.size())
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Size([1, 20])
first layer size:  torch.Size([1, 20])
embeds size:  torch.Siz

KeyboardInterrupt: ignored

## Train & eval


In [9]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch
    """

    
    print("Training model.")
    print(type(model).__name__)

    for epoch in range(1, number_epoch+1):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:

            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            # for RNN:
            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            
            if type(model).__name__ == "BiLSTM":
              model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse

        valid_loss, valid_mse, __, __ = eval(dev_iter, model)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

In [10]:

# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Evaluating model performance on the dev set
    """
    model.eval()
    epoch_loss = 0
    epoch_sse = 0
    pred_all = []
    trg_all = []
    no_observations = 0

    with torch.no_grad():
        for batch in data_iter:
            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            # for RNN:
            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            if type(model).__name__ == "BiLSTM":
              model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            # We get the mse
            pred, trg = predictions.detach().cpu().numpy(), target.detach().cpu().numpy()
            sse, __ = model_performance(pred, trg)

            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse
            pred_all.extend(pred)
            trg_all.extend(trg)

    return epoch_loss/no_observations, epoch_sse/no_observations, np.array(pred_all), np.array(trg_all)

In [11]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (printing the MSE and the RMSE)
    """

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)
    rmse = np.sqrt(mse)

    if print_output:
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

## Full process

### Pre-trained embeddings


#### Setting data, creating vocab

In [12]:
# We set our training data and test data
training_data = train_df['original']
test_data = test_df['original']

# Creating word vectors
training_vocab, training_tokenized_corpus = create_vocab(training_data)
test_vocab, test_tokenized_corpus = create_vocab(test_data)

# Creating joint vocab from test and train:
joint_vocab, joint_tokenized_corpus = create_vocab(pd.concat([training_data, test_data]))

print("Vocab created.")


Vocab created.


In [None]:
print(joint_tokenized_corpus[:20])

[['france', 'is', 'hunting', 'down', 'its', 'citizens', 'who', 'joined', '<', 'isis/', '>', 'without', 'trial', 'in', 'iraq'], ['pentagon', 'claims', '2,000', '%', 'increase', 'in', 'russian', 'trolls', 'after', '<', 'syria/', '>', 'strikes', 'what', 'does', 'that', 'mean', '?'], ['iceland', 'pm', 'calls', 'snap', 'vote', 'as', 'pedophile', 'furor', 'crashes', '<', 'coalition/', '>'], ['in', 'an', 'apparent', 'first', 'iran', 'and', 'israel', '<', 'engage/', '>', 'each', 'other', 'militarily'], ['trump', 'was', 'told', 'weeks', 'ago', 'that', 'flynn', 'misled', '<', 'vice/', '>', 'president'], ['all', '22', '<', 'promises/', '>', 'trump', 'made', 'in', 'his', 'speech', 'to', 'congress', 'in', 'one', 'chart'], ['new', 'doj', 'alert', 'system', 'will', 'flag', '<', 'crimes/', '>', 'against', 'police'], ['as', 'someone', 'who', 'grew', 'up', 'among', 'fundamentalist', '<', 'christians/', '>', 'in', 'the', 'us', 'i', "'m", 'surprised', 'anyone', "'s", 'surprised', 'about', 'roy', 'moore'],

#### Creating embeddings

##### Glove embeddings (imported)

In [None]:
# We create representations for our tokens
wvecs = [] # word vectors
word2idx = [] # word2index
idx2word = []

# This is a large file, it will take a while to load in the memory!
with codecs.open('glove.6B.100d.txt', 'r','utf-8') as f:
  index = 1
  for line in f.readlines():
    # Ignore the first line - first line typically contains vocab, dimensionality
    if len(line.strip().split()) > 3:
      word = line.strip().split()[0]
      if word in joint_vocab:
          (word, vec) = (word, list(map(float,line.strip().split()[1:])))
          wvecs.append(vec)
          word2idx.append((word, index))
          idx2word.append((index, word))
          index += 1

wvecs = np.array(wvecs)
word2idx = dict(word2idx)
idx2word = dict(idx2word)

vectorized_seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in training_tokenized_corpus]

# To avoid any sentences being empty (if no words match to our word embeddings)
vectorized_seqs = [x if len(x) > 0 else [0] for x in vectorized_seqs]

##### Glove embeddings (custom)

might not work

In [None]:
!pip install glove_python
from glove import *

#Creating a corpus object
corpus = Corpus() 

#Training the corpus to generate the co occurence matrix which is used in GloVe
corpus.fit(joint_tokenized_corpus, window=3)  # window hyperparam set to trigram

glove = Glove(no_components=5, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('custom_glove.txt')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


##### word2vec embeddings (custom)

Takes forever....

In [None]:
# corpus to vocab

# already have train_vocab, train_token_corpus, test_vocab, test_corpus & joint
print("LENGTH OF HOINT VOCAB:", len(joint_vocab), "\nVOCAB:", joint_vocab)

# creating custom word2idx
word2idx = {}
n_words = 0

for token in joint_vocab:
    if token not in word2idx:
        word2idx[token] = n_words
        n_words += 1
        
assert len(word2idx) == len(joint_vocab)

LENGTH OF HOINT VOCAB: 14313 


In [None]:
# window size = 2 = bigram

def get_focus_context_pairs(tokenized_corpus, window_size=2):
    focus_context_pairs = []
    for sentence in tokenized_corpus:

        for token_idx, token in enumerate(sentence):
            for w in range(-window_size, window_size+1):
                context_word_pos = token_idx + w

                if w == 0 or context_word_pos >= len(sentence) or context_word_pos < 0:
                    continue

                try:
                    focus_context_pairs.append([token, sentence[context_word_pos]])
                except:
                    continue
    
    return focus_context_pairs
  
focus_context_pairs = get_focus_context_pairs(joint_tokenized_corpus)
print(focus_context_pairs[:10])  # preview

# from word pairs to inded pairs
def get_focus_context_idx(focus_context_pairs):
    idx_pairs = []
    for pair in focus_context_pairs:
        idx_pairs.append([word2idx[pair[0]], word2idx[pair[1]]])
    
    return idx_pairs

idx_pairs = get_focus_context_idx(focus_context_pairs)
print(idx_pairs[:10])

[['France', 'is'], ['France', '‘'], ['is', 'France'], ['is', '‘'], ['is', 'hunting'], ['‘', 'France'], ['‘', 'is'], ['‘', 'hunting'], ['‘', 'down'], ['hunting', 'is']]
[[0, 1], [0, 2], [1, 0], [1, 2], [1, 3], [2, 0], [2, 1], [2, 3], [2, 4], [3, 1]]


In [None]:
# one hot encoding of above indices
def get_one_hot(indicies, vocab_size=len(joint_vocab)):
    oh_matrix = np.zeros((len(indicies), vocab_size))
    for i, idx in enumerate(indicies):
        oh_matrix[i, idx] = 1

    return torch.Tensor(oh_matrix)

In [None]:
# setting up word2vec

class Word2Vec(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim_size):
        super().__init__()
        
        self.projection = nn.Linear(input_size, hidden_dim_size, bias=False)
        self.output = nn.Linear(hidden_dim_size, output_size)
        
    def forward(self, input_token):
        x = self.projection(input_token)
        output = self.output(x)
        return output


# setting up training loop
def train(word2vec_model, idx_pairs, state_dict_filename, early_stop=False, num_epochs=10, lr=1e-3):

    word2vec_model.train()
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(word2vec_model.parameters(), lr=lr)

    for epoch in tqdm(range(num_epochs)):

        random.shuffle(idx_pairs)

        for focus, context in idx_pairs:
            print(focus)
            oh_inputs = get_one_hot([focus], len(joint_vocab))
            target = torch.LongTensor([context])

            pred_outputs = word2vec_model(oh_inputs)

            loss = criterion(pred_outputs, target)

            loss.backward()
            optimizer.step()
            word2vec_model.zero_grad()
            
        ### These lines stop training early
            if early_stop: break
        if early_stop: break
        ###


        torch.save(word2vec_model.state_dict(), state_dict_filename)
        writer.add_embedding(word2vec_model.projection.weight.T,
                             metadata=word2idx.keys(), global_step=epoch)

tf.io.gfile = tb.compat.tensorflow_stub.io.gfile  # so saving doesnt run into issues



In [None]:
# create the embeddings
word2vec = Word2Vec(len(joint_vocab), len(joint_vocab), 10)  # hyperparam hidden dim = 50
train(word2vec, idx_pairs, "word2vec.pt")

  0%|          | 0/10 [00:00<?, ?it/s]

829
361
3141
3712
9423
156
32
750
156
344
339
7595
41
1590
105
12259
6931
48
46
3836
5035
11225
245
3786
1671
3280
46
384
1811
46
46
10550
1680
4203
1447
3020
46
12291
96
5055
6159
54
160
73
1577
13
10
110
2148
13
54
6859
10704
92
54
113
1880
3324
2847
7368
1286
3380
5765
13
11423
41
96
1960
14097
476
112
4779
4930
448
4401
3512
8717
2478
593
2877
601
3754
147
383
855
11330
341
156
69
4141
12375
797
7
1937
4799
497
355
112
8065
245
10265
6124
170
2122
328
3160
112
35
173
379
43
742
69
3936
2020
9454
2129
46
69
103
4490
97
480
11246
4321
245
401
9465
13684
321
1589
696
46
147
69
6917
3580
474
8363
8598
10145
552
13671
147
474
7319
147
1177
546
764
156
355
706
286
232
383
245
69
1663
7711
10448
156
8893
1436
14126
20
55
13
5206
9413
1893
3251
156
156
649
686
1488
46
2322
2190
232
232
42
245
69
245
48
791
11202
14263
142
13666
147
605
359
8988
13667
7303
27
13171
5853
2401
7358
1514
8237
12080
46
54
508
657
433
957
1654
1163
7884
1348
41
2579
5014
5360
67
5414
147
245
5955
4987
11751
68
2

  0%|          | 0/10 [00:05<?, ?it/s]

46
10204
789
43
147
46
3403
47
4039
4707
9724
112
36
548
69
245
2199
11260
1021
341
7639
452
424
54
69
46
156
461
1266
9839
2102
682
6366
10734
7803
46
424
706
36
5105
6602
3278
3583
1095
807
686
476
2003
1494
4667
5208
3052
9055
173
11567
147
949
665
582
341
446
7574
16
389
9346
618
232
785
69
1178
69
2139
10472
245
96
13098
73
9035





KeyboardInterrupt: ignored

In [None]:
weights_matrix = word2vec.projection.weight.T
print(weights_matrix.shape)

##### word2vec with gensim

In [13]:
# word2vec
model1 = gensim.models.Word2Vec(joint_tokenized_corpus, min_count = 1,  
                              size = 100, window = 5, sg = 1)

model1.wv.save_word2vec_format("custom_word2vec.txt")

##### fasttext with gensim

In [14]:
# fasttext
# fastText — which is essentially an extension of the word2vec model — treats each *word* as composed of character n-grams. 

model2 = gensim.models.FastText(joint_tokenized_corpus, 
                                size=100, 
                                window = 5, 
                                min_count = 1, 
                                workers = 4, 
                                sg = 1)

model1.wv.save_word2vec_format("custom_fasttext.txt")

#### Set embeddings from text

In [32]:
# which model?

options_models = ["BiLSTM", "FFN"]

model_to_run = 1  # 0 = BiLSTM, 1 = FFN

# which embedding?

options = ['glove.6B.100d.txt', 'custom_word2vec.txt', "custom_fasttext.txt", "custom_glove.txt"]

picked_embeddings = 0  # 0 = pre-made glove, 1 = custom word2vec, 2 = custom fasttext, 3 = custom glove

file_to_load = options[picked_embeddings]


# which batch size
BATCH_SIZE = 32  # hyperparam

In [33]:
# We create representations for our tokens

wvecs = [] # word vectors
word2idx = [] # word2index
idx2word = []

# This is a large file, it will take a while to load in the memory!
with codecs.open(file_to_load, 'r','utf-8') as f:
  index = 1
  for line in f.readlines():
    # Ignore the first line - first line typically contains vocab, dimensionality
    if len(line.strip().split()) > 3:
      word = line.strip().split()[0]
      if word in joint_vocab:
          (word, vec) = (word, list(map(float,line.strip().split()[1:])))
          wvecs.append(vec)
          word2idx.append((word, index))
          idx2word.append((index, word))
          index += 1

wvecs = np.array(wvecs)
word2idx = dict(word2idx)
idx2word = dict(idx2word)

vectorized_seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in training_tokenized_corpus]

# To avoid any sentences being empty (if no words match to our word embeddings)
vectorized_seqs = [x if len(x) > 0 else [0] for x in vectorized_seqs]


In [16]:
# print(vectorized_seqs)
print(word2idx)



#### Splitting dataset & padding

In [34]:
feature = vectorized_seqs

# manual padding
def find_max_list(list):
    list_len = [len(i) for i in list]
    return max(list_len)

max_len = find_max_list(feature)

padd_feature = []
for sentence in feature:
  if len(sentence) < max_len:
    
    diff = max_len - len(sentence)
    new_list = [0]*diff
    sentence.extend(new_list)

    assert len(sentence) == max_len

  padd_feature.append(sentence)

In [35]:
if model_to_run == 0:  # BiLSTM
  feature = feature
else:
  feature = padd_feature


# 'feature' is a list of lists, each containing embedding IDs for word tokens
train_and_dev = Task1Dataset(feature, train_df['meanGrade'])

train_examples = round(len(train_and_dev)*train_proportion)
dev_examples = len(train_and_dev) - train_examples

train_dataset, dev_dataset = random_split(train_and_dev,
                                           (train_examples,
                                            dev_examples))

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

print("Dataloaders created.")

Dataloaders created.


#### Creating model

In [36]:

INPUT_DIM = len(word2idx)  # numbers of tokens
EMBEDDING_DIM = 100  # hyperparam
MAX_LEN = max_len

if model_to_run == 1:
  model = FFN(EMBEDDING_DIM, INPUT_DIM, BATCH_SIZE, MAX_LEN)
elif model_to_run == 0: 
  model = BiLSTM(EMBEDDING_DIM, 50, INPUT_DIM, BATCH_SIZE, device)

print("Model initialised.")

model.to(device)
# We provide the model with our embeddings
model.embedding.weight.data.copy_(torch.from_numpy(wvecs))

Model initialised.


tensor([[-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        [-0.1529, -0.2428,  0.8984,  ..., -0.5910,  1.0039,  0.2066],
        [-0.1897,  0.0500,  0.1908,  ..., -0.3980,  0.4765, -0.1598],
        ...,
        [ 0.1286,  0.1019, -0.6857,  ...,  0.2914, -0.0697,  0.2229],
        [-0.1355, -0.2036, -0.4885,  ..., -0.2279, -0.6748, -0.2403],
        [ 0.1977, -0.0688,  0.0190,  ...,  0.1300, -0.2731, -0.0403]],
       device='cuda:0')

#### Running model

In [42]:
loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

optimizer = torch.optim.Adam(model.parameters())

print(f"running model {options_models[model_to_run]} with embeddings {file_to_load}")
train(train_loader, dev_loader, model, epochs)

# on BiLSTM w/o padding:
# custom word2vec embed: 
# | Epoch: 10 | Train Loss: 0.13 | Train MSE: 0.13 | Train RMSE: 0.35 | Val. Loss: 0.44 | Val. MSE: 0.44 |  Val. RMSE: 0.66 |

# glove embed:
# | Epoch: 10 | Train Loss: 0.22 | Train MSE: 0.22 | Train RMSE: 0.47 | Val. Loss: 0.38 | Val. MSE: 0.38 |  Val. RMSE: 0.62 |

# custom fasttext embed:
# | Epoch: 10 | Train Loss: 0.12 | Train MSE: 0.12 | Train RMSE: 0.35 | Val. Loss: 0.46 | Val. MSE: 0.46 |  Val. RMSE: 0.67 |


# on FFN w/ padding to len 26:
# custom word2vec: 
# | Epoch: 10 | Train Loss: 0.11 | Train MSE: 0.11 | Train RMSE: 0.33 |         Val. Loss: 0.45 | Val. MSE: 0.45 |  Val. RMSE: 0.67 |

# custom fasttext:
# | Epoch: 10 | Train Loss: 0.11 | Train MSE: 0.11 | Train RMSE: 0.33 |         Val. Loss: 0.43 | Val. MSE: 0.43 |  Val. RMSE: 0.65 |

# pre trained glove:



running model FFN with embeddings glove.6B.100d.txt
Training model.
FFN


RuntimeError: ignored

### No pre-trained embeddings


In [None]:
train_and_dev = train_df['edit']

training_data, dev_data, training_y, dev_y = train_test_split(train_df['edit'], train_df['meanGrade'],
                                                                        test_size=(1-train_proportion),
                                                                        random_state=42)

# We train a Tf-idf model
count_vect = CountVectorizer(stop_words='english')
train_counts = count_vect.fit_transform(training_data)
transformer = TfidfTransformer().fit(train_counts)
train_counts = transformer.transform(train_counts)
regression_model = LinearRegression().fit(train_counts, training_y)

# Train predictions
predicted_train = regression_model.predict(train_counts)

# Calculate Tf-idf using train and dev, and validate model on dev:
test_and_test_counts = count_vect.transform(train_and_dev)
transformer = TfidfTransformer().fit(test_and_test_counts)

test_counts = count_vect.transform(dev_data)

test_counts = transformer.transform(test_counts)

# Dev predictions
predicted = regression_model.predict(test_counts)

# We run the evaluation:
print("\nTrain performance:")
sse, mse = model_performance(predicted_train, training_y, True)

print("\nDev performance:")
sse, mse = model_performance(predicted, dev_y, True)