### Download from google disk preprocessed data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import shutil

In [4]:
shutil.copy('.//drive//MyDrive//challenge//train.jsonl', './/')
shutil.copy('.//drive//MyDrive//challenge//test.jsonl', './/')
shutil.copy('.//drive//MyDrive//challenge//val.jsonl', './/')

'.//val.jsonl'

### Imports and device and download stopwords

In [5]:
import json

import torch
import nltk
import numpy as np
import gensim.downloader as api

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import spacy

from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm.autonotebook import tqdm

In [6]:
nltk.download('stopwords')
nlp = spacy.load('en')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Embedding

In [8]:
# word2vec = api.load('word2vec-google-news-300')
word2vec = api.load('glove-wiki-gigaword-300')
vocab_size = len(word2vec.vocab)



### RNN

In [31]:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx, dropout_clf):
        
        super().__init__()

        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=pad_idx)
        
        self.rnn_for_seq1 = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           dropout=dropout,
                           bidirectional=bidirectional)
        self.rnn_for_seq2 = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           dropout=dropout,
                           bidirectional=bidirectional)        
        self.dropout = nn.Dropout(p=dropout_clf)
        self.fc1 = nn.Linear(hidden_dim * n_layers * (2 if bidirectional else 1) * 4, 256)
        self.fc2 = nn.Linear(256, output_dim)
        self.activ = nn.ReLU()
        
        
    def forward(self, text):
        
        embedded1 = self.embedding(text['seq1'][0].permute(1, 0))
        embedded2 = self.embedding(text['seq2'][0].permute(1, 0))

        h0 = torch.zeros(self.n_layers * (2 if self.bidirectional else 1), embedded1.size(1), self.hidden_dim).to(text['seq2'][0].device)
        c0 = torch.zeros(self.n_layers * (2 if self.bidirectional else 1), embedded1.size(1), self.hidden_dim).to(text['seq2'][0].device)
        
        output1, (hidden1, cell1) = self.rnn_for_seq1(embedded1, (h0, c0))
        output2, (hidden2, cell2) = self.rnn_for_seq2(embedded2, (h0, c0))

        hidden1 = hidden1.permute(1, 0, 2)
        hidden2 = hidden2.permute(1, 0, 2)

        hidden = torch.cat((hidden1, hidden1 * hidden2, torch.abs(hidden1 - hidden2), hidden2), dim=1).view(embedded1.size(1), -1)

        hidden = self.dropout(hidden)
        hidden = self.fc1(hidden)
        hidden = self.activ(hidden)
        hidden = self.dropout(hidden)
        hidden = self.fc2(hidden)

        return hidden

In [32]:
emb_dim = 300
hidden_dim = 256
output_dim = 3
n_layers = 1
bidirectional = True
dropout = 0.5
PAD_IDX = 1
patience = 3
dropout_clf = 0.5

In [33]:
model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=PAD_IDX,
    dropout_clf=dropout_clf
)

  "num_layers={}".format(dropout, num_layers))


In [34]:
model.embedding.weight = nn.Parameter(torch.FloatTensor(word2vec.vectors))

In [35]:
model = model.to(device)

In [36]:
opt = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_func = nn.CrossEntropyLoss()
max_epochs = 20

### Datasets and DataLoaders on corpus for RNN

In [15]:
class TextDatasetRNN(Dataset):
    def __init__(self, dir_json, word2vec, max_length, tokenizer=None):
        self.label_map = {'neutral':torch.tensor(0, dtype=torch.long),
                          'contradiction':torch.tensor(1, dtype=torch.long),
                          'entailment':torch.tensor(2, dtype=torch.long)}
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.word2vec = word2vec

        data = []
        broken_data = []
        with open(dir_json) as json_file:
            for i, line in enumerate(json_file):
                line = json.loads(line)
                try:
                    line_data = {
                        'seq1':self.preprocessing_sequence(line['sentence1']),
                        'seq2':self.preprocessing_sequence(line['sentence2']),
                        'label':self.label_map[line['gold_label']]                        
                    }
                    data.append(line_data)
                except (ValueError, KeyError):
                    broken_data.append((i, line))
        self.json_data = data
        self.broken = broken_data
    
    def __len__(self):
        return len(self.json_data)

    def __getitem__(self, ind):
        return self.json_data[ind]
    
    def preprocessing_sequence(self, seq):
        tokens = [t for t in self.tokenizer.tokenize(seq.lower())]
        embedings = [self.word2vec.index2word.index(t) for t in tokens if t in self.word2vec]
        lenght = len(embedings)
        if lenght == 0:
            raise ValueError
        if len(embedings) < self.max_length:
            embedings.extend([1 for i in range(self.max_length - len(embedings))])
        else:
            lenght = self.max_length
            embedings = embedings[:self.max_length]
        return torch.from_numpy(np.array(embedings)).type(torch.long), torch.tensor(lenght, dtype=torch.long)

In [16]:
train_datasetRNN = TextDatasetRNN(dir_json='.//train.jsonl',
                            tokenizer=WordPunctTokenizer(),
                            word2vec=word2vec,
                            max_length=10)
val_datasetRNN = TextDatasetRNN('.//val.jsonl',
                          tokenizer=WordPunctTokenizer(),
                          word2vec=word2vec,
                          max_length=10)
test_datasetRNN = TextDatasetRNN('.//test.jsonl',
                           tokenizer=WordPunctTokenizer(),
                           word2vec=word2vec,
                           max_length=10)

In [17]:
train_loaderRNN = DataLoader(train_datasetRNN, batch_size=1000, shuffle=True)
val_loaderRNN = DataLoader(val_datasetRNN, batch_size=1000)
test_loaderRNN = DataLoader(test_datasetRNN, batch_size=1000)

### Info about GPU

In [18]:
torch.cuda.empty_cache()

In [19]:
!nvidia-smi

Thu Apr  8 20:54:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    27W /  70W |   1544MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Training RNN

In [20]:
def freeze_word2vec(model, grad=False):
    for x in model.embedding.parameters():
        x.requires_grad = grad

In [21]:
max_epochs = 30
patience = 3

In [37]:
min_loss = np.inf

cur_patience = 0

for epoch in range(1, max_epochs + 1):
    if epoch < 5:
        freeze_word2vec(model)
    else:
        freeze_word2vec(model, True)        
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_loaderRNN), total=len(train_loaderRNN), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        with torch.set_grad_enabled(True):

            opt.zero_grad()
            label = batch['label'].to(device)
            data = {
                'seq1':(
                    batch['seq1'][0].to(device),
                    batch['seq1'][1].cpu()
                ),
                'seq2':(
                    batch['seq2'][0].to(device),
                    batch['seq2'][1].cpu()                    
                )
            }
            output = model(data)

            loss = loss_func(output, label)
            loss.backward()
            train_loss += loss.item()

            opt.step()

    train_loss /= len(train_loaderRNN)

    val_loss = 0.0
    model.eval()

    pbar = tqdm(enumerate(val_loaderRNN), total=len(val_loaderRNN), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        # model(it.to(device))

        with torch.no_grad():
            label = batch['label'].to(device)
            data = {
                'seq1':(
                    batch['seq1'][0].to(device),
                    batch['seq1'][1].cpu()
                ),
                'seq2':(
                    batch['seq2'][0].to(device),
                    batch['seq2'][1].cpu()                    
                )
            }
            output = model(data)

            loss = loss_func(output, label)
            val_loss += loss.item()        

    val_loss /= len(val_loaderRNN)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print(f'Epoch: {epoch}, Training Loss: {train_loss:.6f}, Validation Loss: {val_loss:.6f}')
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 1, Training Loss: 0.885440, Validation Loss: 0.771027


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 2, Training Loss: 0.762895, Validation Loss: 0.708375


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 3, Training Loss: 0.719259, Validation Loss: 0.677720


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 4, Training Loss: 0.689727, Validation Loss: 0.652465


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 5, Training Loss: 0.659401, Validation Loss: 0.632254


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 6, Training Loss: 0.625761, Validation Loss: 0.604205


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 7, Training Loss: 0.599257, Validation Loss: 0.593891


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 8, Training Loss: 0.574706, Validation Loss: 0.583736


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 9, Training Loss: 0.555599, Validation Loss: 0.576893


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 10, Training Loss: 0.536475, Validation Loss: 0.573015


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 11, Training Loss: 0.519325, Validation Loss: 0.577362


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 12, Training Loss: 0.504006, Validation Loss: 0.571086


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 13, Training Loss: 0.486809, Validation Loss: 0.576499


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))



<All keys matched successfully>

### Accuracy on RNN

In [38]:
count = 0
corr = 0
for batch in test_loaderRNN:
    with torch.no_grad():
        label = batch['label'].to(device)
        data = {
            'seq1':(
                batch['seq1'][0].to(device),
                batch['seq1'][1].cpu()
                ),
            'seq2':(
                batch['seq2'][0].to(device),
                batch['seq2'][1].cpu()                    
            )
            }
        output = model(data)
        pred = torch.argmax(output, dim=1)
        count += pred.shape[0]
        corr += torch.sum(pred == label).cpu()
print(corr / (count))

tensor(0.7670)


In [39]:
count = 0
corr = 0
for batch in val_loaderRNN:
    with torch.no_grad():
        label = batch['label'].to(device)
        data = {
            'seq1':(
                batch['seq1'][0].to(device),
                batch['seq1'][1].cpu()
                ),
            'seq2':(
                batch['seq2'][0].to(device),
                batch['seq2'][1].cpu()                    
            )
            }
        output = model(data)
        pred = torch.argmax(output, dim=1)
        count += pred.shape[0]
        corr += torch.sum(pred == label).cpu()
print(corr / (count))

tensor(0.7705)


### Save model

In [25]:
torch.save(model.state_dict(), './/best_modelRNN.pt')

In [26]:
shutil.copy('.//best_modelRNN.pt', './/drive//MyDrive//challenge//')

'.//drive//MyDrive//challenge//best_modelRNN.pt'