### Download default data

In [None]:
import warnings
# Suppress every Python warning for the rest of the session (keeps cell output short,
# but also hides deprecation notices from the libraries used below).
warnings.filterwarnings("ignore")

In [None]:
# -nc (no-clobber): skip the download when snli_1.0.zip is already present, so
# re-running the cell does not fetch the archive again under a new name.
!wget -nc https://nlp.stanford.edu/projects/snli/snli_1.0.zip

--2021-03-15 20:58:13--  https://nlp.stanford.edu/projects/snli/snli_1.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94550081 (90M) [application/zip]
Saving to: ‘snli_1.0.zip’


2021-03-15 20:58:15 (32.9 MB/s) - ‘snli_1.0.zip’ saved [94550081/94550081]



In [None]:
# -n: never overwrite files that were already extracted, so a re-run does not
# stop at unzip's interactive "replace?" prompt.
!unzip -n snli_1.0.zip

Archive:  snli_1.0.zip
   creating: snli_1.0/
  inflating: snli_1.0/.DS_Store      
   creating: __MACOSX/
   creating: __MACOSX/snli_1.0/
  inflating: __MACOSX/snli_1.0/._.DS_Store  
 extracting: snli_1.0/Icon           
  inflating: __MACOSX/snli_1.0/._Icon  
  inflating: snli_1.0/README.txt     
  inflating: __MACOSX/snli_1.0/._README.txt  
  inflating: snli_1.0/snli_1.0_dev.jsonl  
  inflating: snli_1.0/snli_1.0_dev.txt  
  inflating: snli_1.0/snli_1.0_test.jsonl  
  inflating: snli_1.0/snli_1.0_test.txt  
  inflating: snli_1.0/snli_1.0_train.jsonl  
  inflating: snli_1.0/snli_1.0_train.txt  
  inflating: __MACOSX/._snli_1.0     


### Download preprocessed data from Google Drive

In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells reach it through the
# relative path .//drive//..., which presumably relies on the working directory being
# /content — TODO confirm when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import shutil

In [None]:
# Copy the three preprocessed splits from Drive into the working directory.
shutil.copy('.//drive//MyDrive//challenge//train.jsonl', './/')
shutil.copy('.//drive//MyDrive//challenge//test.jsonl', './/')
shutil.copy('.//drive//MyDrive//challenge//val.jsonl', './/')

'.//val.jsonl'

### Imports, device selection, and stopword download

In [None]:
import json

import torch
import nltk
import numpy as np
import gensim.downloader as api

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import spacy

from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm.autonotebook import tqdm

In [None]:
# NLTK stop-word list used when building the CNN datasets.
nltk.download('stopwords')
# The bare 'en' shortcut only exists in spaCy 2 and was removed in spaCy 3; the
# explicit package name loads on both.
# NOTE(review): assumes the 'en' shortcut pointed at en_core_web_sm (the default link) — confirm.
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Embedding

In [None]:
# Pretrained 300-d GloVe vectors (kept under the name `word2vec` for the rest of the notebook).
# word2vec = api.load('word2vec-google-news-300')
word2vec = api.load('glove-wiki-gigaword-300')
# Size the vocabulary from the embedding matrix itself: `.vocab` no longer exists in
# gensim 4, and this also guarantees nn.Embedding matches the weights copied from
# `word2vec.vectors` further down.
vocab_size = word2vec.vectors.shape[0]



### CNN

In [None]:
class CNN(nn.Module):
    """Two-branch 1-D convolutional classifier over a pair of token-id sequences.

    Branch 1 convolves the embeddings of ``seq1``; branch 2 convolves the
    element-wise difference ``emb(seq1) - emb(seq2)``. Each branch applies three
    strided convolutions (one per kernel size), ReLU and a max-pool over the whole
    time axis; the pooled features of both branches are concatenated and mapped to
    3 logits by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: embedding width (also the conv input channel count).
        out_channels: number of filters per convolution.
        kernel_sizes: exactly three kernel widths, one per convolution in a branch.
        dropout: dropout probability applied to each branch's pooled features.

    Raises:
        ValueError: if ``kernel_sizes`` does not contain exactly three entries
            (the layer layout below is fixed to three convolutions per branch).
    """

    def __init__(self, vocab_size, emb_dim, out_channels, kernel_sizes, dropout=0.5):
        super().__init__()

        if len(kernel_sizes) != 3:
            raise ValueError(
                f"CNN expects exactly 3 kernel sizes, got {len(kernel_sizes)}"
            )

        self.embedding = nn.Embedding(vocab_size, emb_dim)

        # Attribute names and creation order are kept stable: the names are the
        # state_dict keys of saved checkpoints, and the order fixes seeded init.
        self.conv_0_1 = self._make_conv(emb_dim, out_channels, kernel_sizes[0])
        self.conv_1_1 = self._make_conv(emb_dim, out_channels, kernel_sizes[1])
        self.conv_2_1 = self._make_conv(emb_dim, out_channels, kernel_sizes[2])
        self.conv_0_2 = self._make_conv(emb_dim, out_channels, kernel_sizes[0])
        self.conv_1_2 = self._make_conv(emb_dim, out_channels, kernel_sizes[1])
        self.conv_2_2 = self._make_conv(emb_dim, out_channels, kernel_sizes[2])

        # Two branches, each contributing len(kernel_sizes) * out_channels features.
        self.fc = nn.Linear(len(kernel_sizes) * out_channels * 2, 3)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _make_conv(emb_dim, out_channels, kernel_size):
        """Build one strided convolution over the time axis."""
        return nn.Conv1d(in_channels=emb_dim,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         padding=1, stride=2)

    @staticmethod
    def _conv_pool(conv, features):
        """Apply conv + ReLU, then max-pool over the full remaining time axis."""
        conved = F.relu(conv(features))
        return F.max_pool1d(conved, conved.shape[2]).squeeze(2)

    def forward(self, data):
        """Return logits of shape (batch, 3).

        ``data`` is a mapping with integer id tensors under 'seq1' and 'seq2';
        both are indexed as (batch, seq_len) and must have matching shapes because
        their embeddings are subtracted element-wise.
        """
        # Look up seq1 once and reuse it for the difference branch.
        embedded1 = self.embedding(data['seq1'])
        embedded2 = embedded1 - self.embedding(data['seq2'])

        # Conv1d expects (batch, channels, time).
        embedded1 = embedded1.permute(0, 2, 1)
        embedded2 = embedded2.permute(0, 2, 1)

        pooled1 = [self._conv_pool(conv, embedded1)
                   for conv in (self.conv_0_1, self.conv_1_1, self.conv_2_1)]
        pooled2 = [self._conv_pool(conv, embedded2)
                   for conv in (self.conv_0_2, self.conv_1_2, self.conv_2_2)]

        cat1 = self.dropout(torch.cat(pooled1, dim=1))
        cat2 = self.dropout(torch.cat(pooled2, dim=1))
        cat = torch.cat((cat1, cat2), dim=1)

        return self.fc(cat).squeeze(1)

In [None]:
# CNN hyper-parameters (vocab_size comes from the embedding cell above).
kernel_sizes = [2, 3, 4]
out_channels = 64
dropout = 0.2
dim = 300

model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
            kernel_sizes=kernel_sizes, dropout=dropout)
# Initialise the embedding table from the pretrained vectors. torch.tensor copies the
# array; torch.from_numpy would share memory with `word2vec.vectors`, so training on
# CPU would silently overwrite the pretrained matrix that other cells still read.
model.embedding.weight = nn.Parameter(torch.tensor(word2vec.vectors))

In [None]:
def freeze_emb(require=False, net=None):
    """Switch gradient tracking of an embedding layer on or off.

    Args:
        require: value assigned to ``requires_grad`` of every embedding parameter;
            ``False`` (default) freezes the embeddings, ``True`` makes them trainable.
        net: module exposing an ``embedding`` attribute. Defaults to the notebook-level
            ``model``, which keeps existing ``freeze_emb()`` / ``freeze_emb(True)``
            calls working; pass it explicitly when ``model`` may have been rebound.
    """
    target = model if net is None else net
    for param in target.embedding.parameters():
        param.requires_grad = require

In [None]:
# Move the CNN's parameters to the selected device.
model = model.to(device)

In [None]:
# Adam over all parameters of the current `model`, starting at lr=1e-3.
opt = optim.Adam(model.parameters(), lr=0.001)
# Cosine annealing towards eta_min=1e-6 with a half-period of T_max=111 scheduler steps.
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=111, eta_min=0.000001)
# Multi-class loss over the 3 logits.
loss_func = nn.CrossEntropyLoss()

### RNN

In [None]:
class RNNBaseline(nn.Module):
    """Siamese-style LSTM classifier: one LSTM per sentence, shared embeddings.

    Each sequence is embedded, packed with its true length, and run through its own
    LSTM. The final hidden state of the last layer (both directions when
    bidirectional) of each LSTM is concatenated, passed through dropout, and mapped
    to ``output_dim`` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: embedding width / LSTM input size.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTMs run in both directions.
        dropout: dropout between LSTM layers.
        pad_idx: embedding index treated as padding.
        dropout_clf: dropout applied to the concatenated sentence features.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx, dropout_clf):

        super().__init__()

        # Remembered so forward() can pick the right final hidden states.
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=pad_idx)

        self.rnn_for_seq1 = nn.LSTM(embedding_dim,
                                    hidden_dim,
                                    num_layers=n_layers,
                                    dropout=dropout,
                                    bidirectional=bidirectional)
        self.rnn_for_seq2 = nn.LSTM(embedding_dim,
                                    hidden_dim,
                                    num_layers=n_layers,
                                    dropout=dropout,
                                    bidirectional=bidirectional)
        self.dropout = nn.Dropout(p=dropout_clf)
        # Two sentences, each contributing hidden_dim features per direction.
        self.fc = nn.Linear(hidden_dim * 2 * (2 if bidirectional else 1),
                            output_dim)

    def _final_state(self, hidden):
        """Return the last layer's final hidden state as a (batch, features) tensor.

        ``hidden`` is the LSTM's h_n, indexed as (layers * directions, batch, hidden).
        With a bidirectional LSTM the last two entries are the last layer's two
        directions; with a unidirectional one only the last entry belongs to the
        last layer, so concatenating two entries would mix layers and double the
        width expected by ``fc``.
        """
        if self.bidirectional:
            return torch.cat((hidden[-1, :, :], hidden[-2, :, :]), dim=1)
        return hidden[-1, :, :]

    def forward(self, text):
        """Return logits for a batch.

        ``text['seq1']`` and ``text['seq2']`` are ``(ids, lengths)`` pairs: ``ids`` is
        indexed as (batch, seq_len) and transposed to time-major here; ``lengths``
        holds the unpadded length of every row and is handed to
        ``pack_padded_sequence``.
        """
        embedded1 = self.embedding(text['seq1'][0].permute(1, 0))
        embedded2 = self.embedding(text['seq2'][0].permute(1, 0))

        packed_embedded1 = nn.utils.rnn.pack_padded_sequence(embedded1, text['seq1'][1], enforce_sorted=False)
        packed_embedded2 = nn.utils.rnn.pack_padded_sequence(embedded2, text['seq2'][1], enforce_sorted=False)

        # Only the final hidden states are used; per-step outputs are discarded.
        _, (hidden1, _) = self.rnn_for_seq1(packed_embedded1)
        _, (hidden2, _) = self.rnn_for_seq2(packed_embedded2)

        hidden1 = self._final_state(hidden1)
        hidden2 = self._final_state(hidden2)

        hidden = self.dropout(torch.cat((hidden1, hidden2), dim=1))

        return self.fc(hidden).squeeze(1)

In [None]:
# RNN hyper-parameters.
emb_dim = 300          # embedding width
hidden_dim = 256       # LSTM hidden size per direction
output_dim = 3         # one logit per label in the datasets' label_map
n_layers = 2           # stacked LSTM layers
bidirectional = True
dropout = 0.2          # dropout between LSTM layers (rebinds the CNN cell's `dropout`)
PAD_IDX = 1            # same id the dataset classes use to pad sequences
patience = 4
dropout_clf = 0.2      # dropout before the final linear layer

In [None]:
# Build the LSTM classifier. NOTE: this rebinds `model`, so the CNN created earlier
# is no longer reachable under that name after this cell runs.
model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=PAD_IDX,
    dropout_clf=dropout_clf
)

In [None]:
# Move the RNN's parameters to the selected device.
model = model.to(device)

In [None]:
# Adam with default settings over the current `model`; rebinds `opt`, `lr_scheduler`
# and `loss_func` from the CNN section.
opt = torch.optim.Adam(model.parameters())
# NOTE(review): the RNN training loop below never calls lr_scheduler.step() — confirm
# whether the schedule is meant to be used.
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=111, eta_min=0.000001)
loss_func = nn.CrossEntropyLoss()
max_epochs = 20

### Datasets and DataLoaders on corpus for CNN

In [None]:
class TextDatasetCNN(Dataset):
    """Sentence-pair dataset yielding fixed-length token-id tensors for the CNN.

    Each line of ``dir_json`` is a JSON object with 'sentence1', 'sentence2' and
    'gold_label'. Sentences are lower-cased, tokenised, mapped to embedding row
    indices (tokens missing from the vocabulary are dropped) and padded/truncated to
    ``max_length``.

    Args:
        dir_json: path to the JSON-lines file.
        stop_words: stored on the instance; not used by the preprocessing here.
        word2vec: keyed vectors exposing the vocabulary as an ordered word list
            (``index2word``, or ``index_to_key`` on gensim 4).
        max_length: number of ids per returned sequence.
        tokenizer: object with ``tokenize(str)``; when ``None`` the notebook-level
            spaCy pipeline ``nlp`` is used and tokens are lemmatised.

    Attributes:
        stange_input: set of row indices that were skipped because of a missing key
            or a label outside ``label_map``.
    """

    # Id appended to short sequences. NOTE(review): ids are vocabulary positions, so
    # 1 is also the row of a real vocabulary entry — confirm this is intended.
    _PAD_ID = 1

    def __init__(self, dir_json, stop_words, word2vec, max_length, tokenizer=None):
        with open(dir_json) as json_file:
            self.json_data = [json.loads(line) for line in json_file]
        self.label_map = {'neutral': torch.tensor(0, dtype=torch.long),
                          'contradiction': torch.tensor(1, dtype=torch.long),
                          'entailment': torch.tensor(2, dtype=torch.long)}

        self.stop_words = stop_words
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.word2vec = word2vec
        self.stange_input = set()
        # Built once: replaces a linear scan of the vocabulary list for every token.
        self._token_to_index = self._build_token_index(word2vec)

    @staticmethod
    def _build_token_index(word2vec):
        """Map each vocabulary word to its first position in the ordered word list."""
        words = getattr(word2vec, 'index_to_key', None)  # gensim >= 4
        if words is None:
            words = word2vec.index2word                  # gensim 3
        lookup = {}
        for position, word in enumerate(words):
            lookup.setdefault(word, position)
        return lookup

    def __len__(self):
        return len(self.json_data)

    def __getitem__(self, ind):
        """Return {'seq1', 'seq2', 'label'} for row ``ind``.

        Rows that raise ``KeyError`` (missing field or unknown label) are recorded in
        ``stange_input`` and replaced by the next usable row, wrapping around at the
        end of the file so the last row cannot run off the end.

        Raises:
            IndexError: if ``ind`` is outside the dataset.
            RuntimeError: if no row in the dataset is usable.
        """
        total = len(self.json_data)
        if ind < 0:
            ind += total
        if not 0 <= ind < total:
            raise IndexError('dataset index out of range')

        for offset in range(total):
            position = (ind + offset) % total
            record = self.json_data[position]
            try:
                return {'seq1': self.preprocessing_sequence(record['sentence1']),
                        'seq2': self.preprocessing_sequence(record['sentence2']),
                        'label': self.label_map[record['gold_label']]}
            except KeyError:
                self.stange_input.add(position)
        raise RuntimeError('dataset contains no usable example')

    def preprocessing_sequence(self, seq):
        """Turn a sentence into a long tensor of exactly ``max_length`` ids."""
        if self.tokenizer is not None:
            tokens = self.tokenizer.tokenize(seq.lower())
        else:
            tokens = [str(t.lemma_) for t in nlp(seq.lower())]
        lookup = self._token_to_index
        ids = [lookup[t] for t in tokens if t in lookup][:self.max_length]
        ids.extend([self._PAD_ID] * (self.max_length - len(ids)))
        return torch.tensor(ids, dtype=torch.long)

In [None]:
# Train / validation / test datasets for the CNN, built from the preprocessed splits
# copied from Drive (the raw SNLI paths are kept as commented-out alternatives).
# Sequences are padded or truncated to 10 token ids.
train_datasetCNN = TextDatasetCNN(#dir_json='.//snli_1.0//snli_1.0_train.jsonl',
                                  dir_json='.//train.jsonl',
                            stop_words=stopwords.words('english'),
                            tokenizer=WordPunctTokenizer(),
                            word2vec=word2vec,
                            max_length=10)
val_datasetCNN = TextDatasetCNN(#'.//snli_1.0//snli_1.0_dev.jsonl',
                                dir_json='.//val.jsonl',
                          stop_words=stopwords.words('english'),
                          tokenizer=WordPunctTokenizer(),
                          word2vec=word2vec,
                          max_length=10)
test_datasetCNN = TextDatasetCNN(#'.//snli_1.0//snli_1.0_test.jsonl',
                                 dir_json='.//test.jsonl',
                           stop_words=stopwords.words('english'),
                           tokenizer=WordPunctTokenizer(),
                           word2vec=word2vec,
                           max_length=10)

In [None]:
# Batches of 5000 pairs; only the training loader shuffles.
train_loaderCNN = DataLoader(train_datasetCNN, batch_size=5000, shuffle=True)
val_loaderCNN = DataLoader(val_datasetCNN, batch_size=5000)
test_loaderCNN = DataLoader(test_datasetCNN, batch_size=5000)

### Datasets and DataLoaders on corpus for RNN

In [None]:
class TextDatasetRNN(Dataset):
    """Sentence-pair dataset yielding padded id tensors plus true lengths for the RNN.

    Each line of ``dir_json`` is a JSON object with 'sentence1', 'sentence2' and
    'gold_label'. Sentences are lower-cased, tokenised and mapped to embedding row
    indices (tokens missing from the vocabulary are dropped); every sequence is
    returned as ``(ids, length)`` where ``ids`` has exactly ``max_length`` entries
    and ``length`` is the number of real (unpadded) ids.

    Args:
        dir_json: path to the JSON-lines file.
        word2vec: keyed vectors exposing the vocabulary as an ordered word list
            (``index2word``, or ``index_to_key`` on gensim 4).
        max_length: number of ids per returned sequence.
        tokenizer: object with ``tokenize(str)``.

    Attributes:
        stange_input: set of row indices whose sentence produced no known token.
    """

    # Id appended to short sequences. NOTE(review): ids are vocabulary positions, so
    # 1 is also the row of a real vocabulary entry — confirm this is intended.
    _PAD_ID = 1

    def __init__(self, dir_json, word2vec, max_length, tokenizer=None):
        with open(dir_json) as json_file:
            self.json_data = [json.loads(line) for line in json_file]
        self.label_map = {'neutral': torch.tensor(0, dtype=torch.long),
                          'contradiction': torch.tensor(1, dtype=torch.long),
                          'entailment': torch.tensor(2, dtype=torch.long)}

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.word2vec = word2vec
        self.stange_input = set()
        # Built once: replaces a linear scan of the vocabulary list for every token.
        self._token_to_index = self._build_token_index(word2vec)

    @staticmethod
    def _build_token_index(word2vec):
        """Map each vocabulary word to its first position in the ordered word list."""
        words = getattr(word2vec, 'index_to_key', None)  # gensim >= 4
        if words is None:
            words = word2vec.index2word                  # gensim 3
        lookup = {}
        for position, word in enumerate(words):
            lookup.setdefault(word, position)
        return lookup

    @staticmethod
    def _fallback_order(ind, total):
        """Yield ``ind``, then the rows after it, then the rows before it (nearest first)."""
        yield from range(ind, total)
        yield from range(ind - 1, -1, -1)

    def __len__(self):
        return len(self.json_data)

    def __getitem__(self, ind):
        """Return {'seq1': (ids, length), 'seq2': (ids, length), 'label'} for row ``ind``.

        Rows that raise ``ValueError`` (a sentence with no known token) or ``KeyError``
        (missing field or unknown label) are replaced by the next usable row; when
        there is none after ``ind`` the search continues backwards, so the last row
        cannot run off the end of the data.

        Raises:
            IndexError: if ``ind`` is outside the dataset.
            RuntimeError: if no row in the dataset is usable.
        """
        total = len(self.json_data)
        if ind < 0:
            ind += total
        if not 0 <= ind < total:
            raise IndexError('dataset index out of range')

        for position in self._fallback_order(ind, total):
            record = self.json_data[position]
            try:
                return {'seq1': self.preprocessing_sequence(record['sentence1'], position),
                        'seq2': self.preprocessing_sequence(record['sentence2'], position),
                        'label': self.label_map[record['gold_label']]}
            except (ValueError, KeyError):
                continue
        raise RuntimeError('dataset contains no usable example')

    def preprocessing_sequence(self, seq, ind):
        """Return ``(ids, length)`` for one sentence.

        Raises:
            ValueError: if no token of ``seq`` is in the vocabulary; ``ind`` is then
                recorded in ``stange_input`` (a zero-length sequence cannot be packed).
        """
        tokens = self.tokenizer.tokenize(seq.lower())
        lookup = self._token_to_index
        ids = [lookup[t] for t in tokens if t in lookup]
        if not ids:
            self.stange_input.add(ind)
            raise ValueError
        ids = ids[:self.max_length]
        length = len(ids)
        ids.extend([self._PAD_ID] * (self.max_length - length))
        return torch.tensor(ids, dtype=torch.long), torch.tensor(length, dtype=torch.long)

In [None]:
# Train / validation / test datasets for the RNN, built from the preprocessed splits;
# sequences are padded or truncated to 10 token ids and returned with their lengths.
train_datasetRNN = TextDatasetRNN(dir_json='.//train.jsonl',
                            tokenizer=WordPunctTokenizer(),
                            word2vec=word2vec,
                            max_length=10)
val_datasetRNN = TextDatasetRNN('.//val.jsonl',
                          tokenizer=WordPunctTokenizer(),
                          word2vec=word2vec,
                          max_length=10)
test_datasetRNN = TextDatasetRNN('.//test.jsonl',
                           tokenizer=WordPunctTokenizer(),
                           word2vec=word2vec,
                           max_length=10)

In [None]:
# Batches of 1000 pairs; only the training loader shuffles.
train_loaderRNN = DataLoader(train_datasetRNN, batch_size=1000, shuffle=True)
val_loaderRNN = DataLoader(val_datasetRNN, batch_size=1000)
test_loaderRNN = DataLoader(test_datasetRNN, batch_size=1000)

### Info about GPU

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# Show the GPU status; the command reports an error when no NVIDIA driver is reachable.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



### Training CNN

In [None]:
# Upper bound on epochs and early-stopping patience for the CNN training loop below.
max_epochs = 30
patience = 3

In [None]:
# Train the CNN with early stopping on the validation loss.
# NOTE(review): in top-to-bottom execution the RNN section rebinds `model`, `opt` and
# `lr_scheduler` before this cell runs — confirm the CNN objects are the ones in scope.
min_loss = np.inf
cur_patience = 0
# model.state_dict() returns references to the live tensors, so the best weights must
# be cloned; otherwise the "best" snapshot silently follows later updates and the
# final load_state_dict restores nothing. Seeding it here also keeps `best_model`
# defined if the validation loss never improves (e.g. NaN).
best_model = {name: weights.detach().clone()
              for name, weights in model.state_dict().items()}

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_loaderCNN), total=len(train_loaderCNN), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    # Embeddings are frozen for the first 5 epochs and fine-tuned afterwards.
    if epoch > 5:
        freeze_emb(True)
    else:
        freeze_emb()
    for it, batch in pbar:
        with torch.set_grad_enabled(True):
            opt.zero_grad()

            data = {
                'seq1': batch['seq1'].to(device),
                'seq2': batch['seq2'].to(device)
            }
            label = batch['label'].to(device)

            output = model(data)
            loss = loss_func(output, label)
            loss.backward()
            train_loss += loss.item()
            opt.step()
            # The schedule advances once per batch, not once per epoch.
            lr_scheduler.step()

    train_loss /= len(train_loaderCNN)

    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(val_loaderCNN), total=len(val_loaderCNN), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        with torch.no_grad():
            data = {
                'seq1': batch['seq1'].to(device),
                'seq2': batch['seq2'].to(device)
            }
            label = batch['label'].to(device)
            output = model(data)

            loss = loss_func(output, label)
            val_loss += loss.item()

    val_loss /= len(val_loaderCNN)

    # Report before the early-stopping check so the final epoch is logged too.
    print(f'Epoch: {epoch}, Training Loss: {train_loss:.6f}, Validation Loss: {val_loss:.6f}')

    if val_loss < min_loss:
        min_loss = val_loss
        best_model = {name: weights.detach().clone()
                      for name, weights in model.state_dict().items()}
        cur_patience = 0
    else:
        cur_patience += 1
        if cur_patience == patience:
            break

# Restore the weights from the epoch with the lowest validation loss.
model.load_state_dict(best_model)

### Accuracy on test CNN

In [None]:
# Switch to inference mode (disables dropout) before measuring accuracy.
model.eval()

CNN(
  (embedding): Embedding(400000, 300)
  (conv_0_1): Conv1d(300, 64, kernel_size=(2,), stride=(2,), padding=(1,))
  (conv_1_1): Conv1d(300, 64, kernel_size=(3,), stride=(2,), padding=(1,))
  (conv_2_1): Conv1d(300, 64, kernel_size=(4,), stride=(2,), padding=(1,))
  (conv_0_2): Conv1d(300, 64, kernel_size=(2,), stride=(2,), padding=(1,))
  (conv_1_2): Conv1d(300, 64, kernel_size=(3,), stride=(2,), padding=(1,))
  (conv_2_2): Conv1d(300, 64, kernel_size=(4,), stride=(2,), padding=(1,))
  (fc): Linear(in_features=384, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
# Accuracy of the current model on the CNN test loader.
count = 0
corr = 0
with torch.no_grad():
    for batch in test_loaderCNN:
        data = {key: batch[key].to(device) for key in ('seq1', 'seq2')}
        label = batch['label'].to(device)
        output = model(data)
        pred = output.argmax(dim=1)
        count += pred.shape[0]
        # Kept as a CPU tensor so the printed ratio is a tensor as well.
        corr += (pred == label).sum().cpu()
print(corr / (count))

tensor(0.7072)


In [None]:
# Accuracy of the current model on the CNN validation loader.
count = 0
corr = 0
with torch.no_grad():
    for batch in val_loaderCNN:
        data = {key: batch[key].to(device) for key in ('seq1', 'seq2')}
        label = batch['label'].to(device)
        output = model(data)
        pred = output.argmax(dim=1)
        count += pred.shape[0]
        # Kept as a CPU tensor so the printed ratio is a tensor as well.
        corr += (pred == label).sum().cpu()
print(corr / (count))

tensor(0.6972)


### Training RNN

In [None]:
# Upper bound on epochs and early-stopping patience for the RNN training loop below
# (overrides the values set in the RNN configuration cells).
max_epochs = 30
patience = 3

In [None]:
# Train the RNN with early stopping on the validation loss.
min_loss = np.inf
cur_patience = 0
# model.state_dict() returns references to the live tensors, so the best weights must
# be cloned; otherwise the "best" snapshot silently follows later updates and the
# final load_state_dict restores nothing. Seeding it here also keeps `best_model`
# defined if the validation loss never improves (e.g. NaN).
best_model = {name: weights.detach().clone()
              for name, weights in model.state_dict().items()}

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_loaderRNN), total=len(train_loaderRNN), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        with torch.set_grad_enabled(True):

            opt.zero_grad()
            label = batch['label'].to(device)
            # Ids go to the model's device; lengths stay on the CPU for packing.
            data = {
                'seq1': (
                    batch['seq1'][0].to(device),
                    batch['seq1'][1].cpu()
                ),
                'seq2': (
                    batch['seq2'][0].to(device),
                    batch['seq2'][1].cpu()
                )
            }
            output = model(data)

            loss = loss_func(output, label)
            loss.backward()
            train_loss += loss.item()

            # NOTE(review): lr_scheduler is created for the RNN but never stepped
            # here — confirm whether that is intended.
            opt.step()

    train_loss /= len(train_loaderRNN)

    val_loss = 0.0
    model.eval()

    pbar = tqdm(enumerate(val_loaderRNN), total=len(val_loaderRNN), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        with torch.no_grad():
            label = batch['label'].to(device)
            data = {
                'seq1': (
                    batch['seq1'][0].to(device),
                    batch['seq1'][1].cpu()
                ),
                'seq2': (
                    batch['seq2'][0].to(device),
                    batch['seq2'][1].cpu()
                )
            }
            output = model(data)

            loss = loss_func(output, label)
            val_loss += loss.item()

    val_loss /= len(val_loaderRNN)

    # Report before the early-stopping check so the final epoch is logged too.
    print(f'Epoch: {epoch}, Training Loss: {train_loss:.6f}, Validation Loss: {val_loss:.6f}')

    if val_loss < min_loss:
        min_loss = val_loss
        best_model = {name: weights.detach().clone()
                      for name, weights in model.state_dict().items()}
        # Patience counts consecutive non-improving epochs, as in the CNN loop.
        cur_patience = 0
    else:
        cur_patience += 1
        if cur_patience == patience:
            break

# Restore the weights from the epoch with the lowest validation loss.
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 1, Training Loss: 0.784173, Validation Loss: 0.754188


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 2, Training Loss: 0.735917, Validation Loss: 0.738447


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 3, Training Loss: 0.698039, Validation Loss: 0.736609


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 4, Training Loss: 0.658207, Validation Loss: 0.739840


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 5, Training Loss: 0.611051, Validation Loss: 0.769931


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

<All keys matched successfully>

### Accuracy on RNN

In [None]:
# Accuracy of the current model on the RNN test loader.
count = 0
corr = 0
with torch.no_grad():
    for batch in test_loaderRNN:
        label = batch['label'].to(device)
        # Ids go to the model's device; lengths stay on the CPU for packing.
        data = {key: (batch[key][0].to(device), batch[key][1].cpu())
                for key in ('seq1', 'seq2')}
        output = model(data)
        pred = output.argmax(dim=1)
        count += pred.shape[0]
        # Kept as a CPU tensor so the printed ratio is a tensor as well.
        corr += (pred == label).sum().cpu()
print(corr / (count))

tensor(0.6779)


In [None]:
# Accuracy of the current model on the RNN validation loader.
count = 0
corr = 0
with torch.no_grad():
    for batch in val_loaderRNN:
        label = batch['label'].to(device)
        # Ids go to the model's device; lengths stay on the CPU for packing.
        data = {key: (batch[key][0].to(device), batch[key][1].cpu())
                for key in ('seq1', 'seq2')}
        output = model(data)
        pred = output.argmax(dim=1)
        count += pred.shape[0]
        # Kept as a CPU tensor so the printed ratio is a tensor as well.
        corr += (pred == label).sum().cpu()
print(corr / (count))

tensor(0.6800)


### Save model

In [None]:
# Persist the weights of the current `model` (the RNN at this point) to the working directory.
torch.save(model.state_dict(), './/modelRNN.pt')

In [None]:
# Back the checkpoint up to Google Drive.
shutil.copy('.//modelRNN.pt', './/drive//MyDrive//challenge//')

'.//drive//MyDrive//challenge//modelRNN.pt'

### Create Meta Algorithm

In [None]:
# Build a dedicated CNN for the ensemble. `model1 = model` only aliases whatever
# `model` currently is (the RNN after the sections above), so the CNN checkpoint
# loaded below would not fit it. Hyper-parameters are the ones from the CNN cell;
# the weights are filled in by load_state_dict further down.
model1 = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
             kernel_sizes=kernel_sizes, dropout=dropout).to(device)  # CNN

In [None]:
# Alias of the current `model`; this is only the RNN if the RNN cells were the last
# ones to bind `model` — NOTE(review): confirm execution order.
model2 = model # RNN

In [None]:
# Fetch both checkpoints from Drive.
# NOTE(review): no cell in this part of the notebook writes modelCNN.pt — it is
# presumably saved by an earlier run; confirm it exists on Drive.
shutil.copy('.//drive//MyDrive//challenge//modelCNN.pt', './/')
shutil.copy('.//drive//MyDrive//challenge//modelRNN.pt', './/')

'.//modelRNN.pt'

In [None]:
# Load the CNN weights, mapping tensors to the CPU regardless of where they were saved.
model1.load_state_dict(torch.load('.//modelCNN.pt', map_location=torch.device('cpu')))
# RNN weights are not reloaded here; model2 keeps whatever weights it already has.
# model2.load_state_dict(torch.load('.//modelRNN.pt'))

<All keys matched successfully>

In [None]:
# Put the CNN in inference mode (dropout off).
# NOTE(review): model2 is not switched to eval mode here; it only runs without
# dropout if an earlier cell left it in eval mode — confirm.
model1.eval()
# model2.eval()

CNN(
  (embedding): Embedding(400000, 300)
  (conv_0_1): Conv1d(300, 64, kernel_size=(2,), stride=(2,), padding=(1,))
  (conv_1_1): Conv1d(300, 64, kernel_size=(3,), stride=(2,), padding=(1,))
  (conv_2_1): Conv1d(300, 64, kernel_size=(4,), stride=(2,), padding=(1,))
  (conv_0_2): Conv1d(300, 64, kernel_size=(2,), stride=(2,), padding=(1,))
  (conv_1_2): Conv1d(300, 64, kernel_size=(3,), stride=(2,), padding=(1,))
  (conv_2_2): Conv1d(300, 64, kernel_size=(4,), stride=(2,), padding=(1,))
  (fc): Linear(in_features=384, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
# Accumulators for the meta-classifier's training set. The first X row is a 6-column
# placeholder that gives np.vstack something to append to; downstream cells drop it
# with [1:, :].
meta_train_X_dataRNN = np.array([[1, 2, 3, 4, 5, 6]])
meta_train_y_dataRNN = np.array([])

In [None]:
# Run both base models over the training loader and collect their 3 + 3 logits as
# meta-features. Per-batch results are gathered in lists and stacked once at the end;
# stacking inside the loop re-copies the whole growing array on every batch.
batch_features = []
batch_labels = []
with torch.no_grad():
    pbar = tqdm(enumerate(train_loaderRNN), total=len(train_loaderRNN), leave=False)
    for it, batch in pbar:
        label = batch['label'].cpu()
        # The CNN takes ids only.
        data1 = {
            'seq1': batch['seq1'][0].to(device),
            'seq2': batch['seq2'][0].to(device)
        }
        # The RNN takes (ids, lengths); lengths stay on the CPU for packing.
        data2 = {
            'seq1': (
                batch['seq1'][0].to(device),
                batch['seq1'][1].cpu()
            ),
            'seq2': (
                batch['seq2'][0].to(device),
                batch['seq2'][1].cpu()
            )
        }
        output1 = model1(data1)
        output2 = model2(data2)
        batch_features.append(np.hstack((output1.cpu().numpy(), output2.cpu().numpy())))
        batch_labels.append(label.numpy())

# Append to the existing accumulators (placeholder row first), as before.
meta_train_X_dataRNN = np.vstack([meta_train_X_dataRNN] + batch_features)
meta_train_y_dataRNN = np.concatenate([meta_train_y_dataRNN] + batch_labels)

HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))

In [None]:
# Accumulators for the meta-classifier's validation set; the first X row is a
# placeholder that downstream cells drop with [1:, :].
meta_val_X_data = np.array([[1, 2, 3, 4, 5, 6]])
meta_val_y_data = np.array([])

In [None]:
# Run both base models over the validation loader and collect their 3 + 3 logits as
# meta-features. Per-batch results are gathered in lists and stacked once at the end;
# stacking inside the loop re-copies the whole growing array on every batch.
batch_features = []
batch_labels = []
with torch.no_grad():
    pbar = tqdm(enumerate(val_loaderRNN), total=len(val_loaderRNN), leave=False)
    for it, batch in pbar:
        label = batch['label'].cpu()
        # The CNN takes ids only.
        data1 = {
            'seq1': batch['seq1'][0].to(device),
            'seq2': batch['seq2'][0].to(device)
        }
        # The RNN takes (ids, lengths); lengths stay on the CPU for packing.
        data2 = {
            'seq1': (
                batch['seq1'][0].to(device),
                batch['seq1'][1].cpu()
            ),
            'seq2': (
                batch['seq2'][0].to(device),
                batch['seq2'][1].cpu()
            )
        }
        output1 = model1(data1)
        output2 = model2(data2)
        batch_features.append(np.hstack((output1.cpu().numpy(), output2.cpu().numpy())))
        batch_labels.append(label.numpy())

# Append to the existing accumulators (placeholder row first), as before.
meta_val_X_data = np.vstack([meta_val_X_data] + batch_features)
meta_val_y_data = np.concatenate([meta_val_y_data] + batch_labels)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In [None]:
# Accumulators for the meta-classifier's test set; the first X row is a placeholder
# that downstream cells drop with [1:, :].
meta_test_X_data = np.array([[1, 2, 3, 4, 5, 6]])
meta_test_y_data = np.array([])

In [None]:
# Run both base models over the test loader and collect their 3 + 3 logits as
# meta-features. Per-batch results are gathered in lists and stacked once at the end;
# stacking inside the loop re-copies the whole growing array on every batch.
batch_features = []
batch_labels = []
with torch.no_grad():
    pbar = tqdm(enumerate(test_loaderRNN), total=len(test_loaderRNN), leave=False)
    for it, batch in pbar:
        label = batch['label'].cpu()
        # The CNN takes ids only.
        data1 = {
            'seq1': batch['seq1'][0].to(device),
            'seq2': batch['seq2'][0].to(device)
        }
        # The RNN takes (ids, lengths); lengths stay on the CPU for packing.
        data2 = {
            'seq1': (
                batch['seq1'][0].to(device),
                batch['seq1'][1].cpu()
            ),
            'seq2': (
                batch['seq2'][0].to(device),
                batch['seq2'][1].cpu()
            )
        }
        output1 = model1(data1)
        output2 = model2(data2)
        batch_features.append(np.hstack((output1.cpu().numpy(), output2.cpu().numpy())))
        batch_labels.append(label.numpy())

# Append to the existing accumulators (placeholder row first), as before.
meta_test_X_data = np.vstack([meta_test_X_data] + batch_features)
meta_test_y_data = np.concatenate([meta_test_y_data] + batch_labels)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

In [None]:
# Meta-features (placeholder row dropped) with the label appended as the last column.
train_df = pd.DataFrame(np.hstack((meta_train_X_dataRNN[1:, :], meta_train_y_dataRNN[:, np.newaxis])))

In [None]:
# Meta-features (placeholder row dropped) with the label appended as the last column.
val_df = pd.DataFrame(np.hstack((meta_val_X_data[1:, :], meta_val_y_data[:, np.newaxis])))

In [None]:
# Meta-features (placeholder row dropped) with the label appended as the last column.
# Note: `test_df` is rebound to a predictions-only frame near the end of the notebook.
test_df = pd.DataFrame(np.hstack((meta_test_X_data[1:, :], meta_test_y_data[:, np.newaxis])))

In [None]:
# Write the three meta-feature tables to CSV in the working directory.
train_df.to_csv('train_meta.csv')
val_df.to_csv('val_meta.csv')
test_df.to_csv('test_meta.csv')

In [None]:
# Back the meta-feature tables up to Google Drive.
shutil.copy('.//train_meta.csv', './/drive//MyDrive//challenge//')
shutil.copy('.//val_meta.csv', './/drive//MyDrive//challenge//')
shutil.copy('.//test_meta.csv', './/drive//MyDrive//challenge//')

'.//drive//MyDrive//challenge//test_meta.csv'

In [None]:
# Gradient-boosted meta-classifier with 1000 trees over the 6 base-model logits.
xgbclf = XGBClassifier(n_estimators=1000)

In [None]:
# Fit on the training meta-features; [1:, :] drops the placeholder first row.
xgbclf.fit(meta_train_X_dataRNN[1:, :], meta_train_y_dataRNN)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Accuracy of the stacked model on the validation split.
xgbclf.score(meta_val_X_data[1:, :], meta_val_y_data)

0.7064

In [None]:
# Accuracy of the stacked model on the test split.
xgbclf.score(meta_test_X_data[1:, :], meta_test_y_data)

0.7026

In [None]:
# Support-weighted F1 of the stacked model on the test split.
f1_score(meta_test_y_data, xgbclf.predict(meta_test_X_data[1:, :]), average='weighted')

0.7024548642879651

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(meta_test_y_data, xgbclf.predict(meta_test_X_data[1:, :])))

              precision    recall  f1-score   support

         0.0       0.68      0.67      0.68      3265
         1.0       0.70      0.71      0.71      3307
         2.0       0.72      0.73      0.72      3428

    accuracy                           0.70     10000
   macro avg       0.70      0.70      0.70     10000
weighted avg       0.70      0.70      0.70     10000



In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(meta_val_y_data, xgbclf.predict(meta_val_X_data[1:, :])))

              precision    recall  f1-score   support

         0.0       0.69      0.67      0.68      3274
         1.0       0.71      0.72      0.71      3343
         2.0       0.72      0.73      0.73      3383

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000



In [None]:
def _second_record(handle):
    """Parse lines of ``handle`` up to and including its second line; return the last one parsed.

    Returns an empty list when the file has no lines, and the first record when it
    has only one.
    """
    record = []
    for position, raw in enumerate(handle):
        record = json.loads(raw)
        if position >= 1:
            break
    return record


# Grab the same example from the raw SNLI file and from the preprocessed file so the
# effect of preprocessing can be compared side by side.
with open('train.jsonl') as preprocessed, open('.//snli_1.0//snli_1.0_train.jsonl') as file_json:
    data_before = _second_record(file_json)
    data_after = _second_record(preprocessed)

In [None]:
from pprint import pprint

In [None]:
# Raw SNLI record, including the parse-tree fields.
print('Data before:')
pprint(data_before)

Data before:
{'annotator_labels': ['contradiction'],
 'captionID': '3416050480.jpg#4',
 'gold_label': 'contradiction',
 'pairID': '3416050480.jpg#4r1c',
 'sentence1': 'A person on a horse jumps over a broken down airplane.',
 'sentence1_binary_parse': '( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( '
                           'over ( a ( broken ( down airplane ) ) ) ) ) . ) )',
 'sentence1_parse': '(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT '
                    'a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) '
                    '(JJ broken) (JJ down) (NN airplane)))) (. .)))',
 'sentence2': 'A person is at a diner, ordering an omelette.',
 'sentence2_binary_parse': '( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) '
                           '( ordering ( an omelette ) ) ) . ) )',
 'sentence2_parse': '(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) '
                    '(NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP '
             

In [None]:
# Same example after preprocessing: only the label and the two sentences remain.
print('Data after:')
pprint(data_after)

Data after:
{'gold_label': 'contradiction',
 'sentence1': 'a person on a horse jump over a broken down airplane .',
 'sentence2': 'a person be at a diner , order an omelette .'}


In [None]:
# Accumulators for the CNN's predicted classes and the true labels on the test set.
preds = np.array([])
true = np.array([])

In [None]:
# Predict test-set classes with the CNN (model1) only. The RNN-style input dict the
# original cell also built was never used and has been dropped.
pred_chunks = []
true_chunks = []
with torch.no_grad():
    pbar = tqdm(enumerate(test_loaderRNN), total=len(test_loaderRNN), leave=False)
    for it, batch in pbar:
        label = batch['label'].cpu()
        # The RNN loader yields (ids, lengths) pairs; the CNN only needs the ids.
        data1 = {
            'seq1': batch['seq1'][0].to(device),
            'seq2': batch['seq2'][0].to(device)
        }
        output1 = model1(data1)
        pred_chunks.append(torch.argmax(output1, dim=1).cpu().numpy())
        true_chunks.append(label.numpy())

# Append to the existing accumulators in one step.
preds = np.concatenate([preds] + pred_chunks)
true = np.concatenate([true] + true_chunks)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))



In [None]:
import pandas as pd

In [None]:
# Sanity check: one prediction per test example.
preds.shape

(10000,)

In [None]:
# Predicted class ids as a single-column frame; rebinds `test_df` from the
# meta-feature table built earlier.
test_df = pd.DataFrame(preds.astype(np.int16), columns=['class'])

In [None]:
# Display the predictions table.
test_df

Unnamed: 0,class
0,2
1,2
2,2
3,2
4,0
...,...
9995,1
9996,2
9997,1
9998,2


In [None]:
# Write the predicted classes to test.csv in the working directory.
test_df.to_csv('test.csv')