In [1]:
import numpy as np
import scipy.stats as sps
import pandas as pd
import jsonlines
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F

from matplotlib import pyplot
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.manifold import TSNE
from sklearn.svm import SVC

---
Загружаем данные

In [2]:
train_data = pd.read_json("train.jsonl", lines=True, orient="records")
val_data = pd.read_json("val.jsonl", lines=True, orient="records")
test_data = pd.read_json("test.jsonl", lines=True, orient="records")

train_data['label'] = train_data['label'].apply(lambda x: 1 if x == 'entailment' else 0)
val_data['label'] = val_data['label'].apply(lambda x: 1 if x == 'entailment' else 0)

In [3]:
train_data.head()

Unnamed: 0,premise,hypothesis,label,idx
0,"Женщину доставили в больницу, за ее жизнь сейч...",Женщину спасают врачи.,1,0
1,Он проводит невидимую грань между настоящим и ...,В эти минуты все мы подводим друг друга.,0,1
2,"Мужчина рассказал: детская коляска, принадлежа...",Сосед часто крадет детские коляски ради денег.,0,2
3,"Я просто об этом даже не думаю, потому что есл...",Спрятаться не удастся.,1,3
4,В ходе проверки нашли дома с наледью и сосульк...,Все сосульки с крыш были сбиты.,0,4


In [4]:
print('max length of train premise is: %d' % np.max([len(s) for s in train_data.premise]))
print('max length of val premise is: %d' % np.max([len(s) for s in val_data.premise]))
print('max length of test premise is: %d' % np.max([len(s) for s in test_data.premise]))

print('max length of train hypothesis is: %d' % np.max([len(s) for s in train_data.hypothesis]))
print('max length of val hypothesis is: %d' % np.max([len(s) for s in val_data.hypothesis]))
print('max length of test hypothesis is: %d' % np.max([len(s) for s in test_data.hypothesis]))

max length of train premise is: 945
max length of val premise is: 717
max length of test premise is: 945
max length of train hypothesis is: 179
max length of val hypothesis is: 129
max length of test hypothesis is: 144


In [5]:
max_len_premise = 161
max_len_hypothesis = 30

---
##### Part 0. Get embeddings

In [6]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
bmodel = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence", return_dict=True)
bmodel.eval()
_ = bmodel.to('cuda')

In [54]:
def get_embeds(data):
    result = []
    for tokens in tqdm(data):
        cnt_tokens = len(tokens)
        embeds = bmodel(**tokenizer(tokens, return_tensors='pt', padding=True).to('cuda'))
        embeds = embeds.last_hidden_state[0][0].cpu().detach().numpy().reshape(-1)
        result.append(embeds)
    result = np.asarray(result)
    return result

In [55]:
train_premise = get_embeds(train_data.premise)
train_hypothesis = get_embeds(train_data.hypothesis)
train_X = np.concatenate((train_premise, train_hypothesis), axis=1)
train_y = np.array(train_data['label'])

HBox(children=(FloatProgress(value=0.0, max=2616.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2616.0), HTML(value='')))




In [56]:
val_premise = get_embeds(val_data.premise)
val_hypothesis = get_embeds(val_data.hypothesis)
val_X = np.concatenate((val_premise, val_hypothesis), axis=1)
val_y = np.array(val_data['label'])

HBox(children=(FloatProgress(value=0.0, max=307.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=307.0), HTML(value='')))




In [57]:
test_premise = get_embeds(test_data.premise)
test_hypothesis = get_embeds(test_data.hypothesis)
test_X = np.hstack((test_premise, test_hypothesis))

HBox(children=(FloatProgress(value=0.0, max=3198.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3198.0), HTML(value='')))




---
##### Part 1. Explore your data

In [18]:
np.vstack((train_X[:, :768], train_X[:, 768:])).shape

(5232, 768)

In [19]:
%time
tsne = TSNE(n_components=1, random_state=0)
vec_rep = tsne.fit_transform(np.vstack((train_X[:, :768], train_X[:, 768:])))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


In [20]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="ru-bert T-SNE")

source = ColumnDataSource(data=dict(x1=vec_rep[:train_X.shape[0]],
                                    x2=vec_rep[train_X.shape[0]:],
                                    names=train_y))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

Большое количество точек расопложились по диагонали. Было бы ожидаемо, если бы метки у них были "entailment", но это не так.

##### Part 3. Make first classifier
---
LogisticRegression

In [46]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
pred = clf.predict(val_X)

In [54]:
accuracy_score(val_y, pred)

0.46905537459283386

---
SVM

In [46]:
clf = SVC(gamma='auto')

params_space = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

scorer = make_scorer(accuracy_score)

rs = RandomizedSearchCV(clf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=scorer)

In [33]:
rs.fit(train_X, train_y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   43.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto', kernel='rbf',
                                 max_iter=-1, probability=False,
                                 random_state=None, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=50, n_jobs=-1,
                   param_distributions={'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False,
                   scoring=make_scorer(accuracy_score), verbose=1)

In [34]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

best params: {'kernel': 'sigmoid'}
best CV score: 0.5948012232415902


In [58]:
clf = SVC(gamma='auto', kernel='sigmoid')
clf.fit(train_X, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [59]:
pred = clf.predict(val_X)

In [60]:
accuracy_score(val_y, pred)

0.5798045602605864

---
make submit

In [24]:
def make_submit(pred, path):
    pred_names = ['not_entailment' if p == 0 else 'entailment' for p in pred]
    submission = pd.DataFrame({'Id': np.arange(len(pred)), 'Category': pred_names})
    submission.to_csv(path + '.csv', index=False)

In [40]:
pred = clf.predict(test_X)
make_submit(pred, 'dp_rubert_freezed_svm')

---

In [66]:
from sklearn.metrics.pairwise import cosine_similarity

In [85]:
pred = cosine_similarity(val_X[:, :768], val_X[:, 768:]).diagonal() > 0.6

In [86]:
pred = np.array(pred, dtype=np.int32)

In [87]:
accuracy_score(pred, val_y)

0.5602605863192183

In [88]:
pred = cosine_similarity(test_X[:, :768], test_X[:, 768:]).diagonal() > 0.6
make_submit(pred, 'dp_rubert_freezed_cosine_sim')

---
##### Part 4. Finetune on Classification task

In [6]:
train_X = np.array(list(zip(list(train_data.premise), list(train_data.hypothesis))))
train_y = np.array(train_data['label'])

val_X = np.array(list(zip(list(val_data.premise), list(val_data.hypothesis))))
val_y = np.array(val_data['label'])

test_X = np.array(list(zip(list(test_data.premise), list(test_data.hypothesis))))

In [7]:
BATCH_SIZE = 8
EPOCHS = 100
DEVICE = torch.device('cuda')

In [8]:
def iterate_minibatches(data, batch_size=256, shuffle=True):
    """ iterates minibatches of data in random order """
    indices = np.arange(len(data[0]))
    if shuffle:
        indices = np.random.permutation(indices)

    for start in range(0, len(indices), batch_size):
        batch = [data[0][indices[start : start + batch_size]], data[1][indices[start : start + batch_size]]]
        yield batch

In [9]:
def print_metrics(model, data, batch_size=BATCH_SIZE, name="", device=torch.device('cuda')):
    loss = accuracy = num_samples = 0.0
    model.bmodel.eval()
    with torch.no_grad():
        for batch in iterate_minibatches(data, batch_size=batch_size, shuffle=False):
            pred = model(batch)
#             print(pred)
            y = torch.tensor(batch[1], dtype=torch.long, device=DEVICE)
            loss += criterion(pred.float(), y)
            accuracy += torch.mean((torch.argmax(pred, axis=-1).float() == y.float()).float())
            num_samples += 1
            
    loss = loss.detach().cpu().numpy() / num_samples
    accuracy = accuracy / num_samples
    print("%s val results:" % (name or ""))
    print("loss: %.5f" % loss)
    print("accuracy: %.5f" % accuracy.detach().cpu().numpy())
    return loss, accuracy

In [10]:
class NNclf(nn.Module):
    def __init__(self, emb_dim=768, device=torch.device('cuda')):
        super().__init__()
        
        self.device = device
        
        self.tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
        self.bmodel = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence", return_dict=True)
        self.bmodel.eval()
        _ = self.bmodel.to('cuda')
        
        for param in self.bmodel.parameters():
            param.requires_grad = False
        
        self.linear = nn.Linear(emb_dim, emb_dim)
        self.transformation = nn.Sequential(
            nn.Linear(emb_dim, emb_dim),
            nn.Tanh()
        )
        
        self.dropout = torch.nn.Dropout(0.3)
        
        self.final_predictor = nn.Sequential(
            nn.Linear(emb_dim, 128),
            nn.Tanh(),
            nn.Linear(128, 2)
        )
        
    def forward(self, batch):
        premise = list(batch[0][:, 0])
        hypothesis = list(batch[0][:, 1])
        
        embeds_premise = self.bmodel(**self.tokenizer(premise, return_tensors='pt', padding=True).to('cuda'))
        embeds_premise = embeds_premise.last_hidden_state[:, 0, :]
        
        embeds_hypothesis = self.bmodel(**self.tokenizer(hypothesis, return_tensors='pt', padding=True).to('cuda'))
        embeds_hypothesis = embeds_hypothesis.last_hidden_state[:, 0, :]
        
        embeds = self.dropout(self.transformation(embeds_premise) * self.transformation(embeds_hypothesis))
        
        return self.final_predictor(embeds)
    
    def predict(self, data):
        answer = []
        for item in data:
            pred = torch.argmax(self.forward(np.array([[item]])))
            answer.append(pred.detach().cpu().numpy())
        return answer

In [15]:
sum(p.numel() for p in model.parameters())

179133314

In [11]:
model = NNclf(device=DEVICE).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in tqdm(range(EPOCHS)):
    
#     model.bmodel.train()
    epoch_loss = 0
    epoch_accuracy = 0
    iterations = 0
    for i, batch in tqdm(enumerate(iterate_minibatches([train_X, train_y], batch_size=BATCH_SIZE))):
        pred = model(batch)
        y = torch.tensor(batch[1], dtype=torch.long, device=DEVICE)
        loss = criterion(pred, y)
        accuracy = torch.mean((torch.argmax(pred, axis=-1).float() == y.float()).float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss
        epoch_accuracy += accuracy
        iterations += 1
    
#     if epoch % 10 == 0:
    print(f"epoch: {epoch}")
    print("train results:")
    print("loss: ", epoch_loss.detach().cpu().numpy() / iterations)
    print("accuracy:", epoch_accuracy.detach().cpu().numpy() / iterations)    
    print_metrics(model, [val_X, val_y])

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 0
train results:
loss:  0.6911035006928516
accuracy: 0.5271406727828746
 val results:
loss: 0.68592
accuracy: 0.55556


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 1
train results:
loss:  0.6796828337036506
accuracy: 0.5833333333333334
 val results:
loss: 0.67363
accuracy: 0.59402


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 2
train results:
loss:  0.6718139181807865
accuracy: 0.6035932721712538
 val results:
loss: 0.66693
accuracy: 0.58654


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 3
train results:
loss:  0.6657058762483276
accuracy: 0.6127675840978594
 val results:
loss: 0.66581
accuracy: 0.61325


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 4
train results:
loss:  0.6589651311938551
accuracy: 0.6131498470948012
 val results:
loss: 0.66161
accuracy: 0.60577


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 5
train results:
loss:  0.6534167181825784
accuracy: 0.6123853211009175
 val results:
loss: 0.65678
accuracy: 0.59081


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 6
train results:
loss:  0.6485546707013331
accuracy: 0.6238532110091743
 val results:
loss: 0.65785
accuracy: 0.59722


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 7
train results:
loss:  0.6440097913829559
accuracy: 0.6334097859327217
 val results:
loss: 0.65805
accuracy: 0.60043


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 8
train results:
loss:  0.6384674912198968
accuracy: 0.6452599388379205
 val results:
loss: 0.65806
accuracy: 0.57372


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 9
train results:
loss:  0.6334357305404243
accuracy: 0.6452599388379205
 val results:
loss: 0.66354
accuracy: 0.56731


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 10
train results:
loss:  0.6302684352303135
accuracy: 0.6487003058103975
 val results:
loss: 0.65786
accuracy: 0.58333


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 11
train results:
loss:  0.6254577170089115
accuracy: 0.658256880733945
 val results:
loss: 0.66407
accuracy: 0.56838


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 12
train results:
loss:  0.6239184458321387
accuracy: 0.6574923547400612
 val results:
loss: 0.66701
accuracy: 0.54701


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 13
train results:
loss:  0.6178762453411697
accuracy: 0.6594036697247706
 val results:
loss: 0.67193
accuracy: 0.55983


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 14
train results:
loss:  0.6132860096222764
accuracy: 0.6662844036697247
 val results:
loss: 0.66888
accuracy: 0.55556


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 15
train results:
loss:  0.6106463791033544
accuracy: 0.6758409785932722
 val results:
loss: 0.67098
accuracy: 0.55876


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 16
train results:
loss:  0.6063926766771789
accuracy: 0.6788990825688074
 val results:
loss: 0.66738
accuracy: 0.57585


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




KeyboardInterrupt: 

In [55]:
predicted_classes = model.predict(test_X)

make_submit(predicted_classes, 'dp_rubert_finetuned_nnclassifier')

---

In [21]:
class NNclf2(nn.Module):
    def __init__(self, emb_dim=768, device=torch.device('cuda')):
        super().__init__()
        
        self.device = device
        
        self.tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
        self.bmodel = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence", return_dict=True)
#         self.bmodel.eval()
        _ = self.bmodel.to('cuda')
        
#         for param in self.bmodel.base_model.parameters():
#             param.requires_grad = False
        
        self.final_predictor = nn.Sequential(
            nn.Linear(2 * emb_dim, 128),
            nn.Tanh(),
            nn.Linear(128, 2)
        )
        
    def forward(self, batch):
        premise = list(batch[0][:, 0])
        hypothesis = list(batch[0][:, 1])
        
        embeds_premise = self.bmodel(**self.tokenizer(premise, return_tensors='pt', padding=True).to('cuda'))
        embeds_premise = embeds_premise.last_hidden_state[:, 0, :]
        
        embeds_hypothesis = self.bmodel(**self.tokenizer(hypothesis, return_tensors='pt', padding=True).to('cuda'))
        embeds_hypothesis = embeds_hypothesis.last_hidden_state[:, 0, :]
        
        return self.final_predictor(torch.cat((embeds_premise, embeds_hypothesis), dim=-1))
        
    
    def predict(self, data):
        answer = []
        for item in data:
            pred = torch.argmax(self.forward(np.array([[item]])))
            answer.append(pred.detach().cpu().numpy())
        return answer

In [22]:
model = NNclf2(device=DEVICE).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in tqdm(range(EPOCHS)):
    
    model.bmodel.train()
    epoch_loss = 0
    epoch_accuracy = 0
    iterations = 0
    for i, batch in tqdm(enumerate(iterate_minibatches([train_X, train_y], batch_size=BATCH_SIZE))):
        pred = model(batch)
        y = torch.tensor(batch[1], dtype=torch.long, device=DEVICE)
        loss = criterion(pred, y)
        accuracy = torch.mean((torch.argmax(pred, axis=-1).float() == y.float()).float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss
        epoch_accuracy += accuracy
        iterations += 1
    
#     if epoch % 10 == 0:
    print(f"epoch: {epoch}")
    print("train results:")
    print("loss: ", epoch_loss.detach().cpu().numpy() / iterations)
    print("accuracy:", epoch_accuracy.detach().cpu().numpy() / iterations)    
    print_metrics(model, [val_X, val_y])

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 0
train results:
loss:  0.6788626854572821
accuracy: 0.5814220183486238
 val results:
loss: 0.65845
accuracy: 0.57799


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 1
train results:
loss:  0.5740310715608276
accuracy: 0.7033639143730887
 val results:
loss: 0.73524
accuracy: 0.58868


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


epoch: 2
train results:
loss:  0.3681404510404721
accuracy: 0.8436544342507645
 val results:
loss: 1.01143
accuracy: 0.55449


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




KeyboardInterrupt: 