In [65]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr

In [66]:
df_train = pd.read_csv('eng_train.csv')
df_train

Unnamed: 0,PairID,Text,Score
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0
...,...,...,...
5495,ENG-train-5495,A young boy pounding on an anvil.\r\nWoman sit...,0.0
5496,ENG-train-5496,I love how he recognized his wife tempered his...,0.0
5497,ENG-train-5497,I actually read a chapter or two beyond that p...,0.0
5498,ENG-train-5498,A boy gives being in the snow two thumbs up.\r...,0.0


In [67]:
df_dev = pd.read_csv('eng_dev.csv')
df_dev

Unnamed: 0,PairID,Text
0,ENG-dev-0000,The story is gripping and interesting.\r\nIt's...
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...
2,ENG-dev-0002,and from your post i think you are to young to...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...
4,ENG-dev-0004,I am still confused about how I feel about thi...
...,...,...
245,ENG-dev-0245,thats just how they are :( its a shame lol\r\n...
246,ENG-dev-0246,I feel sorry for the books that I will read af...
247,ENG-dev-0247,Uwe Seeler -LRB- born 5 November 1936 in Hambu...
248,ENG-dev-0248,Waco is a city in and the county seat of McLen...


In [68]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

In [69]:
def compute_bert_embeddings(X, tokenizer, model):
    model.eval()

    bert_mean_embeddings = []
    bert_cls_embeddings = []
    for text in tqdm(X):
        marked_text = '[CLS] ' + text + ' [SEP]'
        tokenized_text = tokenizer.tokenize(marked_text)
        tokenized_text = tokenized_text[:512]

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(indexed_tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensor = torch.tensor([segments_ids])

        with torch.no_grad():
            #encoded_layers, _ = model(tokens_tensor, segments_tensor)
            encoded_layers = model(tokens_tensor, segments_tensor)

        mean_embedding = torch.mean(encoded_layers[-1], dim=1)
        cls_embedding = encoded_layers[-1][0][0]

        bert_mean_embeddings.append(mean_embedding.numpy())
        bert_cls_embeddings.append(cls_embedding.numpy())

    bert_mean_embeddings = np.array(bert_mean_embeddings).reshape((len(X), -1))
    bert_cls_embeddings = np.array(bert_cls_embeddings).reshape((len(X), -1))

    return bert_mean_embeddings, bert_cls_embeddings

In [86]:
from transformers import AutoTokenizer, AutoModel


tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-dot-v5")
model = AutoModel.from_pretrained("sentence-transformers/msmarco-distilbert-dot-v5")

In [78]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertModel.from_pretrained('bert-base-uncased')

In [79]:
def extract_part(X, index):
    return X.str.split('\n').str.__getitem__(index)

In [80]:
embeddings_1, _ = compute_bert_embeddings(
    extract_part(df_train.Text, 0), tokenizer, model
)

100%|██████████████████████████████████████████████████████████████████████████████| 5500/5500 [03:27<00:00, 26.51it/s]


In [81]:
embeddings_2, _ = compute_bert_embeddings(
    extract_part(df_train.Text, 1), tokenizer, model
)

100%|██████████████████████████████████████████████████████████████████████████████| 5500/5500 [03:28<00:00, 26.34it/s]


In [82]:
from numpy.linalg import norm

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (norm(v1) * norm(v2))

In [83]:
preds = [cos_sim(embeddings_1[i], embeddings_2[i]) for i in range(len(embeddings_1))]

In [84]:
def get_score(y_true, y_pred):
    return round(spearmanr(y_true, y_pred)[0], 2)

In [85]:
metric_value = get_score(df_train.Score, preds)
print("Spearman Correlation:", metric_value)

Spearman Correlation: 0.77


In [58]:
embeddings_1_dev, _ = compute_bert_embeddings(
    extract_part(df_dev.Text, 0), tokenizer, model
)

100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [00:10<00:00, 22.88it/s]


In [59]:
embeddings_2_dev, _ = compute_bert_embeddings(
    extract_part(df_dev.Text, 1), tokenizer, model
)

100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [00:12<00:00, 20.06it/s]


In [63]:
preds = [cos_sim(embeddings_1_dev[i], embeddings_2_dev[i]) for i in range(len(embeddings_1_dev))]

In [64]:
df_dev['Pred_Score'] = preds
df_dev.head()

Unnamed: 0,PairID,Text,Pred_Score
0,ENG-dev-0000,The story is gripping and interesting.\r\nIt's...,1.0
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,1.0
2,ENG-dev-0002,and from your post i think you are to young to...,1.0
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,1.0
4,ENG-dev-0004,I am still confused about how I feel about thi...,1.0


In [None]:
df_dev[['PairID', 'Pred_Score']].to_csv('pred_eng_a.csv', index=False)

In [None]:
import zipfile

csv_file = 'pred_eng_a.csv'

zip_file = 'bert_cos_sim.zip'

with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_file, arcname=csv_file)

print(f'{csv_file} успешно архивирован в {zip_file}')