In [1]:
import os
import warnings
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import AutoModel, AutoTokenizer, TextClassificationPipeline
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
from scipy.spatial.distance import cosine

warnings.filterwarnings('ignore')

# Set the TOKENIZERS_PARALLELISM environment variable
os.environ["TOKENIZERS_PARALLELISM"] = "false"

RANDOM_STATE = 42

In [2]:
df = pd.read_csv('data.csv', index_col=0)

In [3]:
df

Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0
...,...,...,...,...
3043,11,15 million sperm can be a father?,My sperm count is 15 millions and is there a c...,1
3044,11,26 yo m with history of progressive venous ins...,"Hello doctor, can you please tell me some of t...",0
3045,11,26 yo m with history of progressive venous ins...,"Hello doctor, I am 26 year old male wth progre...",1
3046,11,32 weeks pregnant. Headache strange leg weakne...,I am 32 weeks pregnant and had severe headache...,0


In [4]:
df_q = pd.concat([df['question_1'], df['question_2']], axis=0)

In [5]:
df_q = df_q.reset_index(drop=True)

In [6]:
df_q = df_q.drop_duplicates().reset_index(drop=True)

In [7]:
df_q = pd.DataFrame(df_q, columns=['text'])

In [8]:
tokenizer = AutoTokenizer.from_pretrained('sileod/deberta-v3-large-tasksource-nli')
model = AutoModel.from_pretrained('sileod/deberta-v3-large-tasksource-nli')

Downloading tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/18.8k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Some weights of the model checkpoint at sileod/deberta-v3-large-tasksource-nli were not used when initializing DebertaV2Model: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
%%time
tqdm.pandas()
tokenized = df_q['text'].progress_apply((lambda x: tokenizer.encode(x, add_special_tokens=True,
                                                                  truncation=True)))

  0%|                                                                                                                                                                                                                                                  | 0/4567 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4567/4567 [00:00<00:00, 17325.16it/s]

CPU times: user 254 ms, sys: 14.8 ms, total: 269 ms
Wall time: 268 ms





In [10]:
tokenized

0       [1, 643, 361, 386, 1476, 292, 3730, 299, 16072...
1       [1, 4645, 273, 360, 1272, 287, 29569, 260, 108...
2       [1, 87217, 14024, 341, 269, 278, 1231, 646, 26...
3       [1, 620, 266, 2278, 280, 268, 836, 261, 273, 1...
4       [1, 14200, 277, 11739, 453, 456, 39497, 268, 5...
                              ...                        
4562    [1, 573, 16973, 2795, 269, 706, 3543, 263, 269...
4563    [1, 5365, 2278, 261, 295, 274, 811, 848, 351, ...
4564    [1, 5365, 2278, 261, 273, 481, 1760, 395, 597,...
4565    [1, 273, 481, 2686, 1033, 4870, 263, 330, 3567...
4566    [1, 273, 286, 11141, 261, 3540, 8960, 263, 268...
Name: text, Length: 4567, dtype: object

In [11]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [12]:
padded, padded.shape

(array([[    1,   643,   361, ...,     0,     0,     0],
        [    1,  4645,   273, ...,     0,     0,     0],
        [    1, 87217, 14024, ...,     0,     0,     0],
        ...,
        [    1,  5365,  2278, ...,     0,     0,     0],
        [    1,   273,   481, ...,     0,     0,     0],
        [    1,   273,   286, ...,     0,     0,     0]]),
 (4567, 116))

In [13]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(4567, 116)

In [14]:
torch.backends.mps.is_available()

True

In [15]:
torch.backends.mps.is_built()

True

In [16]:
device = torch.device("mps")

In [17]:
batch_size = 100
embeddings = []
for i in tqdm(range(padded.shape[0] // batch_size + 1)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]).to(device)
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)]).to(device)
        
        with torch.no_grad():
            model.to(device)
            batch_embeddings = model(batch, 
                                     attention_mask=attention_mask_batch
                                    )
        
        embeddings.append(batch_embeddings[0][:,0,:].cpu().numpy())
        del batch
        del attention_mask_batch
        del batch_embeddings

  0%|                                                                                                                                                                                                                                                    | 0/46 [00:00<?, ?it/s]loc("cast"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/75428952-3aa4-11ee-8b65-46d450270006/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":745:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<100x1x116x116xi1>'
 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 45/46 [07:24<00:09,  9.81s/it]loc("cast"("(mpsFileLoc): /AppleInte

In [18]:
features = np.concatenate(embeddings)

In [19]:
df_enc = pd.DataFrame(features, index=df_q.index)

In [20]:
def recommendation_score(n: int) -> float:
    c = 0
    for i in tqdm(range(n)):
        # вытаксиваем случайное предложение, у которого точно есть пара для проверки
        rand_index = random.choice(df[df['label'] == 1].index.tolist())
        sentence = df.loc[rand_index, 'question_1']
        
        #находим индекс выбранного предложения в общем пуле предлоежний и считаем косинусные расстояния до оставшихся
        index_enc = df_q[df_q['text'] == sentence].index
        df_enc['cos_dist'] = df_enc.apply(lambda x: cosine(x, df_enc.loc[index_enc, :]), axis=1)
    
        #определеяем ближайших соседей не считая самого предложения
        index_similar = df_enc.sort_values('cos_dist').head(6).index.tolist()
        check_sentence = df[(df['question_1'] == sentence)&(df['label']==1)]['question_2'].values[0]
        if check_sentence in df_q.loc[index_similar[1:], 'text'].values:
            c += 1
        else:
            continue
    return c / n

In [21]:
print(f'Accuracy: {recommendation_score(100):0.0%}')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:01<00:00,  1.62it/s]

Accuracy: 11%



