In [3]:
import os
import re

import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import cosine
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from google.colab import drive
# Mount Google Drive into this runtime; the data directory below lives on it.
drive.mount("/content/drive")

# Register the tqdm integration that provides `progress_apply` on pandas objects.
tqdm.pandas()

# Directory on the mounted drive that holds the input tables and receives the outputs.
DATA_DIR = "/content/drive/MyDrive/итмо/"

# Phenotype text columns that are compared against the English labels.
columns_pheno = [
    "fg_name", "category_fg", "category_meta",
    "description_ukb", "description_more_ukb",
]

# Subset of the columns above whose similarity run is done part by part.
# The order of this list is the processing order.
pheno_to_split = ["description_more_ukb", "description_ukb"]

Mounted at /content/drive


In [4]:
# Read the tab-separated phenotype table, report its size and preview seven random rows.
merged_path = os.path.join(DATA_DIR, 'merged_bio.tsv')
merged = pd.read_csv(merged_path, sep='\t')
print(merged.shape)
merged.sample(7)

(5623, 7)


Unnamed: 0,ukb_code,fg_code,fg_name,category_fg,category_meta,description_ukb,description_more_ukb
1848,Z50,,,,,Z50 Care involving use of rehabilitation proce...,truncated: true
4013,N14_ENDOMET_INFERT,N14_ENDOMET_INFERT,Endometriosis diagnosis and infertility diagno...,XIV Diseases of the genitourinary system (N14_),XIV Diseases of the genitourinary system (N14_),,
2592,614.4,,,,,"Inflammatory diseases of uterus, except cervix",
2426,524.3,,,,,Anomalies of tooth position/malocclusion,
713,1498,,,,,Coffee intake,"ACE touchscreen question ""How many cups of cof..."
178,20471,,,,,Ever seen an un-real vision,"Question asked: ""Did you ever see something th..."
2390,507,,,,,Pleurisy; pleural effusion,


In [5]:
# Read the tab-separated code table (the `en` column is used below as the
# comparison labels), report its size and preview seven random rows.
codes_path = os.path.join(DATA_DIR, 'codes.tsv')
ru_en_names = pd.read_csv(codes_path, sep='\t')
print(ru_en_names.shape)
ru_en_names.sample(7)

(29, 3)


Unnamed: 0,codes,en,ru
21,collection_monogenic,Monogenic diseases,Моногенные заболевания
11,type_цк,Whole blood,Цельная кровь
19,type_гд,Gonadal biopsy specimens,Биоптаты гонад
1,type_инг,Whole blood in RNA stabilizer,Цельная кровь в консерванте для для стабилизац...
20,type_лп,Leukocyte film,Лейкоцитарная плёнка
27,collection_repro_dis,Reproductive system disease,Нарушение репродуктивной системы
15,type_эяк,Semen,Эякулят


In [6]:
# Pair every phenotype row with every English label: give each row its own copy
# of the full label list, then explode that list into one row per label.
# Exploded rows keep the index value of the phenotype row they came from.
en_labels = ru_en_names['en'].tolist()
merged['en'] = [list(en_labels) for _ in range(len(merged))]
merged_compare = merged.explode('en')
merged_compare

Unnamed: 0,ukb_code,fg_code,fg_name,category_fg,category_meta,description_ukb,description_more_ukb,en
0,30600,,,,,Albumin,,DNA
0,30600,,,,,Albumin,,Whole blood in RNA stabilizer
0,30600,,,,,Albumin,,Urine
0,30600,,,,,Albumin,,Plasma
0,30600,,,,,Albumin,,Plasma (fetal fraction)
...,...,...,...,...,...,...,...,...
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Population control
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Spontaneous abortion
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Infertility
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Reproductive system disease


## Delete stopwords

In [7]:
stop_words = ['Diseases of', 'Disorders of', 'Diseases', 'Disorders', 'Other', 'Question',]
stop_words = [i.lower() for i in stop_words] + stop_words

# Single alternation over all stop words (capitalised and lower-case spellings).
# Longest entries come first so a phrase such as "Diseases of" is preferred over
# its prefix "Diseases". The \b anchors restrict matches to whole words, so
# "Mother", "another" or "Questionnaire" are left intact.
_STOP_WORDS_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(word) for word in sorted(stop_words, key=len, reverse=True))
    + r")\b"
)


def delete_stopwords(string):
    """Remove the generic filler words listed in ``stop_words`` from a text.

    Only whole-word occurrences are removed; surrounding whitespace is left
    as it is.

    Args:
        string: Text to clean. Non-string values (for example the NaN that
            pandas uses for missing cells, or None) are accepted.

    Returns:
        The text without stop words, or the input object itself when it is
        not a string.
    """
    if not isinstance(string, str):
        # Missing cells are passed through untouched.
        return string
    return _STOP_WORDS_RE.sub("", string)

# Clean every phenotype text column and the English label column in place.
for text_column in (*columns_pheno, 'en'):
    cleaned_values = merged_compare[text_column].apply(delete_stopwords)
    merged_compare[text_column] = cleaned_values
merged_compare

Unnamed: 0,ukb_code,fg_code,fg_name,category_fg,category_meta,description_ukb,description_more_ukb,en
0,30600,,,,,Albumin,,DNA
0,30600,,,,,Albumin,,Whole blood in RNA stabilizer
0,30600,,,,,Albumin,,Urine
0,30600,,,,,Albumin,,Plasma
0,30600,,,,,Albumin,,Plasma (fetal fraction)
...,...,...,...,...,...,...,...,...
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Population control
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Spontaneous abortion
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Infertility
5622,,Z21_TOBAC_USE,Tobacco use,XXI Factors influencing health status and cont...,,,,Reproductive system disease


In [8]:
# Load the pretrained BioBERT checkpoint together with its matching tokenizer.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [9]:
def get_biobert_embeddings(text):
    """Embed ``text`` with BioBERT by averaging the last hidden layer over tokens.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model``.

    Args:
        text: Text to embed.

    Returns:
        numpy array holding the token-averaged last hidden state of the first
        sequence in the batch.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: disabling autograd avoids building a graph that is thrown
    # away on every call. The resulting values are the same.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over dim 1 (assumed to be the token axis — TODO confirm), then take
    # the first batch entry.
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()[0]

def return_similarity_bert(text1, text2):
    """Return the cosine similarity between the BioBERT embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        One minus the cosine distance between the two embedding vectors.
    """
    vector_a, vector_b = (get_biobert_embeddings(text) for text in (text1, text2))
    distance = cosine(vector_a, vector_b)
    return 1 - distance


In [10]:
def return_similarity(text1, text2):
    """Return the BioBERT similarity of two texts, or 0 if either one is missing.

    Args:
        text1: First text, possibly null (None / NaN).
        text2: Second text, possibly null (None / NaN).

    Returns:
        0 when at least one argument is null, otherwise the value computed by
        ``return_similarity_bert``.
    """
    has_missing = any(pd.isnull(text) for text in (text1, text2))
    return 0 if has_missing else return_similarity_bert(text1, text2)

## Process the UKB description columns (in three parts)

In [11]:
from copy import deepcopy

dataset = merged_compare
n = len(dataset)

# Row positions 0, n/3, 2n/3 and n delimit three contiguous slices; each slice is
# copied so that work on one part never touches `merged_compare`.
cut_points = [0, n // 3, 2 * n // 3, n]
parts = [
    deepcopy(dataset[start:stop])
    for start, stop in zip(cut_points[:-1], cut_points[1:])
]

In [15]:
# Score each split column against the English labels, one CSV per (column, part).
#
# The run is resumable: a part whose output file already exists is skipped, so a
# fresh "Run All" computes exactly the parts that are still missing. Delete a CSV
# to force that part to be recomputed.
# `i_starting` stays available as a manual override: parts with a smaller index
# are skipped unconditionally.
i_starting = 0
for column in pheno_to_split:
    for i, part in enumerate(parts):
        if i < i_starting:
            continue
        output_path = os.path.join(DATA_DIR, f'similarity_{column}_{i}.csv')
        if os.path.exists(output_path):
            # Result of an earlier run; do not spend the time again.
            continue
        # Work on a copy so the similarity column of one run does not end up in
        # the file written for the next column.
        cur_part = part.copy()
        cur_part[f'similarity_{column}'] = cur_part.progress_apply(
            lambda row: return_similarity(row[column], row["en"]),
            axis=1,
        )
        cur_part.to_csv(output_path)

100%|██████████| 54356/54356 [00:02<00:00, 26430.70it/s]
100%|██████████| 54356/54356 [23:07<00:00, 39.19it/s]   


## Process all other columns

In [None]:
# Score the remaining phenotype columns against the English labels and write one
# CSV per column.
i_starting = 0
for i, column in enumerate(columns_pheno):
    # Skip columns before the chosen start index and the ones handled part by part.
    if i < i_starting or column in pheno_to_split:
        continue
    cur_merged_compare = deepcopy(merged_compare)
    similarity_column = f'similarity_{column}'
    cur_merged_compare[similarity_column] = cur_merged_compare.progress_apply(
        lambda row: return_similarity(row[column], row["en"]),
        axis=1,
    )
    output_path = os.path.join(DATA_DIR, f'similarity_{column}.csv')
    cur_merged_compare.to_csv(output_path)