In [None]:
import ast
import re
import time

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored there
# can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the two vocabularies from Google Drive: `target_vocab` from target.csv and
# `to_map` from to_map.csv. The cell ends by displaying `target_vocab`.
target_csv_path = '/content/drive/MyDrive/GeekBrains/target.csv'
to_map_csv_path = '/content/drive/MyDrive/GeekBrains/to_map.csv'

target_vocab = pd.read_csv(target_csv_path)
to_map = pd.read_csv(to_map_csv_path)

target_vocab

Unnamed: 0,concept_id,concept_name
0,36684768,Drusen of bilateral optic discs
1,37110249,Microvascular embolism of arteriole (disorder)
2,4220821,Bronzed diabetes
3,4120412,Level of psychoticism
4,4002835,Bruising of oropharynx
...,...,...
564483,4036092,ROM - Range of motion activity
564484,37309624,Low back pain co-occurrent with left side scia...
564485,4006806,Miscarriage with uraemia
564486,4238036,Urological fistula


In [None]:
# Display the `to_map` frame loaded from to_map.csv.
to_map

Unnamed: 0,concept_code,concept_name
0,10002417,Angiocentric lymphoma stage II
1,10002418,Angiocentric lymphoma stage III
2,10002419,Angiocentric lymphoma stage IV
3,10002441,Angiogram pulmonary abnormal
4,10002442,Angiogram pulmonary normal
...,...,...
396,10007195,Capillary fragility normal
397,10007200,Capillary permeability increased
398,10007201,Capillary permeability normal
399,10007218,Carbohydrate tolerance decreased


In [None]:
# Drop words that are potentially unnecessary for matching concept names.
# The list holds common English function words followed by vocabulary-specific filler entries.
# NOTE(review): some entries ('see comment', '(see comments)', '(disorder)', "don't", ...)
# contain spaces, brackets or apostrophes. Such entries can only match if the text is
# filtered before those characters are stripped, and multi-word entries can never equal a
# single whitespace-split token — confirm against the cleaning step that consumes this list.
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
             'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
             'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
             'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
             'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
             'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
             'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
             'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
             'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
             'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
             'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
             'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
             "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
             "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
             'wouldn', "wouldn't",
              # 'right' and 'left' are deliberately NOT stop words (the entry below is disabled),
              # so laterality is kept in the names.
             #'right','left',
             'noc', 'nos', '[d]', 'unknown_unit', '|', 'see comment', 'due', 'nec', 'unspecified', '[v]', '(see comments)',
             # additional deletions for SNOMED
              '(disorder)', '(procedure)', '(finding)']


In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz (532.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.3/532.3 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.5.0,>=3.4.1 (from en-core-sci-lg==0.5.1)
  Downloading spacy-3.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting wasabi<1.1.0,>=0.9.1 (from spacy<3.5.0,>=3.4.1->en-core-sci-lg==0.5.1)
  Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting typer<0.8.0,>=0.3.0 (from spacy<3.5.0,>=3.4.1->en-core-sci-lg==0.5.1)
  Downloading typer-0.7.0-py3-none-any.whl (38 kB)
Building wheels for collected packages: en-core-sci-l

In [None]:
# Load the English-language spaCy model (the "en_core_sci_lg" package) into `nlp`.
scispacy_model_name = "en_core_sci_lg"
nlp = spacy.load(scispacy_model_name)

  


In [None]:
# Punctuation stripped from concept names. Same character set as the former inline pattern,
# but written as a raw string so that "\-" is a regex escape and not an invalid Python
# string escape.
_PUNCT_RE = re.compile(r"""[,.!;?)%('":\-]""")
_WHITESPACE_RE = re.compile(r"\s+")


def _split_stop_entries(entries):
    """Split stop-list entries into single tokens and a compiled phrase pattern.

    An entry can be filtered as a token only if it is one whitespace-free word that
    punctuation stripping leaves unchanged. Every other entry ('(disorder)',
    'see comment', "don't", ...) can never equal a cleaned, whitespace-split token, so it
    is treated as a phrase that must be removed from the text *before* punctuation is
    stripped.

    Returns:
        (stop_tokens, phrase_re): a set of token entries and a compiled regex matching any
        phrase entry (or None when there are no phrase entries).
    """
    stop_tokens = set()
    phrases = set()
    for entry in entries:
        entry = entry.lower().strip()
        if not entry:
            continue
        if len(entry.split()) == 1 and _PUNCT_RE.sub("", entry) == entry:
            stop_tokens.add(entry)
        else:
            phrases.add(entry)
    if not phrases:
        return stop_tokens, None

    alternatives = []
    # Longest first so that '(see comments)' wins over 'see comment'.
    for phrase in sorted(phrases, key=lambda p: (-len(p), p)):
        # Guard with a word boundary only on sides that start/end with a word character,
        # so "don't" is not matched inside a longer word while '(disorder)' still matches
        # when glued to the preceding word.
        left = r"(?<!\w)" if re.match(r"\w", phrase) else ""
        right = r"(?!\w)" if re.search(r"\w$", phrase) else ""
        alternatives.append(left + re.escape(phrase) + right)
    return stop_tokens, re.compile("|".join(alternatives))


def prepare_name(name, stop_tokens, stop_phrase_re=None):
    """Normalise one concept name for matching.

    Steps: drop non-ASCII characters and collapse whitespace (strings only; any other
    value is converted with str()), lowercase, remove stop phrases, strip punctuation,
    then drop stop tokens.

    Args:
        name: raw concept name (any value; non-strings go through str()).
        stop_tokens: set of lowercase single-word stop entries.
        stop_phrase_re: compiled regex of stop phrases, or None.

    Returns:
        The cleaned name as a single space-separated string.
    """
    if isinstance(name, str):
        text = _WHITESPACE_RE.sub(" ", name.encode("ascii", "ignore").decode())
    else:
        text = str(name)
    text = text.lower()
    if stop_phrase_re is not None:
        # Replace with a space so the neighbouring words are not merged.
        text = stop_phrase_re.sub(" ", text)
    text = _PUNCT_RE.sub("", text)
    return " ".join(word for word in text.split() if word not in stop_tokens)


_stop_tokens, _stop_phrase_re = _split_stop_entries(stop_words)

# Always derive `prepared_name` from the untouched `concept_name`, so re-running this cell
# gives the same result.
for _frame in (to_map, target_vocab):
    _frame['prepared_name'] = _frame['concept_name'].apply(
        prepare_name, args=(_stop_tokens, _stop_phrase_re)
    )



In [None]:
# Show the first three rows of `to_map`, including the new `prepared_name` column.
to_map.head(3)

Unnamed: 0,concept_code,concept_name,prepared_name
0,10002417,Angiocentric lymphoma stage II,angiocentric lymphoma stage ii
1,10002418,Angiocentric lymphoma stage III,angiocentric lymphoma stage iii
2,10002419,Angiocentric lymphoma stage IV,angiocentric lymphoma stage iv


In [None]:
# Show the first five rows of `target_vocab`, including the new `prepared_name` column.
target_vocab.head(5)

Unnamed: 0,concept_id,concept_name,prepared_name
0,36684768,Drusen of bilateral optic discs,drusen bilateral optic discs
1,37110249,Microvascular embolism of arteriole (disorder),microvascular embolism arteriole disorder
2,4220821,Bronzed diabetes,bronzed diabetes
3,4120412,Level of psychoticism,level psychoticism
4,4002835,Bruising of oropharynx,bruising oropharynx


In [None]:
import time
def tokenize_with_spacy(text):
    """Split `text` into spaCy tokens, dropping punctuation and whitespace tokens.

    Only the tokenizer is run (`nlp.make_doc`): `is_punct` and `is_space` are lexical
    attributes, so the rest of the pipeline is not needed to produce this token list and
    skipping it avoids a large amount of work on big vocabularies.

    Args:
        text: the string to tokenize. A non-string value (for example a list produced by
            an earlier run of this cell) is returned unchanged, so re-running is a no-op.

    Returns:
        A list of token strings.
    """
    if not isinstance(text, str):
        return text
    doc = nlp.make_doc(text)
    return [token.text for token in doc if not (token.is_punct or token.is_space)]

start_time = time.time()

# Replace each cleaned name string with its list of tokens.
to_map['prepared_name'] = to_map['prepared_name'].apply(tokenize_with_spacy)
target_vocab['prepared_name'] = target_vocab['prepared_name'].apply(tokenize_with_spacy)

elapsed_time = (time.time() - start_time)/60
print(f"Total elapsed time: {elapsed_time:.1f} minutes")

# Пример решения NLP задачи из реальной практики
---
Ранее мы использовали инструменты и подходы NLP для решения задач нашего локального исследования.
> В домашнем задании №3 вы столкнулись с проблемой обозначения побочных эффектов в выбранной онтологической системе.

Это распространённая задача: когда исходные данные представлены в нестандартной (неудобной в нашем контексте) системе кодирования, а мы хотим привести их к стандартной, необходима процедура маппинга (от английского mapping), то есть перевода из одной системы кодирования в другую.


In [None]:
def lemmatize_with_spacy(text):
    """Run `text` through the `nlp` pipeline and return the lemma of every token as a list."""
    return [tok.lemma_ for tok in nlp(text)]


def _lemmatize_token_list(tokens):
    """Join a token list with spaces and lemmatize the resulting string."""
    return lemmatize_with_spacy(' '.join(tokens))


# NOTE(review): `voc1_df` / `voc2_df` are not created in any cell visible above this one —
# confirm where they are loaded before relying on Restart & Run All.
start_time = time.time()

# Apply the lemmatizer to the processed-name column (voc2_df is left disabled, as before).
voc1_df['concept_name_processed'] = voc1_df['concept_name_processed'].apply(_lemmatize_token_list)
#voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].apply(lambda x: lemmatize_with_spacy(' '.join(x)))

elapsed_seconds = time.time() - start_time
elapsed_time = elapsed_seconds/60
print(f"Total elapsed time: {elapsed_time:.1f} minutes")

Total elapsed time: 0.0 minutes


In [None]:
# Show the first rows of `voc1_df` to inspect the lemmatized `concept_name_processed` column.
voc1_df.head()

Unnamed: 0,hierarchy,concept_id,concept_code,concept_name,domain_id,concept_class_id,concept_name_processed
0,Investigations(SOC)-Endocrine investigations (...,36312355,10000005,17 ketosteroids urine,Measurement,PT,"[17, ketosteroid, urine]"
1,"Congenital, familial and genetic disorders(SOC...",35305837,10000013,"17,20-desmolase deficiency",Condition,PT,"[1720desmolase, deficiency]"
2,"Congenital, familial and genetic disorders(SOC...",35305840,10000021,21-hydroxylase deficiency,Condition,PT,"[21hydroxylase, deficiency]"
3,"Congenital, familial and genetic disorders(SOC...",35305665,10000101,Abdominal wall anomaly,Condition,PT,"[abdominal, wall, anomaly]"
4,Surgical and medical procedures(SOC)-Nervous s...,37521451,10000111,Abducent nerve operation,Procedure,PT,"[abducent, nerve, operation]"


In [None]:
# Show the first rows of `voc2_df`.
voc2_df.head()

Unnamed: 0,concept_id,concept_name,concept_name_processed
0,36684768,Drusen of bilateral optic discs,"['drusen', 'bilateral', 'optic', 'discs']"
1,37110249,Microvascular embolism of arteriole (disorder),"['microvascular', 'embolism', 'arteriole', 'di..."
2,4220821,Bronzed diabetes,"['bronzed', 'diabete']"
3,4120412,Level of psychoticism,"['level', 'psychoticism']"
4,4002835,Bruising of oropharynx,"['bruise', 'oropharynx']"


In [None]:
def _parse_token_list(value):
    """Turn the text form of a token list (e.g. "['a', 'b']") back into a real list.

    Args:
        value: a cell of `concept_name_processed`. Strings are parsed; anything else
            (an already parsed list, a missing value) is returned unchanged so the cell
            can be re-run safely.

    Returns:
        A list of tokens for string input, otherwise `value` itself.
    """
    if not isinstance(value, str):
        return value
    try:
        # literal_eval understands Python list syntax, so tokens containing quotes or
        # commas survive and "[]" becomes an empty list.
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not valid Python literal syntax: fall back to the simple bracket/quote stripping.
        return value.strip("[]").replace("'", "").split(', ')
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [str(parsed)]


# Run this when the preprocessed file was loaded from CSV, where the lists are stored as text.
voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].apply(_parse_token_list)

In [None]:
# Save previously prepared vocabulary for future use
# voc2_df.to_csv('/content/drive/MyDrive/SNOMED_OMOP_vocabulary_preprocessed.csv', index=False)

In [None]:
# Display `voc2_df` to check that `concept_name_processed` now holds lists.
voc2_df

Unnamed: 0,concept_id,concept_name,concept_name_processed
0,36684768,Drusen of bilateral optic discs,"[drusen, bilateral, optic, discs]"
1,37110249,Microvascular embolism of arteriole (disorder),"[microvascular, embolism, arteriole, disorder]"
2,4220821,Bronzed diabetes,"[bronzed, diabete]"
3,4120412,Level of psychoticism,"[level, psychoticism]"
4,4002835,Bruising of oropharynx,"[bruise, oropharynx]"
...,...,...,...
564483,4036092,ROM - Range of motion activity,"[rom, range, motion, activity]"
564484,37309624,Low back pain co-occurrent with left side scia...,"[low, back, pain, cooccurrent, leave, side, sc..."
564485,4006806,Miscarriage with uraemia,"[miscarriage, uraemia]"
564486,4238036,Urological fistula,"[urological, fistula]"


In [None]:
# GloVe-based matching: for every voc1 concept, find the `num_top_matches` voc2 concepts whose
# averaged word vectors have the highest cosine similarity.
#
# All voc2 names are embedded once into a single L2-normalised matrix, and voc1 is scored in
# batches with one matrix product per batch, instead of re-embedding every voc2 name and
# calling cosine_similarity once per (voc1, voc2) pair.


import time
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the pre-trained GloVe model
glove_model_path = 'glove.6B.300d.txt'  # path to the GloVe text file
embedding_dim = 300  # number of components per vector in that file
glove_vectors = {}  # word -> float32 vector

with open(glove_model_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split from the right so exactly `embedding_dim` numbers are taken, even if the
        # token itself contains a space.
        parts = line.rstrip().rsplit(' ', embedding_dim)
        if len(parts) != embedding_dim + 1:
            continue  # blank or malformed line
        # float32 halves the memory footprint compared with the default float64.
        glove_vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)


def vectorize_text(text):
    """Return the mean GloVe vector of the whitespace-separated tokens in `text`.

    Tokens missing from `glove_vectors` contribute a zero vector to the mean. Text with no
    tokens yields a zero vector.
    """
    tokens = text.split()
    zero_vector = np.zeros(embedding_dim, dtype=np.float32)
    if not tokens:
        return zero_vector
    return np.mean([glove_vectors.get(token, zero_vector) for token in tokens], axis=0)


def embed_token_lists(token_lists):
    """Embed token lists into an (n, embedding_dim) matrix of unit-length rows.

    Rows whose vector is all zeros (no token found in GloVe) stay all zeros, so their
    cosine similarity with anything is 0.
    """
    matrix = np.zeros((len(token_lists), embedding_dim), dtype=np.float32)
    for row, tokens in enumerate(token_lists):
        matrix[row] = vectorize_text(' '.join(tokens))
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    return matrix / norms


def top_k_indices(similarity_matrix, k):
    """Return, for each row, the column positions of its k largest values, best first."""
    k = min(k, similarity_matrix.shape[1])
    if k <= 0:
        return np.empty((similarity_matrix.shape[0], 0), dtype=int)
    negated = -similarity_matrix
    # argpartition finds the k best columns without sorting the whole row ...
    candidates = np.argpartition(negated, k - 1, axis=1)[:, :k]
    # ... and only those k candidates are then put in descending-similarity order.
    order = np.argsort(np.take_along_axis(negated, candidates, axis=1), axis=1, kind='stable')
    return np.take_along_axis(candidates, order, axis=1)


start_time = time.time()


results = []

num_top_matches = 5  # number of best matches kept per voc1 concept
batch_size = 64  # number of voc1 concepts scored per matrix product

# Embed the whole target vocabulary once.
voc2_matrix = embed_token_lists(voc2_df['concept_name_processed'].tolist())
voc2_concept_ids = voc2_df['concept_id'].to_numpy()
voc2_names = voc2_df['concept_name'].to_numpy()

voc1_names = voc1_df['concept_name'].tolist()
voc1_token_lists = voc1_df['concept_name_processed'].tolist()

for batch_start in range(0, len(voc1_token_lists), batch_size):
    if (batch_start // batch_size) % 10 == 0:
        elapsed_time = (time.time() - start_time) / 60
        print(f"Processed {batch_start} concepts in {elapsed_time:.1f} minutes")

    batch_token_lists = voc1_token_lists[batch_start:batch_start + batch_size]
    batch_matrix = embed_token_lists(batch_token_lists)

    # Rows are unit length, so the dot product is the cosine similarity.
    similarities = batch_matrix @ voc2_matrix.T

    for offset, top_positions in enumerate(top_k_indices(similarities, num_top_matches)):
        voc1_position = batch_start + offset
        for max_index in top_positions:
            # `max_index` is a row position in voc2_df, so look values up positionally.
            results.append((
                voc1_names[voc1_position],
                voc1_token_lists[voc1_position],
                voc2_concept_ids[max_index],
                voc2_names[max_index],
                max_index,
            ))

# Convert the results to a DataFrame (one row per voc1 concept / match pair)
columns = ['voc1_name', 'voc1_tokens', 'voc2_concept_id', 'voc2_name', 'max_index']
results_df = pd.DataFrame(results, columns=columns)

elapsed_time = (time.time() - start_time) / 60
print(f"Total elapsed time: {elapsed_time:.1f} minutes")


Processed 0 concepts in 0.0 minutes
Processed 10 concepts in 21.9 minutes
Processed 20 concepts in 43.8 minutes
Processed 30 concepts in 65.6 minutes
Processed 40 concepts in 87.3 minutes
Processed 50 concepts in 109.1 minutes
Processed 60 concepts in 130.8 minutes
Processed 70 concepts in 152.5 minutes
Processed 80 concepts in 174.2 minutes
Processed 90 concepts in 195.9 minutes
Processed 100 concepts in 217.4 minutes
Processed 110 concepts in 238.9 minutes
Processed 120 concepts in 260.5 minutes
Processed 130 concepts in 282.0 minutes
Processed 140 concepts in 303.5 minutes
Processed 150 concepts in 324.9 minutes
Processed 160 concepts in 346.4 minutes
Processed 170 concepts in 367.9 minutes
Processed 180 concepts in 389.2 minutes
Processed 190 concepts in 410.7 minutes
Processed 200 concepts in 432.1 minutes
Processed 210 concepts in 453.7 minutes
Processed 220 concepts in 475.2 minutes
Processed 230 concepts in 496.6 minutes
Processed 240 concepts in 518.1 minutes
Processed 250 con

In [None]:
# Display the match table built in the previous cell.
results_df

In [None]:
# Sort the matches (descending by voc1 name, then by max_index), write them to CSV and
# preview the first 100 rows.
# NOTE(review): sorting by max_index does not keep the matches of one concept in
# best-match-first order — confirm this ordering is intended.
sort_keys = ['voc1_name', 'max_index']
output_csv_path = 'meddra_pt_mapping_results031023.csv'

df_sorted = results_df.sort_values(by=sort_keys, ascending=False)
df_sorted.to_csv(output_csv_path, index=False)

df_sorted.head(100)