In [4]:
import re
import time
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
dd = pd.read_csv('to_find', sep = '|')

In [40]:
dd2 = pd.read_csv('CUIs_CONCEPTs.tsv', sep = '\t')

In [41]:
dd2.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009
2,C0002170,"Alopecia,Loss of hair",278040002
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002
4,C0027497,"Nausea,Nausea",422587007


In [42]:
# Загрузка модели для английского языка
nlp = spacy.load("en_core_sci_sm")

In [43]:
# Read the CSV files into dataframes
voc2_df = dd2
voc1_df = dd

In [44]:
voc1_df

Unnamed: 0,id,Text
0,1,"extreme weight gain, short-term memory loss, h..."
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .
2,3,Just TWO tablets of Lexapro 10mg completely de...
3,4,Its called PSSD: post-SSRI sexual dysfunction....
4,6,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su..."
5,7,"I was unable to sleep, had blurred vision, and..."
6,8,Unable to eat anything significant for the 3 d...
7,9,While driving to a friends house crazy thought...
8,10,Would not have been able to work (software dev...
9,11,"First 10 days were HORRIBLE, like a looong pan..."


In [45]:
voc2_df


Unnamed: 0,CUI,CONCEPT,SNOMED_CODE
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009
2,C0002170,"Alopecia,Loss of hair",278040002
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002
4,C0027497,"Nausea,Nausea",422587007
...,...,...,...
539,C0010200,"Coughing,Cough",49727002
540,C0019112,"Hemorrhoids,Hemorrhoids",70153002
541,C0343495,"Lockjaw,Tetanus with trismus",240432006
542,C0271650,"Impaired glucose tolerance,Impaired glucose to...",9414007


In [46]:
# # words to remove from strings
# stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
#              'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
#              'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
#              'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
#              'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
#              'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
#              'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
#              'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
#              'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
#              'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
#              'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
#              'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
#              "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
#              "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
#              'wouldn', "wouldn't",
#               # don't care  about right and left most of the time
#              #'right','left']

In [47]:
stop_words = ['noc', 'nos', '[d]', 'unknown_unit', '|', 'see comment', 'due', 'nec', 'unspecified', '[v]', '(see comments)',
              '(disorder)', '(procedure)', '(finding)']
# ignores weird symbols
voc1_df['concept_name_processed'] = voc1_df['Text'].apply(lambda x: re.sub(r"\s+", " ", str(x).encode('ascii', 'ignore').decode()) if isinstance(x, str) else str(x))
voc2_df['concept_name_processed'] = voc2_df['CONCEPT'].apply(lambda x: re.sub(r"\s+", " ", str(x.encode('ascii', 'ignore').decode())))

In [48]:
voc1_df

Unnamed: 0,id,Text,concept_name_processed
0,1,"extreme weight gain, short-term memory loss, h...","extreme weight gain, short-term memory loss, h..."
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .
2,3,Just TWO tablets of Lexapro 10mg completely de...,Just TWO tablets of Lexapro 10mg completely de...
3,4,Its called PSSD: post-SSRI sexual dysfunction....,Its called PSSD: post-SSRI sexual dysfunction....
4,6,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...","Nausea, Blurred Vision, 3 to 5 hours sleep, Su..."
5,7,"I was unable to sleep, had blurred vision, and...","I was unable to sleep, had blurred vision, and..."
6,8,Unable to eat anything significant for the 3 d...,Unable to eat anything significant for the 3 d...
7,9,While driving to a friends house crazy thought...,While driving to a friends house crazy thought...
8,10,Would not have been able to work (software dev...,Would not have been able to work (software dev...
9,11,"First 10 days were HORRIBLE, like a looong pan...","First 10 days were HORRIBLE, like a looong pan..."


In [49]:
# Remove punctuation, digits etc
voc1_df['concept_name_processed'] = voc1_df['concept_name_processed'].map(lambda x: re.sub('[,\.!;?)%(\'\":\-]', '', x))
voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].map(lambda x: re.sub('[,\.!;?)%(\'\":\-]', '', x))

In [50]:
voc1_df

Unnamed: 0,id,Text,concept_name_processed
0,1,"extreme weight gain, short-term memory loss, h...",extreme weight gain shortterm memory loss hair...
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,COMPLETELY DESTROYED SEXUALLY FUNCTIONING
2,3,Just TWO tablets of Lexapro 10mg completely de...,Just TWO tablets of Lexapro 10mg completely de...
3,4,Its called PSSD: post-SSRI sexual dysfunction....,Its called PSSD postSSRI sexual dysfunction 5 ...
4,6,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...",Nausea Blurred Vision 3 to 5 hours sleep Suici...
5,7,"I was unable to sleep, had blurred vision, and...",I was unable to sleep had blurred vision and f...
6,8,Unable to eat anything significant for the 3 d...,Unable to eat anything significant for the 3 days
7,9,While driving to a friends house crazy thought...,While driving to a friends house crazy thought...
8,10,Would not have been able to work (software dev...,Would not have been able to work software deve...
9,11,"First 10 days were HORRIBLE, like a looong pan...",First 10 days were HORRIBLE like a looong pani...


In [51]:
# Convert the titles to lowercase
voc1_df['concept_name_processed'] = voc1_df['concept_name_processed'].map(lambda x: x.lower())
voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].map(lambda x: x.lower())

In [52]:
voc1_df

Unnamed: 0,id,Text,concept_name_processed
0,1,"extreme weight gain, short-term memory loss, h...",extreme weight gain shortterm memory loss hair...
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,completely destroyed sexually functioning
2,3,Just TWO tablets of Lexapro 10mg completely de...,just two tablets of lexapro 10mg completely de...
3,4,Its called PSSD: post-SSRI sexual dysfunction....,its called pssd postssri sexual dysfunction 5 ...
4,6,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...",nausea blurred vision 3 to 5 hours sleep suici...
5,7,"I was unable to sleep, had blurred vision, and...",i was unable to sleep had blurred vision and f...
6,8,Unable to eat anything significant for the 3 d...,unable to eat anything significant for the 3 days
7,9,While driving to a friends house crazy thought...,while driving to a friends house crazy thought...
8,10,Would not have been able to work (software dev...,would not have been able to work software deve...
9,11,"First 10 days were HORRIBLE, like a looong pan...",first 10 days were horrible like a looong pani...


In [53]:
# Remove stopwords
voc1_df['concept_name_processed'] = voc1_df['concept_name_processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [59]:
def tokenize_with_spacy(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
    return tokens

In [60]:
start_time = time.time()

voc1_df['concept_name_processed'] = voc1_df['concept_name_processed'].apply(tokenize_with_spacy)
voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].apply(tokenize_with_spacy)

elapsed_time = (time.time() - start_time)/60
print(f"Total elapsed time for tokenization: {elapsed_time:.1f} minutes")

Total elapsed time for tokenization: 0.1 minutes


In [61]:
voc1_df

Unnamed: 0,id,Text,concept_name_processed
0,1,"extreme weight gain, short-term memory loss, h...","[extreme, weight, gain, shortterm, memory, los..."
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,"[completely, destroyed, sexually, functioning]"
2,3,Just TWO tablets of Lexapro 10mg completely de...,"[just, two, tablets, of, lexapro, 10, mg, comp..."
3,4,Its called PSSD: post-SSRI sexual dysfunction....,"[its, called, pssd, postssri, sexual, dysfunct..."
4,6,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...","[nausea, blurred, vision, 3, to, 5, hours, sle..."
5,7,"I was unable to sleep, had blurred vision, and...","[i, was, unable, to, sleep, had, blurred, visi..."
6,8,Unable to eat anything significant for the 3 d...,"[unable, to, eat, anything, significant, for, ..."
7,9,While driving to a friends house crazy thought...,"[while, driving, to, a, friends, house, crazy,..."
8,10,Would not have been able to work (software dev...,"[would, not, have, been, able, to, work, softw..."
9,11,"First 10 days were HORRIBLE, like a looong pan...","[first, 10, days, were, horrible, like, a, loo..."


In [62]:
def lemmatize_with_spacy(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    return lemmatized_tokens

In [63]:
start_time = time.time()

# Применение функции к вашему столбцу
voc1_df['concept_name_processed'] = voc1_df['concept_name_processed'].apply(lambda x: lemmatize_with_spacy(' '.join(x)))
voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].apply(lambda x: lemmatize_with_spacy(' '.join(x)))

elapsed_time = (time.time() - start_time)/60
print(f"Total elapsed time for lemmatization: {elapsed_time:.1f} minutes")




Total elapsed time for lemmatization: 0.0 minutes


In [64]:
voc1_df.head()

Unnamed: 0,id,Text,concept_name_processed
0,1,"extreme weight gain, short-term memory loss, h...","[extreme, weight, gain, shortterm, memory, los..."
1,2,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,"[completely, destroy, sexually, function]"
2,3,Just TWO tablets of Lexapro 10mg completely de...,"[just, two, tablet, of, lexapro, 10, mg, compl..."
3,4,Its called PSSD: post-SSRI sexual dysfunction....,"[its, call, pssd, postssri, sexual, dysfunctio..."
4,6,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...","[nausea, blur, vision, 3, to, 5, hour, sleep, ..."


In [65]:
voc2_df.head()

Unnamed: 0,CUI,CONCEPT,SNOMED_CODE,concept_name_processed
0,C0000765,"Excessive body weight gain,Excessive weight gain",224994002,"[excessive, body, weight, gainexcessive, weigh..."
1,C0701811,"Poor short-term memory,Poor short-term memory",247592009,"[poor, shortterm, memorypoor, shortterm, memory]"
2,C0002170,"Alopecia,Loss of hair",278040002,"[alopecialoss, of, hair]"
3,C0549622,"Sexual Dysfunction,Sexual disorder",231532002,"[sexual, dysfunctionsexual, disorder]"
4,C0027497,"Nausea,Nausea",422587007,[nauseanausea]


# BioWordVec

In [67]:
voc1_df['concept_name_processed'] = voc1_df['concept_name_processed'].apply(lambda x: sorted(x))
# voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].apply(lambda x: x.strip("[]").replace("'", "").split(', '))
voc2_df['concept_name_processed'] = voc2_df['concept_name_processed'].apply(lambda x: sorted(x))

In [68]:
import time
import numpy as np
import gensim
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained word embeddings model
model_path = 'BioWordVec_PubMed_MIMICIII_d200.vec.bin'
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


In [69]:
# Функция для векторизации текста с использованием BioWordVec
def vectorize_text(text, model):
    tokens = text.split()
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)  # Усреднение векторов
    else:
        return np.zeros(model.vector_size)  # Возвращает нулевой вектор, если нет известных слов

In [70]:
start_time = time.time()
# Применение функции к медицинским терминам в обоих наборах данных
voc1_df['voc1_vector'] = voc1_df['concept_name_processed'].apply(lambda x: vectorize_text(' '.join(x), word2vec_model))
voc2_df['voc2_vector'] = voc2_df['concept_name_processed'].apply(lambda x: vectorize_text(' '.join(x), word2vec_model))

# Удаление строк с отсутствующими векторами
voc1_df = voc1_df.dropna(subset=['voc1_vector'])
voc2_df = voc2_df.dropna(subset=['voc2_vector'])


In [71]:
start_time = time.time()

num_top_matches = 10

v1 = np.vstack(voc1_df['voc1_vector'])
v2 = np.vstack(voc2_df['voc2_vector'])


In [76]:
voc2_df.columns

Index(['CUI', 'CONCEPT', 'SNOMED_CODE', 'concept_name_processed',
       'voc2_vector'],
      dtype='object')

In [77]:
similarity = cosine_similarity(v1, v2)
tops = (-similarity).argsort()[:, :num_top_matches]

results_df = pd.concat(     # concat (axis = 0) the list of data frames
    [pd.concat( 
        [voc1_df.loc[[i], ['Text', 'id']].reset_index(drop=True),  # for each target concept from voc_1
         voc2_df.loc[[t], ['CUI', 'CONCEPT']].reset_index(drop=True),        # add potential mapping match from voc_2
         pd.Series(similarity[i][t], name='Similarity')], axis= 1    
    )                       
     for i in range(0, len(voc1_df)) for t in tops[i]]                                   # `i` - target vocabulary index, `t` - index in top similarity array
).reset_index(drop=True)

elapsed_time = (time.time() - start_time) / 60
print(f"Total elapsed time: {elapsed_time:.1f} minutes")

Total elapsed time: 2.1 minutes


In [79]:
results_df.head(20)

Unnamed: 0,Text,id,CUI,CONCEPT,Similarity
0,"extreme weight gain, short-term memory loss, h...",1,C1262477,"Weight decreased,Weight loss",0.878992
1,"extreme weight gain, short-term memory loss, h...",1,C0586746,"Excessive weight loss,Excessive weight loss",0.870397
2,"extreme weight gain, short-term memory loss, h...",1,C1971624,"Loss of appetite,Loss of appetite",0.863028
3,"extreme weight gain, short-term memory loss, h...",1,C1971624,"Loss of appetite,Loss of appetite,Appetite absent",0.862949
4,"extreme weight gain, short-term memory loss, h...",1,C0456814,"Loss of motivation,Loss of motivation",0.860723
5,"extreme weight gain, short-term memory loss, h...",1,C0011057,"Hearing Loss, Sudden,Sudden hearing loss",0.844947
6,"extreme weight gain, short-term memory loss, h...",1,C0000765,"Excessive body weight gain,Excessive weight gain",0.84054
7,"extreme weight gain, short-term memory loss, h...",1,C0701810,"Poor long-term memory,Poor long-term memory,Lo...",0.834841
8,"extreme weight gain, short-term memory loss, h...",1,C0231247,"Failure to lose weight,Failure to lose weight",0.829252
9,"extreme weight gain, short-term memory loss, h...",1,C0424091,"Loss of interest,Lack of interest",0.820407


In [18]:
results_df.head(50)

Unnamed: 0,concept_name,target_concept_id,concept_id,concept_name.1,Similarity
0,Excision of hymen,4248130,4183993,Excision of uterus and supporting structures (...,0.824244
1,Excision of hymen,4248130,4029358,Excision of head structure (procedure),0.819015
2,Excision of hymen,4248130,506989,Primary fleur-de-lys abdominoplasty with excis...,0.816473
3,Excision of hymen,4248130,506988,Primary fleur-de-lys abdominoplasty with excis...,0.813626
4,Excision of hymen,4248130,507710,Primary fleur-de-lys abdominoplasty with excis...,0.812911
5,Excision of hymen,4248130,4125349,Removal of mole of skin by excision (procedure),0.810542
6,Excision of hymen,4248130,4087107,Minor surgery done - excision (finding),0.810011
7,Excision of hymen,4248130,506957,Primary fleur-de-lys abdominoplasty with excis...,0.809283
8,Excision of hymen,4248130,4049783,Complete excision of anus (procedure),0.809114
9,Excision of hymen,4248130,4183993,Excision of uterus and supporting structures,0.80842


In [22]:
# Modification to new variant
new_columns = list(results_df.columns)
new_columns[0] = 'voc1_name'
results_df.columns = new_columns
results_df.rename(columns={
    'target_concept_id': 'voc1_concept_id',
    'concept_id': 'voc2_concept_id',
    'concept_name': 'voc2_name',
    'Similarity': 'similarity'
}, inplace=True)

# Присвойте новый список названий столбцов атрибуту 'columns' DataFrame
results_df['voc1_concept_id']=0
results_df

Unnamed: 0,voc1_name,voc1_concept_id,voc2_concept_id,voc2_name,similarity
0,Excision of hymen,0,4183993,Excision of uterus and supporting structures (...,0.824244
1,Excision of hymen,0,4029358,Excision of head structure (procedure),0.819015
2,Excision of hymen,0,506989,Primary fleur-de-lys abdominoplasty with excis...,0.816473
3,Excision of hymen,0,506988,Primary fleur-de-lys abdominoplasty with excis...,0.813626
4,Excision of hymen,0,507710,Primary fleur-de-lys abdominoplasty with excis...,0.812911
...,...,...,...,...,...
4995,Obstetric umbilical artery Doppler,0,42535610,Doppler ultrasound velocimetry of umbilical ar...,0.933016
4996,Obstetric umbilical artery Doppler,0,42535610,Doppler ultrasound scan velocimetry of umbilic...,0.916219
4997,Obstetric umbilical artery Doppler,0,42535610,Doppler ultrasound scan velocimetry of umbilic...,0.914680
4998,Obstetric umbilical artery Doppler,0,4145534,Doppler ultrasound scan of umbilical artery,0.913540


In [23]:
# Output the results as a Pandas dataframe
df_sorted = results_df.sort_values(by=['voc1_name', 'similarity'], ascending=False)
df_sorted['origin']='BioWordVec'
df_sorted.to_csv('final_table'+'_biowordvec.csv', index=False)
df_sorted.head(50)

Unnamed: 0,voc1_name,voc1_concept_id,voc2_concept_id,voc2_name,similarity,origin
2990,sigmoidosc,0,36684768,Drusen of bilateral optic discs,0.0,BioWordVec
2991,sigmoidosc,0,4095324,Human leukocyte antigen A genotype determination,0.0,BioWordVec
2992,sigmoidosc,0,35610376,Otosyphilis,0.0,BioWordVec
2993,sigmoidosc,0,4146361,Failed attempted abortion with electrolyte imb...,0.0,BioWordVec
2994,sigmoidosc,0,4228331,Leucokeratosis nicotina palati,0.0,BioWordVec
2995,sigmoidosc,0,4283224,Repair of parasternal diaphragmatic hernia,0.0,BioWordVec
2996,sigmoidosc,0,43021066,Aneurysm of patch of right ventricular outflow...,0.0,BioWordVec
2997,sigmoidosc,0,4019257,"Complete tear, sacrotuberous ligament",0.0,BioWordVec
2998,sigmoidosc,0,133645,Burn any degree involving 80-89 percent of bod...,0.0,BioWordVec
2999,sigmoidosc,0,42537493,Recurrent cholesteatoma of mastoid cavity,0.0,BioWordVec


In [None]:
df_biowordvec=df_sorted

# Final merge tables and remove duplicates

In [None]:
import datetime
current_datetime = datetime.datetime.now()
current_date = current_datetime.date()

In [None]:
# Объединить таблицы, устранить дублирующие строки с одинаковыми парами сорс-код и potential_concept_id
table_name = 'combined_table_'+str(current_date)+'.csv'
df_exp = pd.concat([df_tfidf, df_biowordvec, df_levenshtein])
df_exp = df_exp.sort_values(by=['voc1_name'])
df_exp.to_csv(table_name)    

In [None]:
# combined_df=pd.concat([df_tfidf, df_biowordvec, df_levenshtein])                                  
# combined_df['question'] = 'Pick the best semantic match for '
# combined_df = combined_df[['question'] + [col for col in combined_df.columns if col != 'question']]
# combined_df.rename(columns={'voc1_name': 'source_code_description', 'voc2_concept_id': 'potential_target_concept_id'}, inplace=True)
# combined_df = combined_df.sort_values(by=['source_code_description', 'potential_target_concept_id'])
# unique_combined_df = combined_df.drop_duplicates(subset=['source_code_description', 'potential_target_concept_id'])
# unique_combined_df.to_csv('final_table_gpt.csv', index=False)
# unique_combined_df.head(50)

In [None]:
# unique_combined_df = unique_combined_df[['question', 'source_code_description', 'potential_target_concept_id']]
# unique_combined_df['chatgptreply']=''
# unique_combined_df['target_concept_id']=0
# unique_combined_df.head(50)

In [None]:
# login="dev_test5"
# passw = "7hGg365$%fhhTfr$dJ"
# engine = create_engine('postgresql://{}:{}@ovh07.odysseusinc.com:5555/postgres'.format(login, passw))
# table_name = 'chatgpt_table_'+str(current_date)
# unique_combined_df.to_sql(table_name, engine, if_exists='replace', index=False)
# print(f'Import {table_name} was sucessful!')

In [None]:
# Объединить кусочки таблиц (при необходимости)
import pandas as pd
df1 = pd.read_csv('combined_table_1.csv')
df2 = pd.read_csv('combined_table_2.csv')
df3 = pd.read_csv('combined_table_3.csv')
df = pd.concat([df1, df2, df3])
df = df.sort_values(by=['voc1_name', 'voc2_concept_id'])
df = df.drop_duplicates(subset=['voc1_name', 'voc2_concept_id'])
df.to_csv('combined_table_final_091023.csv', index=False)