# Use environment: transformer

In [1]:
# RESOURCES
# https://stackoverflow.com/questions/63461262/bert-sentence-embeddings-from-transformers
# https://chtalhaanwar.medium.com/measure-sentence-similarity-using-the-pre-trained-bert-model-eeaf20bc1933

In [2]:
from datetime import datetime
date = datetime.today().strftime('%y%m%d')
print ('Last modified by Xiaoqing: ' + date)

Last modified by Xiaoqing: 211115


In [3]:
import torch #pytorch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity #for similarity
import pandas as pd

In [4]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", output_hidden_states=True)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
#create embeddings
def get_embeddings(text,token_length):
  tokens=tokenizer(text,max_length=token_length,padding='max_length',truncation=True)
  output=model(torch.tensor(tokens.input_ids).unsqueeze(0),
               attention_mask=torch.tensor(tokens.attention_mask).unsqueeze(0)).hidden_states[-1]
  return torch.mean(output,axis=1).detach().numpy()

#calculate similarity
def calculate_similarity(text1,text2,token_length=20):
    out1=get_embeddings(text1,token_length=token_length)#create embeddings of text
    out2=get_embeddings(text2,token_length=token_length)#create embeddings of text
    sim=cosine_similarity(out1,out2)[0][0]
    return sim

In [6]:
# text1='Pneumonia'
# text2='Headache'
# calculate_similarity(text1,text2)

# text1='Pneumonia'
# text2='Tuberculosis'
# calculate_similarity(text1,text2)

# text1='Pneumonia'
# text2='some random stuff'
# calculate_similarity(text1,text2)

# create three data frames: problem, test, treatment

In [7]:
df1 = pd.read_csv('patients_ner_211112.csv') # 10 synthetic patients from google Drive
df2 = pd.read_csv('trials_ner_211112.csv') # 5 real clinical trials from google Drive

df3 = df1[df1['ent_type'] == 'PROBLEM']
df4 = df2[df2['ent_type'] == 'PROBLEM']

df5 = df1[df1['ent_type'] == 'TEST']
df6 = df2[df2['ent_type'] == 'TEST']

df7 = df1[df1['ent_type'] == 'TREATMENT']
df8 = df2[df2['ent_type'] == 'TREATMENT']


In [8]:
df_problem = pd.DataFrame(columns=['patient_id','trial_id','inclusion','similarity']) # create empty df with column names only so we can append rows to it
df_test = pd.DataFrame(columns=['patient_id','trial_id','inclusion','similarity']) # create empty df with column names only so we can append rows to it
df_treatment = pd.DataFrame(columns=['patient_id','trial_id','inclusion','similarity']) # create empty df with column names only so we can append rows to it


# For each patient-clinical-trial pair, calculate similarities

In [9]:
%%time
for index1, row1 in df3.iterrows():
    for index2, row2 in df4.iterrows(): 
        simi = calculate_similarity(row1['sentence'],row2['sentence'])
        df_problem = df_problem.append({'patient_id': row1['patient_id'],'trial_id': row2['trial_id'],'inclusion': row2['inclusion'],'prob_sim': simi}, ignore_index = True)
    

CPU times: user 15min 23s, sys: 8.29 s, total: 15min 31s
Wall time: 15min 26s


In [10]:
df_problem.head()

Unnamed: 0,patient_id,trial_id,inclusion,similarity,prob_sim
0,1.0,1.0,1.0,,0.88992
1,1.0,1.0,0.0,,0.883804
2,1.0,1.0,0.0,,0.885631
3,1.0,1.0,0.0,,0.915829
4,1.0,1.0,0.0,,0.857459


In [11]:
%%time
for index1, row1 in df5.iterrows():
    for index2, row2 in df6.iterrows(): 
        simi = calculate_similarity(row1['sentence'],row2['sentence'])
        df_test = df_test.append({'patient_id': row1['patient_id'],'trial_id': row2['trial_id'],'inclusion': row2['inclusion'],'test_sim': simi}, ignore_index = True)
    

CPU times: user 16min 9s, sys: 9 s, total: 16min 18s
Wall time: 16min 11s


In [12]:
%%time
for index1, row1 in df7.iterrows():
    for index2, row2 in df8.iterrows(): 
        simi = calculate_similarity(row1['sentence'],row2['sentence'])
        df_treatment = df_treatment.append({'patient_id': row1['patient_id'],'trial_id': row2['trial_id'],'inclusion': row2['inclusion'],'treat_sim': simi}, ignore_index = True)
    

CPU times: user 8min 29s, sys: 4.69 s, total: 8min 34s
Wall time: 8min 30s


In [13]:
df9 = df_problem.groupby(['patient_id', 'trial_id', 'inclusion'])['prob_sim'].mean().reset_index()
df10 = df_test.groupby(['patient_id', 'trial_id', 'inclusion'])['test_sim'].mean().reset_index()
df11 = df_treatment.groupby(['patient_id', 'trial_id', 'inclusion'])['treat_sim'].mean().reset_index()

In [14]:
key = ['patient_id', 'trial_id', 'inclusion']
df12 = df9.merge(df10, on=key, how='outer').merge(df11, on=key, how='outer')


In [15]:
df12.head(15)

Unnamed: 0,patient_id,trial_id,inclusion,prob_sim,test_sim,treat_sim
0,1.0,1.0,0.0,0.882579,0.769954,0.824857
1,1.0,1.0,1.0,0.88992,0.816201,
2,1.0,2.0,0.0,0.867623,0.779779,0.803614
3,1.0,2.0,1.0,0.866833,0.769566,0.813542
4,1.0,3.0,0.0,0.894545,0.779701,0.822263
5,1.0,3.0,1.0,0.911047,0.782193,0.747964
6,1.0,4.0,0.0,0.814233,,0.782762
7,1.0,4.0,1.0,0.857412,0.758333,0.808552
8,1.0,5.0,0.0,0.810141,0.756612,0.760447
9,1.0,5.0,1.0,0.799751,0.756208,0.73902


# Save CSV 

In [17]:
df12.to_csv(('demo_'+ date + '.csv'),index = False)