In [1]:
# Libraries
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Import

In [None]:
# Import indications
path = r'data/bnf_drug_indications_and_prescribing_medications.csv'
drug_indication_df = pd.read_csv(path)

# Import codes
path = r'data/snomed_diagnosis_codes_morethan5.csv'
snomed_diagnosis_codes = pd.read_csv(path)

path = r'data/snomed_problem_codes_morethan5.csv'
snomed_problem_codes = pd.read_csv(path)

path = r'data/snomed_riskfactor_codes_morethan5.csv'
snomed_riskfactor_codes = pd.read_csv(path)

# Import UMLS CUI to snomed map
# MRCONSO.RRF is a pipe-delimited file without a header row, so the column
# names are supplied here. Letting pandas infer a header would turn the first
# concept row into column labels and silently drop it.
# NOTE(review): 'extra' is assumed to absorb the empty field produced by the
# trailing '|' on each line - confirm against the file.
mrconso_columns = ['cui_code', 'language', 'term_status', 'LUI', 'string_type', 'SUI', 'atom_status', 'AUI', 'SAUI', 'SCUI', 'SDUI', 'SAB', 'TTY', 'snomed_code', 'STR', 'SRL', 'suppress', 'CVF', 'extra']
mrconso_chunks = pd.read_csv(
    r"data/MRCONSO.RRF",
    sep="|",
    header=None,
    names=mrconso_columns,
    # Read every field as text: source codes mix numeric and alphanumeric
    # values, and they are later joined against string SNOMED codes.
    dtype=str,
    chunksize=10000000,
)
mrconso_df = pd.concat(list(mrconso_chunks))
print('Done!')

# The MetaMap batch output is loaded in its own cell further down (with
# read_table); the earlier read_fwf attempt was overwritten there before any
# use, so it is no longer executed here.

## Map UMLS CUI to SNOMED

mrconso_df

In [3]:
# Keep the English rows whose source (SAB) is 'SNOMEDCT_US', then reduce the
# table to its distinct (cui_code, snomed_code) pairs.
is_english_snomed = mrconso_df['language'].eq('ENG') & mrconso_df['SAB'].eq('SNOMEDCT_US')
mrconso_df = (
    mrconso_df.loc[is_english_snomed]
    .reset_index(drop=True)
    .loc[:, ['cui_code', 'snomed_code']]
    .drop_duplicates()
)

In [4]:
# Inspect the CUI-to-SNOMED lookup left after filtering and de-duplication
mrconso_df

Unnamed: 0,cui_code,snomed_code
0,C0000039,102735002
2,C0000052,58488005
7,C0000097,285407008
9,C0000102,13579002
14,C0000163,112116001
...,...,...
1597433,C5699989,640961000124109
1597435,C5699990,640971000124102
1597437,C5699991,138875005
1597438,C5699992,6011000124106


metamap_df

In [55]:
# Import metamap output
# read_table works better than read_fwf
# header=None keeps the first line of the file as data. The column label is
# assigned explicitly below, so treating line 1 as a header would only
# discard it.
metamap_output_df = pd.read_table('data/metamap_batch_output', header=None)
metamap_output_df.columns = ['metamap_text']

In [None]:
# Filter for rows and text of interest
# na=False makes the masks strictly boolean even if a line was read as NaN.
metamap_text = metamap_output_df['metamap_text']
is_processing_line = metamap_text.str.contains('Processing', case=False, na=False)
is_disease_line = metamap_text.str.contains('Disease or Syndrome', case=False, na=False)

# Build a fresh frame (fill na, reset index) instead of mutating a slice in
# place, so the later column assignments act on an independent object.
filtered_metamap_output_df = (
    metamap_output_df.loc[is_processing_line | is_disease_line]
    .fillna('')
    .reset_index(drop=True)
)

In [45]:
# Inspect the retained MetaMap lines ('Processing' and 'Disease or Syndrome' rows)
filtered_metamap_output_df

Unnamed: 0,metamap_text
0,"Processing 'text_000N_20459.tx.1': ""Primary hy..."
1,833 C0342879 : Primary hypercholesterolem...
2,Processing 'text_000N_211766.tx.1': Combined (...
3,833 C0020474 : Familial combined hyperlip...
4,833 C2047520 : Mixed hyperlipidemia (diso...
...,...
9812,901 C0030567 : Parkinson disease (Parkins...
9813,Processing 'text_000N_153641.tx.1': Parkinson'...
9814,1000 C0030567 : Parkinson disease (Parkins...
9815,Processing 'text_000N_20735.tx.1': Parkinson's...


In [None]:
# First filter processing lines
# A processing line is one that *starts* with "Processing". Matching the word
# anywhere (case-insensitively) would also catch candidate lines whose concept
# name contains "processing", and their text would then be forward-filled as
# if it were an indication.
metamap_text = filtered_metamap_output_df['metamap_text']
is_processing_line = metamap_text.str.match(r'\s*Processing\s', na=False)

# Indication = everything after the first ':' with double quotes removed.
# Computed for the whole column at once and kept only on processing lines;
# all other rows get NaN (lines without a ':' also yield NaN, not an error).
filtered_metamap_output_df['indication'] = (
    metamap_text.str.split(':', n=1).str[1]
    .str.replace('"', '', regex=False)
    .where(is_processing_line)
)



In [None]:
# Second filter codes / disorders
metamap_text = filtered_metamap_output_df['metamap_text']
is_disease_line = metamap_text.str.contains('Disease or Syndrome', case=False, na=False)

# Parse "<CUI> : <concept text>" in one pass. The CUI is matched by its shape
# ('C' followed by seven digits) rather than sliced between the first 'C' and
# the last ':', so a colon inside the concept name or an earlier 'C' cannot
# leak into the code. The concept text keeps everything after the colon that
# follows the CUI. Rows that do not match the pattern get NaN in both columns.
parsed = metamap_text.str.extract(r'\b(?P<cui_code>C\d{7})\s*:(?P<disease_or_syndrome>.*)')

# Only 'Disease or Syndrome' rows receive values; every other row stays NaN.
for column in ('cui_code', 'disease_or_syndrome'):
    filtered_metamap_output_df[column] = parsed[column].where(is_disease_line)

In [None]:
filtered_metamap_output_df = (
    filtered_metamap_output_df
    # Forward fill indication column so each code row carries its indication
    .assign(indication=lambda frame: frame['indication'].ffill())
    # Drop rows where no code was extracted
    .dropna(subset=['cui_code'])
    # Drop full text column
    .loc[:, ['indication', 'cui_code', 'disease_or_syndrome']]
    # Drop duplicates
    .drop_duplicates()
    # Reset index
    .reset_index(drop=True)
)

In [60]:
# Inspect the parsed table: one row per distinct (indication, cui_code, disease_or_syndrome)
filtered_metamap_output_df

Unnamed: 0,indication,cui_code,disease_or_syndrome
0,Primary hypercholesterolaemia in patients who...,C0342879,Primary hypercholesterolemia [Disease or Synd...
1,Combined (mixed) hyperlipidaemia in patients ...,C0020474,Familial combined hyperlipidemia (Hyperlipide...
2,Combined (mixed) hyperlipidaemia in patients ...,C2047520,Mixed hyperlipidemia (disorder) [Disease or S...
3,Heterozygous familial hypercholesterolaemia i...,C0342882,Familial hypercholesterolemia - heterozygous ...
4,Homozygous familial hypercholesterolaemia in ...,C0342881,Familial homozygous hypercholesterolemia (Hom...
...,...,...,...
1797,Parkinson's disease (patients not receiving l...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...
1798,Parkinson's disease (patients transferring fr...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...
1799,Severe Parkinson's disease inadequately contr...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...
1800,Parkinson's disease (for fine adjustment of S...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...


indication_code_df

In [63]:
# Attach SNOMED codes to each parsed indication via the shared CUI column
indication_code_df = filtered_metamap_output_df.merge(mrconso_df, on='cui_code', how='inner')

In [64]:
# Inspect indications joined to their SNOMED codes (an indication repeats once per matched code)
indication_code_df

Unnamed: 0,indication,cui_code,disease_or_syndrome,snomed_code
0,Primary hypercholesterolaemia in patients who...,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
1,"Primary hypercholesterolaemia, or combined (m...",C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
2,Primary hypercholesterolaemia (type IIa incl...,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
3,Primary hypercholesterolaemia (type IIa inclu...,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
4,Severe primary hypercholesterolaemia (type II...,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
...,...,...,...,...
7507,Acute pulmonary oedema,C0155919,Acute pulmonary edema [Disease or Syndrome],123262009
7508,Acute pulmonary oedema,C0155919,Acute pulmonary edema [Disease or Syndrome],40541001
7509,Acute pulmonary oedema,C0155919,Acute pulmonary edema [Disease or Syndrome],195113008
7510,Acute pulmonary oedema,C0155919,Acute pulmonary edema [Disease or Syndrome],196149004


drug_indication_code_df

In [110]:
# Strip both so merge works properly
drug_indication_df2 = drug_indication_df.copy()
indication_code_df2 = indication_code_df.copy()

# Remove text after BMI so obesity works properly, then trim whitespace.
# Done with vectorised string methods: this does not depend on the frames
# having a 0..n-1 index, and missing indications stay NaN instead of raising.
for frame in (drug_indication_df2, indication_code_df2):
    frame['indication'] = (
        frame['indication']
        .str.split('BMI', n=1).str[0]
        .str.strip()
    )

In [112]:
# Exact-text join of BNF drug indications to the coded indications
drug_indication_code_df = drug_indication_df2.merge(indication_code_df2, on='indication', how='inner')

In [113]:
# Drop duplicate rows, then give the columns source-prefixed names
drug_indication_code_df = (
    drug_indication_code_df
    .drop_duplicates()
    .rename(columns={
        'drug_name': 'bnf_drug_name',
        'indication': 'bnf_indication',
        'prescribing_medication': 'icht_prescribing_medication',
        'disease_or_syndrome': 'cui_disease_or_syndrome',
    })
)

In [114]:
# Inspect the exact-match drug / indication / code table
drug_indication_code_df

Unnamed: 0,bnf_drug_name,bnf_indication,icht_prescribing_medication,cui_code,cui_disease_or_syndrome,snomed_code
0,Atorvastatin,Primary hypercholesterolaemia in patients who ...,atorvastatin,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
1,Atorvastatin,Primary hypercholesterolaemia in patients who ...,Atorvastatin,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
2,Atorvastatin,Combined (mixed) hyperlipidaemia in patients w...,atorvastatin,C0020474,Familial combined hyperlipidemia (Hyperlipide...,48190005
3,Atorvastatin,Combined (mixed) hyperlipidaemia in patients w...,atorvastatin,C0020474,Familial combined hyperlipidemia (Hyperlipide...,238040008
4,Atorvastatin,Combined (mixed) hyperlipidaemia in patients w...,atorvastatin,C0020474,Familial combined hyperlipidemia (Hyperlipide...,190780005
...,...,...,...,...,...,...
15682,Co-careldopa,Parkinson's disease (patients not receiving le...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,154999006
15683,Co-careldopa,Parkinson's disease (patients not receiving le...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,192825001
15684,Co-careldopa,Parkinson's disease (patients not receiving le...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,155000006
15685,Co-careldopa,Parkinson's disease (patients not receiving le...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,192831003


In [116]:
# See how many overlap 

In [117]:
# Give all three code lists the same join column name, 'snomed_code'
snomed_problem_codes = snomed_problem_codes.rename(columns={'PROBLEM': 'snomed_code'})
snomed_diagnosis_codes = snomed_diagnosis_codes.rename(columns={'DIAGNOSIS_CODE_SNOMED': 'snomed_code'})
snomed_riskfactor_codes = snomed_riskfactor_codes.rename(columns={'SNOMED_CODE': 'snomed_code'})

In [118]:
# Cast the join column to str in each code list
for code_table in (snomed_problem_codes, snomed_diagnosis_codes, snomed_riskfactor_codes):
    code_table['snomed_code'] = code_table['snomed_code'].astype(str)

In [119]:
# Inner-join the exact-match table to each code list (on their shared columns)
new_problem_df = drug_indication_code_df.merge(snomed_problem_codes)
new_riskfactor_df = drug_indication_code_df.merge(snomed_riskfactor_codes)
new_diagnosis_df = drug_indication_code_df.merge(snomed_diagnosis_codes)

In [120]:
# For each code list: distinct codes matched, distinct codes available, and
# the matched fraction. All nine values are shown because the first cell sets
# ast_node_interactivity = "all".

# Problem codes
new_problem_df['snomed_code'].nunique()
snomed_problem_codes['snomed_code'].nunique()
new_problem_df['snomed_code'].nunique() / snomed_problem_codes['snomed_code'].nunique()
# Risk-factor codes
new_riskfactor_df['snomed_code'].nunique()
snomed_riskfactor_codes['snomed_code'].nunique()
new_riskfactor_df['snomed_code'].nunique() / snomed_riskfactor_codes['snomed_code'].nunique()
# Diagnosis codes
new_diagnosis_df['snomed_code'].nunique()
snomed_diagnosis_codes['snomed_code'].nunique()
new_diagnosis_df['snomed_code'].nunique() / snomed_diagnosis_codes['snomed_code'].nunique()

336

2361

0.1423125794155019

41

211

0.1943127962085308

187

1184

0.1579391891891892

Fuzzy indication merge 

In [127]:
import difflib

# Candidate strings to match against. De-duplicated because indication_code_df
# repeats an indication once per mapped code; the extra copies cost a
# comparison each on every lookup without changing which string scores best.
# NaN candidates are dropped because difflib cannot compare them.
candidate_indications = indication_code_df['indication'].dropna().unique().tolist()

# Look up the closest candidate once per distinct BNF indication (the same
# indication text appears on several rows), then map the result back.
# Each value is a list holding at most one match (n=1) scoring at least 0.8.
closest_matches = {
    text: difflib.get_close_matches(text, candidate_indications, n=1, cutoff=0.8)
    for text in drug_indication_df['indication'].dropna().unique()
}

drug_indication_df2 = drug_indication_df.copy()
# Rows with a missing indication get an empty list, i.e. "no fuzzy match".
drug_indication_df2['new_indication'] = drug_indication_df2['indication'].map(
    lambda text: closest_matches.get(text, [])
)

In [133]:
# Select first element in list for fuzzy indication merge; rows without a
# fuzzy match fall back to their original indication text.
# Built in one pass over the two columns, so it does not rely on the index
# labels coinciding with row positions.
drug_indication_df2['indication_2'] = [
    matches[0] if len(matches) >= 1 else original
    for matches, original in zip(drug_indication_df2['new_indication'], drug_indication_df2['indication'])
]


In [135]:
# Inspect the fuzzy-match result: 'new_indication' holds the match list, 'indication_2' the text used for merging
drug_indication_df2

Unnamed: 0,drug_name,indication,prescribing_medication,new_indication,indication_2
0,Atorvastatin,Primary hypercholesterolaemia in patients who ...,atorvastatin,[ Primary hypercholesterolaemia in patients wh...,Primary hypercholesterolaemia in patients who...
1,Atorvastatin,Combined (mixed) hyperlipidaemia in patients w...,atorvastatin,[ Combined (mixed) hyperlipidaemia in patients...,Combined (mixed) hyperlipidaemia in patients ...
2,Atorvastatin,Heterozygous familial hypercholesterolaemia in...,atorvastatin,[ Heterozygous familial hypercholesterolaemia ...,Heterozygous familial hypercholesterolaemia i...
3,Atorvastatin,Homozygous familial hypercholesterolaemia in ...,atorvastatin,[ Homozygous familial hypercholesterolaemia in...,Homozygous familial hypercholesterolaemia in ...
4,Atorvastatin,Primary prevention of cardiovascular events in...,atorvastatin,[],Primary prevention of cardiovascular events in...
...,...,...,...,...,...
4173,Co-careldopa,Parkinson's disease (patients not receiving le...,Co-careldopa,[ Parkinson's disease (patients not receiving ...,Parkinson's disease (patients not receiving l...
4174,Co-careldopa,Parkinson's disease (patients transferring fro...,Co-careldopa,[ Parkinson's disease (patients transferring f...,Parkinson's disease (patients transferring fr...
4175,Co-careldopa,Severe Parkinson's disease inadequately contro...,Co-careldopa,[ Severe Parkinson's disease inadequately cont...,Severe Parkinson's disease inadequately contr...
4176,Co-careldopa,Parkinson's disease (for fine adjustment of Si...,Co-careldopa,[ Parkinson's disease (for fine adjustment of ...,Parkinson's disease (for fine adjustment of S...


In [140]:
# New merge with indication_2: join the fuzzy-resolved text to the coded indications
fuzzy_join_input = drug_indication_df2[['drug_name', 'prescribing_medication', 'indication_2']]
drug_indication_code_df_2 = fuzzy_join_input.merge(
    indication_code_df,
    how='inner',
    left_on='indication_2',
    right_on='indication',
)

In [142]:
# Drop duplicate rows, apply the source-prefixed column names, and remove the
# helper join key 'indication_2'
drug_indication_code_df_2 = (
    drug_indication_code_df_2
    .drop_duplicates()
    .rename(columns={
        'drug_name': 'bnf_drug_name',
        'indication': 'bnf_indication',
        'prescribing_medication': 'icht_prescribing_medication',
        'disease_or_syndrome': 'cui_disease_or_syndrome',
    })
    .drop('indication_2', axis=1)
)

In [143]:
# Inspect the fuzzy-match drug / indication / code table
drug_indication_code_df_2

Unnamed: 0,bnf_drug_name,icht_prescribing_medication,bnf_indication,cui_code,cui_disease_or_syndrome,snomed_code
0,Atorvastatin,atorvastatin,Primary hypercholesterolaemia in patients who...,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
1,Atorvastatin,Atorvastatin,Primary hypercholesterolaemia in patients who...,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
2,Atorvastatin,atorvastatin,Combined (mixed) hyperlipidaemia in patients ...,C0020474,Familial combined hyperlipidemia (Hyperlipide...,48190005
3,Atorvastatin,atorvastatin,Combined (mixed) hyperlipidaemia in patients ...,C0020474,Familial combined hyperlipidemia (Hyperlipide...,238040008
4,Atorvastatin,atorvastatin,Combined (mixed) hyperlipidaemia in patients ...,C0020474,Familial combined hyperlipidemia (Hyperlipide...,190780005
...,...,...,...,...,...,...
16792,Co-careldopa,Co-careldopa,Parkinson's disease (patients not receiving l...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,154999006
16793,Co-careldopa,Co-careldopa,Parkinson's disease (patients not receiving l...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,192825001
16794,Co-careldopa,Co-careldopa,Parkinson's disease (patients not receiving l...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,155000006
16795,Co-careldopa,Co-careldopa,Parkinson's disease (patients not receiving l...,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,192831003


In [158]:
# Merge fuzzy with normal

In [155]:
# Stack the exact-match table on top of the fuzzy-match table, with the fuzzy
# table's indication text whitespace-stripped first
drug_indication_code_df_3 = pd.concat([
    drug_indication_code_df,
    drug_indication_code_df_2.assign(
        bnf_indication=drug_indication_code_df_2['bnf_indication'].str.strip()
    ),
])

In [156]:
# Remove rows present in both sources and renumber the index from zero
drug_indication_code_df_3 = (
    drug_indication_code_df_3
    .drop_duplicates()
    .reset_index(drop=True)
)

In [157]:
# Inspect the combined (exact + fuzzy) drug / indication / code table
drug_indication_code_df_3

Unnamed: 0,bnf_drug_name,bnf_indication,icht_prescribing_medication,cui_code,cui_disease_or_syndrome,snomed_code
0,Atorvastatin,Primary hypercholesterolaemia in patients who ...,atorvastatin,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
1,Atorvastatin,Primary hypercholesterolaemia in patients who ...,Atorvastatin,C0342879,Primary hypercholesterolemia [Disease or Synd...,238076009
2,Atorvastatin,Combined (mixed) hyperlipidaemia in patients w...,atorvastatin,C0020474,Familial combined hyperlipidemia (Hyperlipide...,48190005
3,Atorvastatin,Combined (mixed) hyperlipidaemia in patients w...,atorvastatin,C0020474,Familial combined hyperlipidemia (Hyperlipide...,238040008
4,Atorvastatin,Combined (mixed) hyperlipidaemia in patients w...,atorvastatin,C0020474,Familial combined hyperlipidemia (Hyperlipide...,190780005
...,...,...,...,...,...,...
16578,Co-careldopa,Parkinson's disease (for fine adjustment of Si...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,154999006
16579,Co-careldopa,Parkinson's disease (for fine adjustment of Si...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,192825001
16580,Co-careldopa,Parkinson's disease (for fine adjustment of Si...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,155000006
16581,Co-careldopa,Parkinson's disease (for fine adjustment of Si...,Co-careldopa,C0030567,Parkinson disease (Parkinson Disease) [Diseas...,192831003


In [159]:
# See how many overlap

In [160]:
# Inner-join the combined (exact + fuzzy) table to each code list
new_problem_df_2 = drug_indication_code_df_3.merge(snomed_problem_codes)
new_riskfactor_df_2 = drug_indication_code_df_3.merge(snomed_riskfactor_codes)
new_diagnosis_df_2 = drug_indication_code_df_3.merge(snomed_diagnosis_codes)

In [161]:
# For each code list: distinct codes matched by the combined table, distinct
# codes available, and the matched fraction. All nine values are shown
# because the first cell sets ast_node_interactivity = "all".

# Problem codes
new_problem_df_2['snomed_code'].nunique()
snomed_problem_codes['snomed_code'].nunique()
new_problem_df_2['snomed_code'].nunique() / snomed_problem_codes['snomed_code'].nunique()
# Risk-factor codes
new_riskfactor_df_2['snomed_code'].nunique()
snomed_riskfactor_codes['snomed_code'].nunique()
new_riskfactor_df_2['snomed_code'].nunique() / snomed_riskfactor_codes['snomed_code'].nunique()
# Diagnosis codes
new_diagnosis_df_2['snomed_code'].nunique()
snomed_diagnosis_codes['snomed_code'].nunique()
new_diagnosis_df_2['snomed_code'].nunique() / snomed_diagnosis_codes['snomed_code'].nunique()

340

2361

0.14400677678949597

41

211

0.1943127962085308

187

1184

0.1579391891891892

In [None]:
# Check numbers match 

In [163]:
# Cross-check: join the coded indications (before any drug join) to each code list
new_problem_df_4 = indication_code_df.merge(snomed_problem_codes)
new_riskfactor_df_4 = indication_code_df.merge(snomed_riskfactor_codes)
new_diagnosis_df_4 = indication_code_df.merge(snomed_diagnosis_codes)

In [164]:
# Same three figures per code list as above, computed from the coded
# indications directly, for comparison with the drug-level counts. All nine
# values are shown because the first cell sets ast_node_interactivity = "all".

# Problem codes
new_problem_df_4['snomed_code'].nunique()
snomed_problem_codes['snomed_code'].nunique()
new_problem_df_4['snomed_code'].nunique() / snomed_problem_codes['snomed_code'].nunique()
# Risk-factor codes
new_riskfactor_df_4['snomed_code'].nunique()
snomed_riskfactor_codes['snomed_code'].nunique()
new_riskfactor_df_4['snomed_code'].nunique() / snomed_riskfactor_codes['snomed_code'].nunique()
# Diagnosis codes
new_diagnosis_df_4['snomed_code'].nunique()
snomed_diagnosis_codes['snomed_code'].nunique()
new_diagnosis_df_4['snomed_code'].nunique() / snomed_diagnosis_codes['snomed_code'].nunique()

340

2361

0.14400677678949597

41

211

0.1943127962085308

187

1184

0.1579391891891892

Save

In [165]:
# Save
#drug_indication_code_df.to_csv('drugs_indications_and_codes.csv', index=False)

In [166]:
# Save
#drug_indication_code_df_3.to_csv('drugs_indications_and_codes_fuzzy.csv', index=False)