### Group 10 - Donaire Gaurana Isaac Ortiz

#### Medicine Recommendation System

---------------------------------

In [1]:
import pandas as pd
import numpy as np

from scipy import spatial

import os

In [2]:
# Directory that holds the SCMFDD-L dataset files.
path = './datasets/SCMFDD-L/'

# Names of every entry found in that directory.
datasets = os.listdir(path)

# Load the third listed entry as the drug/disease association table.
# NOTE(review): os.listdir gives no ordering guarantee, so index 2 may select
# a different file on another machine - confirm which file is intended and
# consider naming it explicitly.
dd_assoc = pd.read_csv(os.path.join(path, datasets[2]))
dd_assoc

Unnamed: 0,drug_id,drug_name,disease_id,disease_name
0,C074619,bivalirudin,MESH:D003328,Coronary Thrombosis
1,C074619,bivalirudin,MESH:D006470,Hemorrhage
2,C074619,bivalirudin,MESH:D007249,Inflammation
3,C074619,bivalirudin,MESH:D007511,Ischemia
4,C074619,bivalirudin,MESH:D013921,Thrombocytopenia
...,...,...,...,...
49212,D013747,Tetrabenazine,MESH:D017109,"Akathisia, Drug-Induced"
49213,D013747,Tetrabenazine,MESH:D018476,Hypokinesia
49214,D013747,Tetrabenazine,MESH:D020734,Parkinsonian Disorders
49215,D013747,Tetrabenazine,MESH:D020820,Dyskinesias


In [3]:
# For every (disease_name, drug_name) pair, count the non-null disease_id
# entries and store that count in a single column called 'binary'.
matrix_df = (
    dd_assoc
    .groupby(['disease_name', 'drug_name'])['disease_id']
    .count()
    .to_frame(name='binary')
)
matrix_df

Unnamed: 0_level_0,Unnamed: 1_level_0,binary
disease_name,drug_name,Unnamed: 2_level_1
"ACTH Deficiency, Isolated",Dexamethasone,1
"ACTH Deficiency, Isolated",Hydrocortisone,1
AIDS Dementia Complex,Cocaine,1
AIDS Dementia Complex,Methamphetamine,1
AIDS Dementia Complex,Stavudine,1
...,...,...
beta-Thalassemia,Hydroxyurea,1
beta-Thalassemia,Iron,1
beta-Thalassemia,resveratrol,1
succinic semialdehyde dehydrogenase deficiency,4-hydroxybutyric acid,1


In [4]:
# Reshape to a drug x disease table: one row per drug_name, one column per
# disease_name, filled from 'binary'. Pairs that never occur become 0.
matrix = (
    matrix_df
    .pivot_table(index='drug_name', columns='disease_name', values='binary')
    .fillna(0)
)
matrix

disease_name,"ACTH Deficiency, Isolated",AIDS Dementia Complex,AIDS-Related Complex,AIDS-Related Opportunistic Infections,AIDS-related Kaposi sarcoma,"Abdomen, Acute",Abdominal Abscess,Abdominal Injuries,Abdominal Neoplasms,Abdominal Pain,...,"Wounds, Penetrating",Xanthomatosis,Xeroderma Pigmentosum,Xerophthalmia,Xerostomia,Zollinger-Ellison Syndrome,Zygomycosis,alpha 1-Antitrypsin Deficiency,beta-Thalassemia,succinic semialdehyde dehydrogenase deficiency
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"1,10-phenanthroline",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"1,3-butylene glycol",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-Butanol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-benzylimidazole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-octen-3-ol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zolmitriptan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zolpidem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zomepirac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zonisamide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
def jaccard_similarity(x, y):
    """Return the Jaccard similarity of two equally long binary vectors.

    Every element is interpreted by truthiness: the result is the number of
    positions where both vectors are truthy divided by the number of
    positions where at least one of them is truthy.

    Parameters
    ----------
    x, y : array-like
        Sequences of the same length (0/1 values, booleans, ...).

    Returns
    -------
    float
        A value in [0, 1]. When neither vector has any truthy element the
        ratio is undefined (0 / 0); 0.0 is returned in that case instead of
        NaN so that downstream averages and threshold checks stay finite.
    """
    intersection = np.logical_and(x, y).sum()
    union = np.logical_or(x, y).sum()

    # Guard the 0 / 0 case, which would otherwise yield NaN plus a RuntimeWarning.
    if union == 0:
        return 0.0

    return intersection / float(union)

In [6]:
def get_corr_matrix_jaccard(disease, matrix):
    """Score every disease column against ``disease`` with Jaccard similarity.

    Only the rows of ``matrix`` whose value in the ``disease`` column equals 1
    are considered. Within that subset each column is compared with the
    ``disease`` column and the similarity is rounded to two decimals.

    Parameters
    ----------
    disease : str
        Name of the reference column in ``matrix``.
    matrix : pandas.DataFrame
        Table whose columns are compared with each other.

    Returns
    -------
    pandas.DataFrame or str
        A one-column frame ('Correlation') indexed by the columns of
        ``matrix``; the string 'disease not in matrix' when ``disease`` is
        not a column.
    """
    # Validate before indexing: matrix[disease] raises KeyError for an unknown
    # column, so a check placed after the row filter could never be reached.
    if disease not in matrix.columns:
        return 'disease not in matrix'

    matrix_func = matrix[matrix[disease] == 1]
    disease_init = list(matrix_func[disease])

    # jaccard similarity of the reference column against every column
    correlation = [
        round(jaccard_similarity(disease_init, list(matrix_func[disease_mat])), 2)
        for disease_mat in matrix_func.columns
    ]

    return pd.DataFrame(correlation, columns = ['Correlation'], index = matrix_func.columns)

In [7]:
def get_corr_matrix_cosine(disease, matrix):
    """Score every disease column against ``disease`` with cosine similarity.

    Only the rows of ``matrix`` whose value in the ``disease`` column equals 1
    are considered. Within that subset each column is compared with the
    ``disease`` column and the similarity is rounded to two decimals.

    Parameters
    ----------
    disease : str
        Name of the reference column in ``matrix``.
    matrix : pandas.DataFrame
        Table whose columns are compared with each other.

    Returns
    -------
    pandas.DataFrame or str
        A one-column frame ('Correlation') indexed by the columns of
        ``matrix``; the string 'disease not in matrix' when ``disease`` is
        not a column.
    """
    # Validate before indexing: matrix[disease] raises KeyError for an unknown
    # column, so a check placed after the row filter could never be reached.
    if disease not in matrix.columns:
        return 'disease not in matrix'

    matrix_func = matrix[matrix[disease] == 1]
    disease_init = list(matrix_func[disease])
    init_is_zero = not any(disease_init)
    correlation = list()

    # cosine similarity
    for disease_mat in matrix_func.columns:
        disease_test = list(matrix_func[disease_mat])
        if init_is_zero or not any(disease_test):
            # Cosine similarity is undefined for a zero vector (0 / 0 inside
            # SciPy, producing NaN and a RuntimeWarning); score it as 0.0.
            correlation.append(0.0)
        else:
            correlation.append(round(1 - spatial.distance.cosine(disease_init, disease_test), 2))

    return pd.DataFrame(correlation, columns = ['Correlation'], index = matrix_func.columns)

In [8]:
def get_drug_matrix(correlation, disease, old_matrix, threshold = 0.1):
    """Build the reduced, transposed matrix used for ranking.

    Steps:
      1. keep the rows of ``old_matrix`` whose ``disease`` value equals 1;
      2. keep the columns whose 'Correlation' score is at least ``threshold``;
      3. transpose, so the kept rows become columns;
      4. drop every resulting column that holds exactly one distinct value,
         since a constant column cannot tell the remaining rows apart.

    Parameters
    ----------
    correlation : pandas.DataFrame
        Frame with a 'Correlation' column, indexed by column names of
        ``old_matrix`` (as returned by the ``get_corr_matrix_*`` functions).
    disease : str
        Reference column in ``old_matrix``.
    old_matrix : pandas.DataFrame
        The full table to reduce.
    threshold : float, optional
        Minimum 'Correlation' a column needs to be kept. Defaults to 0.1,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Transposed sub-matrix: rows are the retained columns of
        ``old_matrix``, columns are its retained rows.
    """
    matrix_func = old_matrix[old_matrix[disease] == 1]

    rem_cols = list(correlation[correlation['Correlation'] >= threshold].index)
    rem_mat = matrix_func[matrix_func.columns.intersection(rem_cols)].T

    # nunique() ignores NaN just like value_counts() does, so "!= 1" keeps the
    # same columns as the former len(value_counts()) == 1 skip.
    non_constant = [col for col in rem_mat.columns if rem_mat[col].nunique() != 1]

    return rem_mat[non_constant]

In [12]:
def get_recommendation_jaccard(matrix):
    """Average Jaccard similarity of each column against all columns.

    Every column of ``matrix`` is compared with every column (itself
    included); each pairwise score is rounded to two decimals and the scores
    belonging to one column are averaged.

    Returns a one-column DataFrame ('Correlation') indexed by
    ``matrix.columns``.
    """
    # Materialise each column once as a plain list.
    column_values = [list(matrix[name]) for name in matrix.columns]

    averages = []
    for base in column_values:
        scores = [round(jaccard_similarity(base, other), 2) for other in column_values]
        averages.append(sum(scores) / len(scores))

    return pd.DataFrame(averages, columns = ['Correlation'], index = matrix.columns)

In [17]:
def get_recommendation_cosine(matrix):
    """Average cosine similarity of each column against all columns.

    Every column of ``matrix`` is compared with every column (itself
    included); each pairwise score is rounded to two decimals and the scores
    belonging to one column are averaged.

    Returns a one-column DataFrame ('Correlation') indexed by
    ``matrix.columns``.
    """
    # Materialise each column once as a plain list.
    column_values = [list(matrix[name]) for name in matrix.columns]

    averages = []
    for base in column_values:
        scores = [
            round(1 - spatial.distance.cosine(base, other), 2)
            for other in column_values
        ]
        averages.append(sum(scores) / len(scores))

    return pd.DataFrame(averages, columns = ['Correlation'], index = matrix.columns)

In [18]:
def get_recommendation_list(disease, matrix):
    """Rank the candidates for ``disease`` with both similarity measures.

    For each measure (Jaccard, cosine) the column correlations are computed,
    the matrix is reduced with ``get_drug_matrix`` and the remaining entries
    are scored with the matching ``get_recommendation_*`` function.

    Parameters
    ----------
    disease : str
        Column of ``matrix`` to produce recommendations for.
    matrix : pandas.DataFrame
        The full table passed on to the helper functions.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(jaccard_ranking, cosine_ranking)``, each sorted by 'Correlation'
        in descending order.

    Raises
    ------
    KeyError
        If ``disease`` is not a column of ``matrix``.
    """
    # Fail early with a readable message; the name typically comes from user
    # input, and an unknown name would otherwise surface as a bare pandas
    # KeyError from inside the helpers.
    if disease not in matrix.columns:
        raise KeyError('disease not in matrix: {!r}'.format(disease))

    jaccard_one = get_corr_matrix_jaccard(disease, matrix)
    cosine_one = get_corr_matrix_cosine(disease, matrix)

    mat_jaccard = get_drug_matrix(jaccard_one, disease, matrix)
    mat_cosine = get_drug_matrix(cosine_one, disease, matrix)

    jaccard_ranking = get_recommendation_jaccard(mat_jaccard).sort_values('Correlation', ascending = False)
    cosine_ranking = get_recommendation_cosine(mat_cosine).sort_values('Correlation', ascending = False)

    return jaccard_ranking, cosine_ranking

In [25]:
# Every disease name available as a column of the matrix.
disease_sample = list(matrix.columns)

# Print the choices, one per line.
for disease in disease_sample:
    print(disease)

# Read the disease to produce recommendations for.
init_input = input()

ACTH Deficiency, Isolated
AIDS Dementia Complex
AIDS-Related Complex
AIDS-Related Opportunistic Infections
AIDS-related Kaposi sarcoma
Abdomen, Acute
Abdominal Abscess
Abdominal Injuries
Abdominal Neoplasms
Abdominal Pain
Abducens Nerve Diseases
Aberrant Crypt Foci
Abnormalities, Drug-Induced
Abnormalities, Multiple
Abnormalities, Severe Teratoid
Abortion, Habitual
Abortion, Spontaneous
Abortion, Threatened
Abortion, Veterinary
Abruptio Placentae
Abscess
Absence of Tibia
Acanthamoeba Keratitis
Acantholysis
Accelerated Idioventricular Rhythm
Accutane embryopathy
Achalasia microcephaly
Achlorhydria
Acid-Base Imbalance
Acidosis
Acidosis, Lactic
Acidosis, Renal Tubular
Acidosis, Respiratory
Acinetobacter Infections
Acitretin embryopathy
Acne Vulgaris
Acneiform Eruptions
Acquired CJD
Acquired Hyperostosis Syndrome
Acquired Immunodeficiency Syndrome
Acquired angioedema
Acquired ichthyosis
Acrodermatitis
Acrodermatitis enteropathica
Acromegaly
Acth-Independent Macronodular Adrenal Hyperplasia

SUNCT Syndrome
Sacral defect and anterior sacral meningocele
Sacroiliitis
Sagittal Sinus Thrombosis
Salivary Gland Diseases
Salivary Gland Neoplasms
Salmonella Infections
Salpingitis
Sarcoidosis
Sarcoidosis, Pulmonary
Sarcoma
Sarcoma 180
Sarcoma, Clear Cell
Sarcoma, Ewing
Sarcoma, Experimental
Sarcoma, Kaposi
Sarcoma, Myeloid
Sarcoma, Synovial
Sarcopenia
Scabies
Schistosomiasis
Schistosomiasis japonica
Schistosomiasis mansoni
Schizophrenia
Schizophrenia Spectrum and Other Psychotic Disorders
Schizophrenia, Catatonic
Schizophrenia, Childhood
Schizophrenia, Disorganized
Schizophrenia, Paranoid
Schizotypal Personality Disorder
Schnitzler Syndrome
Sciatic Neuropathy
Sciatica
Scleral Diseases
Scleritis
Scleroderma, Diffuse
Scleroderma, Limited
Scleroderma, Localized
Scleroderma, Systemic
Scleromyxedema
Sclerosis
Scoliosis
Scotoma
Scrapie
Scurvy
Seasonal Affective Disorder
Secernentea Infections
Segawa syndrome, autosomal recessive
Seizures
Seizures, Febrile
Self Mutilation
Seminoma
Sensatio

Depressive Disorder, Major


In [26]:
# Run both recommenders for the disease read into init_input; each result is
# a DataFrame sorted by 'Correlation', highest first.
jaccard_res, cosine_res = get_recommendation_list(init_input, matrix)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [27]:
# Display the Jaccard-based ranking returned by get_recommendation_list.
jaccard_res

Unnamed: 0_level_0,Correlation
drug_name,Unnamed: 1_level_1
Lithium,0.259130
Fluoxetine,0.258551
olanzapine,0.254493
Haloperidol,0.250870
Citalopram,0.250290
...,...
tolcapone,0.097391
orlistat,0.090000
Raloxifene Hydrochloride,0.084493
Eszopiclone,0.063913


In [28]:
# Display the cosine-based ranking returned by get_recommendation_list.
cosine_res

Unnamed: 0_level_0,Correlation
drug_name,Unnamed: 1_level_1
olanzapine,0.336087
Citalopram,0.333768
Amitriptyline,0.333188
Haloperidol,0.330000
Clomipramine,0.327246
...,...
Isotretinoin,0.150870
Diethylstilbestrol,0.147246
Trichlormethiazide,0.121304
orlistat,0.111014
