In [1]:
# Libraries
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)
import random
from scipy import spatial

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Run

In [17]:
# Import
path = r'data/cbr_metric_my_method.csv'
cbr_metric_my_method = pd.read_csv(path, index_col=0)

In [19]:
# get random codes / patients to use in survey
index_list = cbr_metric_my_method.index.to_list()
random.shuffle(index_list)
final_index_list = index_list[:10]

In [76]:
# Import
path = r'data/final_problem_dummies.csv'
problem_dummies = pd.read_csv(path)
# Drop columns
problem_dummies.drop(columns=['SUBJECT', 'PROBLEM_DT_TM'], inplace=True)
# Remove prefix
problem_dummies.columns = problem_dummies.columns.str.strip('PROBLEM_')
# Set index
problem_dummies.set_index('new_subject', inplace=True)

In [None]:
# One hot method

# Get closest point for each subject
for n in final_index_list:
    distance, index = spatial.KDTree(problem_dummies).query(problem_dummies.iloc[n], k=2)
    print(n)
    print(index, distance)

In [41]:
# Import
path = r'data/trimmed_patient_embedding_128d.csv'
patients_embeddings = pd.read_csv(path, index_col=0)

In [None]:
# My method

# Get closest point for each subject
for n in final_index_list:
    distance, index = spatial.KDTree(patients_embeddings).query(patients_embeddings.iloc[n], k=2)
    print(n)
    print(index, distance)

In [None]:
# Import
path = r'data/snomed_codes_parents.csv'
snomed_codes_parents = pd.read_csv(path)
# Change to str
snomed_codes_parents['snomed_code'] = snomed_codes_parents['snomed_code'].astype(int).astype(str)
# Get lists
problem_list = problem_dummies.columns.tolist()
snomed_parent_list = snomed_codes_parents['snomed_code'].tolist()
overlap_list = list(set(problem_list)&set(snomed_parent_list))
len(overlap_list)
# Add in one hot encodings for parents across all patients
n = -1
for code in overlap_list:
    n += 1
    if n % 500 == 0:
        print(n)
    # Get parents
    parent_list = snomed_codes_parents[snomed_codes_parents['snomed_code'] == code]['parents'].values[0].strip('[]').split(', ')
    # Filter parent_list for those in problem_dummies
    parent_list = list(set(problem_list)&set(parent_list))
    # Update problem_dummies
    problem_dummies.loc[problem_dummies[code] == 1, parent_list] = 1
# Create frequency df
sum_df = pd.DataFrame()
sum_df.index = problem_dummies.columns
sum_df['frequency'] = problem_dummies.sum(axis=0)


In [None]:
# Rocheteau method

# Get closest point for spesfic point
# Set up patient
for n in final_index_list:
    patient = problem_dummies.loc[[n]]
    patient = patient.loc[:, (patient != 0).any(axis=0)]
    # Find similarity
    score_list2 = []
    for index, subject in problem_dummies.iterrows():
        if index % 10000 == 0:
            print(index)
        running_score = 0
        # Filter for relevant diseases
        subject = subject[patient.columns]
        # Algorithm
        for column in patient:
            running_score += patient[column].values[0] * subject[column] * ((1/sum_df.loc[column].values[0]) + 0.5)
        running_score = running_score * 5
        running_score = running_score - (patient.sum(axis=1).values[0] + subject.sum())
        # Add to list
        score_list2.append(running_score)

    # Create df
    score_df2 = pd.DataFrame()
    score_df2.index = problem_dummies.index
    score_df2['score'] = score_list2
    # Find top scores
    score_df2.apply(lambda s: pd.Series(s.nlargest(2).index))

Results

In [None]:
# 76031

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 76031 - Asthma, Depressive disorder, Hypothyroidism, Fibromyalgia

# 5457 - Asthma, Depressive disorder, Hypothyroidism, Asperger

# 16789 - Asthma, Hypertensive disorder, Diabetes mellitus type 2, Gastroesophageal reflux disease, Cerebrovascular accident, Mixed anxiety and depressive disorder, Hyperlipidemia, Fibromyalgia

# 39239 - Asthma, Depressive disorder, Fibromyalgia

In [None]:
# 38512

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 38512 -  Ischemic heart disease, Hearing loss, Dementia, Essential hypertension, Alzheimer's disease, Diverticulosis of sigmoid colon

# 13785 - Hypertensive disorder, Myocardial infarction, Cataract, Dementia, Alzheimer's disease

# 3791 - Hearing loss, Dementia

# 88330 - Hypertensive disorder, Hypercholesterolemia, Transient ischemic attack, Osteoarthritis, Diverticular disease, Benign prostatic hyperplasia, Myocardial infarction, Alzheimer's disease,  Retention of urine, Small vessel cerebrovascular disease


In [None]:
# 13675

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.
# Note in this case Rocheteau's method and the One hot method retrieved the same similar patient

# 13675 - Hypertensive disorder, Varicella, Polycystic ovaries, Scoliosis deformity of spine

# 59945 - Hypertensive disorder, Varicella, Polycystic ovaries

# 13674 - Varicella, Polycystic ovaries, Scoliosis deformity of spine

In [None]:
# 3534

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 3534 - Varicella, Rheumatoid arthritis with arteritis

# 56760 - Varicella, Rheumatoid arthritis

# 23902 - Rheumatoid arthritis with arteritis

# 242 - Asthma, Varicella, Transient ischemic attack, Cerebrovascular accident, Fibromyositis, Rheumatoid arthritis with aortitis, Seizures in response to acute event

In [None]:
# 79550

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 79550 - Hypertensive disorder, Hypercholesterolemia, Neoplasm of kidney

# 92747 - Hypertensive disorder, Neoplasm of kidney

# 93178 - Hypertensive disorder, Hypercholesterolemia, Insulin treated type 2 diabetes mellitus, Neoplasm of kidney

# 478 - Hypertensive disorder,  Diabetes mellitus type 2, Hypercholesterolemia, Osteoarthritis, Atrial fibrillation, Mixed anxiety and depressive disorder, Malignant tumor of prostate, Neoplasm of kidney

In [None]:
# 52894

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 52894 - Asthma, Hemorrhoids, Colitis, Viral hepatitis type B, Gastritis, Human immunodeficiency virus infection, Supraventricular tachycardia, Non-alcoholic fatty liver, Portal hypertension, Hemophilia

# 14197 - Asthma, Osteoporosis, Hypertensive disorder, Hypercholesterolemia, Vitamin D deficiency, Depressive disorder, Hemorrhoids, Atrial fibrillation, Chronic obstructive lung disease, Anemia, Rheumatoid arthritis, Inflammatory disease of liver, Dysphagia, Steatosis of liver, Aortic valve regurgitation

# 18386 - Asthma, Depressive disorder, Hemorrhoids, Human immunodeficiency virus infection

# 43224 - Hemorrhoids, Atrial fibrillation, Hypothyroidism, Viral hepatitis type B

In [None]:
# 64561

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 64561 - Hypertensive disorder, Varicella, Gestational diabetes mellitus, Pre-eclampsia

# 39443 - Varicella, Gestational diabetes mellitus, Pre-eclampsia, Pregnancy-induced hypertension

# 5188 - Hypertensive disorder, Varicella, Gestational diabetes mellitus

# 24987 - Varicella, Gestational diabetes mellitus, Pre-eclampsia

In [None]:
# 78751

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 78751 - Hypertensive disorder, Diabetes mellitus type 2, Hypercholesterolemia, Vitamin D deficiency, Benign prostatic hyperplasia, Angina, Old myocardial infarction, Chronic ischemic heart disease

# 77680 - Hypertensive disorder, Diabetes mellitus type 2, Ischemic heart disease, Angina

# 55423 - Hypertensive disorder, Diabetes mellitus type 2, Hypercholesterolemia, Vitamin D deficiency

# 56819 - Hypertensive disorder, Diabetes mellitus type 2,  Benign prostatic hyperplasia, Gastroesophageal reflux disease,  Iron deficiency anemia, Acute non-ST segment elevation myocardial infarction, Angina,  Heart failure, Constipation, Hyperlipidemia, Retention of urine, Bilateral cataracts, Heart failure with normal ejection fraction     

In [None]:
#8684

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.
# Note in this case Rocheteau's method and the One hot method retrieved the same similar patient

# 89732 - Osteoarthritis, Alcohol dependence

# 8685 -  Osteoarthritis, Peripheral nerve entrapment syndrome, Alcoholism

In [None]:
# 32662

# The following lines are the random patient in question and their co-morbidities followed by the most similar patient and their co-morbidities retrieved by our method, Rocheteau's method and the One hot method respectively.

# 32662 - Asthma, Hypertensive disorder, Osteoarthritis, Gastroesophageal reflux disease, Anemia, Diabetes mellitus type 2, Hypercholesterolemia, Hypothyroidism

# 31758 - Asthma, Hypertensive disorder, Osteoarthritis, Gastroesophageal reflux disease, Anemia, Diabetes mellitus, Hyperlipidemia, Obstructive sleep apnea syndrome

# 53442 - Asthma, Hypertensive disorder, Rheumatoid arthritis,  Diabetes, Hypercholesterolemia, Anemia, Gastroesophageal reflux disease, Hypothyroidism, Coronary arteriosclerosis, Pulmonary embolism, Chronic kidney disease

# 341 - Asthma, Hypertensive disorder, Osteoarthritis, Hypercholesterolemia, Diabetes mellitus type 2

In [None]:
# My method - mean 0.01 s per patient - 10933.33 times faster or 10^4 fold faster
# Rocheteau method - mean 103.33 s per patient
# One hit method - mean 109.33 s per patient