# Patient Similarity

In this notebook we will explore using vectors to represent patients based on their ICD9 codes and then use vector operations to compute similarity between patients. The vectors will typically be sparse so we will explore using 
dictionaries to represent sparse vectors.

In [34]:

import pymysql
import pandas as pd
import getpass
import pandas as pd
import seaborn as sns
import numpy as np
from numpy.linalg import norm
from collections import defaultdict
import itertools

In [37]:
#import sys
#sys.path.append("..")
#from myla.becvector import *
import myla
from myla.becvector import *

In [3]:
conn = pymysql.connect(host="mysql",
                       port=3306,user="jovyan",
                       passwd=getpass.getpass("Enter MySQL passwd for jovyan"),db='mimic2')
cursor = conn.cursor()

Enter MySQL passwd for jovyan········


In [4]:
pd.read_sql('SELECT * from icd9',conn).head()


Unnamed: 0,subject_id,hadm_id,sequence,code,description
0,56,28766,1,198.3,SECONDARY MALIGNANT NEOPLASM OF BRAIN AND SPIN...
1,56,28766,2,162.8,MALIGNANT NEOPLASM OF OTHER PARTS OF BRONCHUS ...
2,56,28766,3,531.4,CHRONIC OR UNSPECIFIED GASTRIC ULCER WITH HEMO...
3,56,28766,4,276.1,HYPOSMOLALITY AND/OR HYPONATREMIA
4,56,28766,5,428.0,CONGESTIVE HEART FAILURE UNSPECIFIED


In [5]:
icd9_codes = pd.read_sql('SELECT subject_id, code, description from icd9',conn)
icd9_codes.head()

Unnamed: 0,subject_id,code,description
0,56,198.3,SECONDARY MALIGNANT NEOPLASM OF BRAIN AND SPIN...
1,56,162.8,MALIGNANT NEOPLASM OF OTHER PARTS OF BRONCHUS ...
2,56,531.4,CHRONIC OR UNSPECIFIED GASTRIC ULCER WITH HEMO...
3,56,276.1,HYPOSMOLALITY AND/OR HYPONATREMIA
4,56,428.0,CONGESTIVE HEART FAILURE UNSPECIFIED


### We need to ...

1. get the unique ICD9 codes
2. Create a vocabulary that maps a code to a dimension in our vector space.
3. Create a map from the code to the description to make things more human friendly

In [10]:
icd9_codes.shape

(53486, 3)

In [8]:
voc_code = icd9_codes.code.unique()
voc_code.sort()

In [11]:
len(voc_code)

2719

In [12]:
code_map = dict(zip(icd9_codes.code, icd9_codes.description))
len(code_map)

2719

In [13]:
code_map

{'198.3': 'SECONDARY MALIGNANT NEOPLASM OF BRAIN AND SPINAL C',
 '162.8': 'MALIGNANT NEOPLASM OF OTHER PARTS OF BRONCHUS OR L',
 '531.40': 'CHR STOMACH ULC W HEM                   \r',
 '276.1': 'HYPOSMOLALITY AND/OR HYPONATREMIA',
 '428.0': 'CONGESTIVE HEART FAILURE UNSPECIFIED',
 '780.39': 'OTHER CONVULSIONS',
 '272.0': 'PURE HYPERCHOLESTEROLEMIA',
 '401.9': 'UNSPECIFIED ESSENTIAL HYPERTENSION',
 '410.71': 'SUBENDOCARDIAL INFARCTION INITIAL EPISODE OF CARE',
 '496': 'CHRONIC AIRWAY OBSTRUCTION NOT ELSEWHERE CLASSIFIE',
 '535.51': 'UNSPECIFIED GASTRITIS AND GASTRODUODENITIS WITH HE',
 '285.1': 'ACUTE POSTHEMORRHAGIC ANEMIA',
 '486': 'PNEUMONIA ORGANISM UNSPECIFIED',
 '414.01': 'CORONARY ATHEROSCLEROSIS OF NATIVE CORONARY ARTERY',
 '250.00': 'DIABETES MELLITUS WITHOUT COMPLICATION TYPE II OR ',
 '427.89': 'OTHER SPECIFIED CARDIAC DYSRHYTHMIAS',
 '965.00': 'POISONING BY OPIUM (ALKALOIDS) UNSPECIFIED',
 '042': 'HUMAN IMMUNODEFICIENCY VIRUS (HIV) DISEASE',
 '070.54': 'CHRONIC HEPATITIS C 

In [14]:
voc_map = dict((voc_code[i], i) for i in range(len(voc_code)))
dim = len(voc_map)

In [16]:
dim
voc_map

{'005.81': 0,
 '007.4': 1,
 '008.45': 2,
 '008.61': 3,
 '008.69': 4,
 '008.8': 5,
 '009.0': 6,
 '009.1': 7,
 '009.3': 8,
 '015.04': 9,
 '018.03': 10,
 '027.0': 11,
 '027.2': 12,
 '031.2': 13,
 '031.9': 14,
 '032.85': 15,
 '035': 16,
 '038.0': 17,
 '038.10': 18,
 '038.11': 19,
 '038.19': 20,
 '038.2': 21,
 '038.3': 22,
 '038.40': 23,
 '038.41': 24,
 '038.42': 25,
 '038.43': 26,
 '038.44': 27,
 '038.49': 28,
 '038.8': 29,
 '038.9': 30,
 '039.1': 31,
 '040.0': 32,
 '041.00': 33,
 '041.01': 34,
 '041.02': 35,
 '041.04': 36,
 '041.09': 37,
 '041.10': 38,
 '041.11': 39,
 '041.19': 40,
 '041.2': 41,
 '041.3': 42,
 '041.4': 43,
 '041.5': 44,
 '041.6': 45,
 '041.7': 46,
 '041.82': 47,
 '041.83': 48,
 '041.84': 49,
 '041.85': 50,
 '041.86': 51,
 '041.89': 52,
 '041.9': 53,
 '042': 54,
 '047.8': 55,
 '047.9': 56,
 '049.8': 57,
 '049.9': 58,
 '052.0': 59,
 '052.1': 60,
 '052.7': 61,
 '053.12': 62,
 '053.19': 63,
 '053.29': 64,
 '053.79': 65,
 '053.9': 66,
 '054.10': 67,
 '054.2': 68,
 '054.3': 69,

### Get a List of ICD9 codes for each patient

In [19]:
demo = defaultdict(list)
demo["gaston"].append("quintana")
demo.

defaultdict(list, {'gaston': ['quintana']})

In [17]:
patients = defaultdict(list)
for _,row in icd9_codes.iterrows():
    patients[row["subject_id"]].append(row["code"])

In [18]:
patients

defaultdict(list,
            {56: ['198.3',
              '162.8',
              '531.40',
              '276.1',
              '428.0',
              '780.39',
              '272.0',
              '401.9'],
             37: ['410.71',
              '428.0',
              '496',
              '535.51',
              '285.1',
              '486',
              '414.01',
              '250.00',
              '427.89'],
             78: ['965.00',
              '042',
              '070.54',
              '571.5',
              '304.01',
              '284.8',
              '969.4',
              'E850.2',
              'E853.2'],
             26: ['996.04',
              '427.1',
              '428.0',
              '427.31',
              '414.01',
              '412',
              '593.9',
              '272.0',
              '600.00'],
             67: ['431', '244.9', '401.9', '294.8', '311'],
             12: ['157.0',
              '574.10',
              '997.1',
              '

In [20]:
max([len(patients[k]) for k in patients]), np.mean([len(patients[k]) for k in patients])

(308, 13.53733232093141)

In [44]:
def patient2vec(p, vmap):
    """
    takes a patient p and a vocabulary vmap and returns a vector representation of p
    """
    pv = zero(len(vmap))
    for code in p:
        pv[vmap[code]] +=1
    #a = defaultdict(list)
    #a[p]=vmap
    return pv

In [53]:
v56 = patient2vec(patients[56], voc_map)
sum([1 for i in v56 if i != 0])

8

In [50]:
norm(patient2vec(patients[56], voc_map))

2.8284271247461903

In [None]:
patient_vectors = {p:patient2vec(patients[p], voc_map) for p in patients}

similarities = {}

for p1, v1 in patient_vectors.items():
    for p2,v2 in patients_vectors.items():
        similarities[(p1,p2)]= cos_sim(v1,v2)

### Cosine Similarity
One of the simplest ways of comparing two texts is with the [cosine similarity measure](https://en.wikipedia.org/wiki/Cosine_similarity). The sentences with the smallest angle between them are the most similar.

![angle between two vectors](https://upload.wikimedia.org/wikipedia/commons/thumb/3/3e/Dot_Product.svg/200px-Dot_Product.svg.png)

---------------

$$\cos{\theta} = \frac{\vec{A}\cdot\vec{B}}{{\left|\left|\vec{A}\right|\right|}{\left|\left|\vec{B}\right|\right|}}$$
    

In [22]:
def cos_sim(v1, v2):
    """
    computes the cosine similarity between two vectors v1 and v2
    Requires a dot product function and a norm function
    """
    return dot(v1,v2) / (norm(v1)*norm(v2))

In [25]:
cos_sim([1,0,1],[0,1,0])

0.0