In [87]:
import requests
import pandas as pd
import json
import random   
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re, math
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from pandas.io.json import json_normalize
from nltk.stem.snowball import SnowballStemmer

In [340]:
# Download brief study summaries (limit=100, offset=0) and load them into a DataFrame.
url1 = "http://ec2-54-88-151-77.compute-1.amazonaws.com:3003/v1/brief-summaries?limit=100&offset=0"
# requests applies no timeout by default; without one a dead endpoint blocks the kernel forever.
resp1 = requests.get(url1, timeout=60)
# Surface HTTP 4xx/5xx here instead of as a confusing JSON/KeyError further down.
resp1.raise_for_status()
conn1 = resp1.json()
file1 = conn1['briefSummaries']
summary =  pd.DataFrame.from_dict(file1, orient='columns')

In [353]:
# Download patient diagnoses (limit=20000, offset=0) and load them into a DataFrame.
url1 = "http://ec2-54-88-151-77.compute-1.amazonaws.com:3001/v1/diagnoses?limit=20000&offset=0"
# requests applies no timeout by default; without one a dead endpoint blocks the kernel forever.
resp1 = requests.get(url1, timeout=60)
# Surface HTTP 4xx/5xx here instead of as a confusing JSON/KeyError further down.
resp1.raise_for_status()
conn1 = resp1.json()
file1 = conn1['diagnoses']
subjects =  pd.DataFrame.from_dict(file1, orient='columns')

In [339]:
# Download the ICD diagnosis dictionary (limit=10000, offset=0) and load it into a DataFrame.
url1 = "http://ec2-54-88-151-77.compute-1.amazonaws.com:3001/v1/d-icd-diagnoses?limit=10000&offset=0"
# requests applies no timeout by default; without one a dead endpoint blocks the kernel forever.
resp1 = requests.get(url1, timeout=60)
# Surface HTTP 4xx/5xx here instead of as a confusing JSON/KeyError further down.
resp1.raise_for_status()
conn1 = resp1.json()
file1 = conn1['dIcdDiagnoses']
icd =  pd.DataFrame.from_dict(file1, orient='columns')

In [341]:
# Discard every row that has at least one missing value, in both frames.
# The index is not reset: a later cell looks up a summary row by its
# original label (.loc[5]).
icd, summary = icd.dropna(), summary.dropna()

In [342]:
# Shared NLP resources consumed by transform():
#   lemmatizer - WordNet lemmatizer applied to each token
#   stop       - NLTK English stop-word list used to filter tokens
# NOTE: both rely on the NLTK 'wordnet' / 'stopwords' corpora being available locally.
lemmatizer = WordNetLemmatizer()
stop = stopwords.words("english")

In [343]:
# Cosine-similarity helpers.
# WORD is the tokenizer pattern used by text_to_vector(): each match is a
# maximal run of word characters (\w+).
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping of term -> numeric weight (for example a ``Counter``).
        vec2: Mapping of term -> numeric weight.

    Returns:
        float: dot(vec1, vec2) / (|vec1| * |vec2|), or ``0.0`` when the
        product of the two magnitudes is zero (e.g. an empty vector).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squares1 = sum(weight ** 2 for weight in vec1.values())
    squares2 = sum(weight ** 2 for weight in vec2.values())
    magnitude = math.sqrt(squares1) * math.sqrt(squares2)

    # Guard against division by zero for empty / all-zero vectors.
    if not magnitude:
        return 0.0
    return float(dot_product) / magnitude

def text_to_vector(text):
    """Count the word tokens found in ``text``.

    Args:
        text (str): Text to tokenise with the module-level ``WORD`` pattern.

    Returns:
        collections.Counter: token -> number of occurrences in ``text``.
    """
    return Counter(WORD.findall(text))

In [344]:
def transform(text):
    """Normalise free text and return its bag-of-words term counts.

    Steps, in order: split on whitespace, drop tokens found in the
    module-level ``stop`` list, lemmatise each remaining token with the
    module-level ``lemmatizer``, replace punctuation with spaces, delete
    the articles / "and" matched by the article pattern, delete the word
    "patient", lower-case, and count tokens with ``text_to_vector``.

    Args:
        text (str): Raw text to vectorise.

    Returns:
        collections.Counter: token -> count, as built by ``text_to_vector``.
    """
    article_re = re.compile(r'\ban\b|\bAn\b|\bA\b|\bthe\b|\bThe\b|\band\b|\ba\b')
    patient_re = re.compile(r'\bpatient\b')

    # Build a new list instead of calling list.remove() on the list being
    # iterated: removing during iteration skips the element that follows
    # every removed one, so consecutive stop words were left in the text.
    # The comparison stays case-sensitive, as before.
    stop_words = set(stop)
    tokens = [word for word in text.split() if word not in stop_words]

    lemmas = [lemmatizer.lemmatize(word) for word in tokens]
    text_new = ' '.join(lemmas)

    # No separate \n / \t / \r stripping is needed: str.split() above has
    # already discarded all whitespace from the tokens.
    text_new = re.sub(r'[^\w\s]', ' ', text_new)  # punctuation -> space
    text_new = article_re.sub('', text_new)  # remove articles / "and"
    text_new = patient_re.sub('', text_new)  # remove "patient"

    return text_to_vector(text_new.lower())

In [326]:
#summary = summary.drop(['vec'],axis =1)
#icd = icd.drop(['vector','cosine_9'],axis =1)

In [345]:
# Turn the free-text column of each frame into a bag-of-words Counter.
# Series.map calls transform() once per value and always yields a Series
# aligned to the frame's index. Row-wise DataFrame.apply(axis=1) builds a
# Series object per row and, on an empty frame (possible after dropna()),
# does not return a Series, which breaks the column assignment.
summary['vec'] = summary['description'].map(transform)
icd['vector'] = icd['long_title'].map(transform)

### Study selected for match

In [349]:
summary.loc[5, 'description']

"\n      Efficacy Study of Gene Therapy for The Treatment of Acute Leber's Hereditary Optic Neuropathy\n      (LHON) onset within three months\n    "

In [350]:
# Use the summary row labelled 5 (the study description displayed above) as
# the prototype and score every ICD title against it.
# The study vector is looked up once rather than once per ICD row.
study_vec = summary.loc[5, 'vec']
icd['cosine_5'] = icd['vector'].map(lambda vec: get_cosine(vec, study_vec))


In [366]:
# Keep every ICD row tied for the highest cosine score against the study.
# NOTE(review): when no ICD title shares a token with the study the maximum
# is 0.0, so every zero-scoring row is selected - confirm that is intended.
top_score = icd['cosine_5'].max()
selected = icd.loc[icd['cosine_5'] == top_score]
selected

Unnamed: 0,icd9_code,long_title,row_id,short_title,vector,cosine_9,cosine_5
4125,3560,Hereditary peripheral neuropathy,3820,Hered periph neuropathy,"{'hereditary': 1, 'peripheral': 1, 'neuropathy...",0.0,0.288675
4127,3562,Hereditary sensory neuropathy,3822,Hered sensory neuropathy,"{'hereditary': 1, 'sensory': 1, 'neuropathy': 1}",0.0,0.288675
4814,37716,Hereditary optic atrophy,4811,Hereditary optic atrophy,"{'hereditary': 1, 'optic': 1, 'atrophy': 1}",0.0,0.288675
4822,37733,Nutritional optic neuropathy,4819,Nutrition optc neuropthy,"{'nutritional': 1, 'optic': 1, 'neuropathy': 1}",0.0,0.288675
4823,37734,Toxic optic neuropathy,4820,Toxic optic neuropathy,"{'toxic': 1, 'optic': 1, 'neuropathy': 1}",0.0,0.288675
4825,37741,Ischemic optic neuropathy,4822,Ischemic optic neuropthy,"{'ischemic': 1, 'optic': 1, 'neuropathy': 1}",0.0,0.288675


In [365]:
# Inner-join the best-matching ICD rows to the diagnoses table on icd9_code,
# keeping only diagnoses whose code is among the selected matches.
df = selected.merge(subjects, on=['icd9_code'], how='inner')

## Subjects Suitable for the selected Study

In [361]:
df['subject_id']

0    690
Name: subject_id, dtype: int64