# Phenotype-based Machine Learning Approach for Disease Prediction

In [1]:
import mysql.connector
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import math
import os
import sys
import logging
mf_module_path = os.path.abspath(os.path.join('../python'))
if mf_module_path not in sys.path:
    sys.path.append(mf_module_path)
import mf
import mf_random
import hpoutil
import networkx
import obonet
import pickle
from tqdm import tqdm
import time

In [2]:
# Grab the root logger (no name given) and lower its threshold to DEBUG so that
# debug-level records are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Emit one debug record through the module-level helper as a smoke test.
logging.debug("test")

2019-10-30 12:23:57,912 - 18743 - root - DEBUG - test


In [3]:
# Settings for the MySQL database queried throughout this notebook.
# NOTE(review): the credentials are hard-coded; consider keeping them outside the notebook.
connection_settings = {
    'host': 'localhost',
    'user': 'mimicuser',
    'passwd': 'mimic',
    'database': 'mimiciiiv13',
    'auth_plugin': 'mysql_native_password',
}
mydb = mysql.connector.connect(**connection_settings)
# Buffered cursor on the same connection, for statements issued outside pandas.
cursor = mydb.cursor(buffered=True)

### Select lab phenotypes to predict radiology findings

In [4]:
def labs_to_rad(labHpos, textHpo):
    """
    Fetch the raw profile rows for a set of lab-derived HPO terms and one text-derived HPO term.

    labHpos: iterable of HPO term ids; rows of JAX_labHpoProfile whose MAP_TO is one of them are selected.
             An empty iterable yields an empty result instead of a SQL syntax error.
    textHpo: a single HPO term id; rows of JAX_textHpoProfile whose MAP_TO equals it are selected.

    Returns a tuple (labHpo_values_flat, textHpo_value) of two DataFrames holding all columns
    of the selected rows. No pivoting is done here; callers reshape the frames themselves.
    """
    labHpos = list(labHpos)
    # Bind the term ids as query parameters rather than formatting them into the SQL text,
    # so quoting is handled by the driver. `IN (NULL)` matches no row, which keeps the
    # statement valid when no lab terms are supplied.
    placeholders = ", ".join(["%s"] * len(labHpos)) or "NULL"
    lab_query = "SELECT * from JAX_labHpoProfile WHERE MAP_TO IN ({})".format(placeholders)
    labHpo_values_flat = pd.read_sql_query(lab_query, mydb, params=labHpos or None)

    text_query = "SELECT * FROM JAX_textHpoProfile WHERE MAP_TO = %s"
    textHpo_value = pd.read_sql_query(text_query, mydb, params=[textHpo])

    return labHpo_values_flat, textHpo_value

In [23]:
# Lab-derived HPO terms used as predictors.
labHpos = [
    'HP:0020062',
    'HP:0011015',
    'HP:0004363',
    'HP:0500165',
]
# Text-derived HPO term to predict.
textHpo = 'HP:0002202'
labHpo_values_flat, textHpo_value = labs_to_rad(labHpos=labHpos, textHpo=textHpo)

In [27]:
# Columns that identify one encounter; used for every merge and index below.
encounter_keys = ['SUBJECT_ID', 'HADM_ID']
encounters = pd.read_sql_query("SELECT DISTINCT SUBJECT_ID, HADM_ID FROM ADMISSIONS", mydb)

# Wide feature matrix: one row per encounter, one column per lab term (MAP_TO),
# cells taken from OCCURRANCE. pivot_table applies its default aggregation (mean)
# when an encounter has several rows for the same term.
feature_matrix = (
    encounters
    .merge(labHpo_values_flat, how='left', on=encounter_keys)
    .pivot_table(columns='MAP_TO', values='OCCURRANCE', index=encounter_keys)
)

# Left merge keeps every encounter; MAP_TO is NaN where no text-profile row matched.
target_vector = (
    encounters
    .merge(textHpo_value, how='left', on=encounter_keys)
    .set_index(encounter_keys)
)

# Attach the target's MAP_TO column to the features and name it after the target term id.
feature_target = (
    feature_matrix
    .merge(target_vector.loc[:, 'MAP_TO'], left_index=True, right_index=True)
    .rename(columns={'MAP_TO': textHpo})
)
feature_target.to_csv('../../../data/predict_textHpo_from_labHpo/Pleural_effusion.csv')

In [28]:
# Lab-derived HPO terms used as predictors.
labHpos = [
    'HP:0012614',
    'HP:0100529',
    'HP:0011893',
    'HP:0012085',
    'HP:0004364',
    'HP:0020061',
    'HP:0003111',
    'HP:0002901',
    'HP:0040088',
    'HP:0004360',
    'HP:0010876',
]
# Text-derived HPO term to predict.
textHpo = 'HP:0002090'
labHpo_values_flat, textHpo_value = labs_to_rad(labHpos=labHpos, textHpo=textHpo)

In [30]:
# Columns that identify one encounter; used for every merge and index below.
encounter_keys = ['SUBJECT_ID', 'HADM_ID']
encounters = pd.read_sql_query("SELECT DISTINCT SUBJECT_ID, HADM_ID FROM ADMISSIONS", mydb)

# Wide feature matrix: one row per encounter, one column per lab term (MAP_TO),
# cells taken from OCCURRANCE. pivot_table applies its default aggregation (mean)
# when an encounter has several rows for the same term.
feature_matrix = (
    encounters
    .merge(labHpo_values_flat, how='left', on=encounter_keys)
    .pivot_table(columns='MAP_TO', values='OCCURRANCE', index=encounter_keys)
)

# Left merge keeps every encounter; MAP_TO is NaN where no text-profile row matched.
target_vector = (
    encounters
    .merge(textHpo_value, how='left', on=encounter_keys)
    .set_index(encounter_keys)
)

# Attach the target's MAP_TO column to the features and name it after the target term id.
feature_target = (
    feature_matrix
    .merge(target_vector.loc[:, 'MAP_TO'], left_index=True, right_index=True)
    .rename(columns={'MAP_TO': textHpo})
)
feature_target.to_csv('../../../data/predict_textHpo_from_labHpo/Pneumonia.csv')

## Task 1: Phenome-wide association studies for diseases

## Data selection
We adopt an approach similar to that of the loinc2hpo paper. At the patient level, we determine whether each patient has a given diagnosis code and whether they have a given phenotype, and then perform logistic regression to test whether the phenotype is associated with the disease, controlling for gender and age. 

We will create three separate tables:

    * diagnosis table (first diagnosis): subject_id, ICD9_code_value
    * phenotype table (phenotypes before diagnosis): subject_id, HP terms
    * patient table: subject_id, sex, age at diagnosis
    
    
We will create the tables gradually. As a first step, we use a simplified version: ignore dates, just find the diagnoses and phenotypes for each patient, and test the whole process end to end.

In [9]:
def patient_vectors():
    """
    Load SUBJECT_ID, GENDER and DOB from PATIENTS for every patient whose SUBJECT_ID
    appears in JAX_encounterOfInterest.

    Returns the query result as a DataFrame.
    """
    # read_sql_query needs the connection as its second argument; without it the
    # call raises TypeError before any query is sent.
    return pd.read_sql_query('''
        SELECT 
            PATIENTS.SUBJECT_ID, PATIENTS.GENDER, PATIENTS.DOB
        FROM 
            PATIENTS
        WHERE 
            SUBJECT_ID IN (SELECT SUBJECT_ID FROM JAX_encounterOfInterest) 
    ''', mydb)

def diagnosis_vectors(icd):
    """
    Load SUBJECT_ID, HADM_ID and DIAGNOSIS for every row of JAX_mf_diag.

    icd: accepted for interface compatibility; the query does not use it.
         NOTE(review): the earlier docstring read "same as createDiagnosisTable()";
         confirm whether the rows were meant to be filtered by this code.

    Returns the query result as a DataFrame.
    """
    # Return the frame; previously the result was computed and then discarded.
    return pd.read_sql_query('''
        SELECT
            SUBJECT_ID, HADM_ID, DIAGNOSIS
        FROM
            JAX_mf_diag
    ''', mydb)

def phenotype_vectors():
    """
    Load the (SUBJECT_ID, MAP_TO) groups of JAX_labHpoProfile that contain more than
    three rows, together with the `dummy` column.

    Returns the query result as a DataFrame.
    """
    # The table name uses the same casing as the other queries in this notebook;
    # MySQL table names can be case-sensitive depending on the server's
    # lower_case_table_names setting.
    # NOTE(review): `dummy` is selected without being aggregated or grouped; confirm
    # the column exists and that the server's SQL mode accepts this.
    # Return the frame; previously the result was computed and then discarded.
    return pd.read_sql_query('''
        SELECT SUBJECT_ID, MAP_TO, dummy
        FROM JAX_labHpoProfile
        GROUP BY SUBJECT_ID, MAP_TO
        HAVING COUNT(*) > 3
    ''', mydb)

In [18]:
# NOTE(review): initTables is not defined in the visible part of this notebook;
# presumably it prepares the JAX_* tables queried below — confirm.
initTables(debug=True)
# Run the patient query defined earlier in the notebook.
patient_vectors()



0