In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import yaml
import re

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from functools import reduce

from fiber.cohort import Cohort
from fiber.condition import Procedure, Diagnosis, Drug, VitalSign, Patient, LabValue, Encounter, MRNs
from fiber.database.hana import engine, Session

from fiber.database.table import fact
from fiber.database import print_sqla
from fiber.storage import yaml as fiberyaml

#from fiber import VERBOSE 
import random
import matplotlib.ticker as ticker

In [None]:
#pip install pyarrow


In [None]:
# Notebook-level flag. No other cell visible in this notebook reads it.
# NOTE(review): the `from fiber import VERBOSE` import is commented out above, so this
# assignment presumably does not change fiber's own verbosity — confirm or remove.
VERBOSE = 1

In [None]:
# Load the precomputed hypertension-phenotype table. Later cells read its
# `medical_record_number`, `age_in_days` and `hypertensive_onset_0_after_180_after` columns.
# NOTE(review): absolute, user-specific path — the notebook only runs where this file exists.
hype_occurs = pd.read_parquet('/home/dattas03/hpi_s4h/cohort/hypertension_phenotype_onsets.parquet', engine='pyarrow')


In [None]:
# Display the loaded table for inspection.
hype_occurs

In [None]:
# Keep only the rows whose `hypertensive_onset_0_after_180_after` flag is True.
hype_occurs = hype_occurs.loc[
    hype_occurs['hypertensive_onset_0_after_180_after'].eq(True)
]

# Smallest `age_in_days` per patient among the remaining rows, as a one-column frame
# indexed by `medical_record_number`.
MRNs_BP_Onsets = (
    hype_occurs
    .groupby(['medical_record_number'])['age_in_days']
    .min()
    .to_frame()
)

In [None]:
# Distinct MRNs that have at least one row with the onset flag set to True.
MRNs_BP_Condition = set(
    hype_occurs.loc[
        hype_occurs['hypertensive_onset_0_after_180_after'].eq(True),
        'medical_record_number',
    ]
)

In [None]:
# Query the available Diagnosis condition names. The result is neither assigned nor the
# last expression of the cell, so nothing is shown or kept.
fiberyaml.get_available_conditions(Diagnosis)

# Look up the 'hypertension uncomplicated' diagnosis definition, restricted to the
# ICD-10 and ICD-9 coding schemes.
condition = fiberyaml.get_condition(Diagnosis,'hypertension uncomplicated', coding_schemes = ['ICD-10', 'ICD-9'])

In [None]:
# MRNs of the cohort defined by the diagnosis condition (presumably every patient with a
# matching ICD code — confirm against fiber's Cohort.mrns()).
MRNS_ICD = Cohort(condition).mrns()

# Smallest `age_in_days` per patient over all occurrences of the condition.
# NOTE(review): get_occurances_icdcodes is defined in a LATER cell of this notebook, so on a
# fresh kernel this line raises NameError unless that cell has been executed first.
MRNs_ICD_Onsets = get_occurances_icdcodes(MRNS_ICD, condition).groupby(['medical_record_number'])['age_in_days'].min().to_frame()
#MRNs_ICD_Onsets.to_parquet('/data2/projects/hpi_s4h/cohort/MRNs_ICD_Onsets.parquet')


In [None]:
# Load the cached ICD onset table, replacing any value computed in the previous cell.
# NOTE(review): this reads from /home/dattas03/... while the commented-out save in the
# previous cell targets /data2/projects/... — confirm both refer to the same file.
MRNs_ICD_Onsets = pd.read_parquet('/home/dattas03/hpi_s4h/cohort/MRNs_ICD_Onsets.parquet', engine='pyarrow')


In [None]:
# Rebind MRNS_ICD to the onset table (same object, no copy); this replaces the MRN
# collection that the earlier Cohort(condition).mrns() cell stored under the same name.
MRNS_ICD = MRNs_ICD_Onsets

In [None]:
# Turn the `medical_record_number` index level into a regular column.
# Derived from MRNs_ICD_Onsets (which the previous cell aliased to MRNS_ICD) instead of from
# MRNS_ICD itself, so re-running this cell does not fail once the level has been reset.
MRNS_ICD = MRNs_ICD_Onsets.reset_index(['medical_record_number'])


In [None]:
# Display the ICD onset table for inspection.
MRNS_ICD

In [None]:
def get_occurances_icdcodes(mrns, condition, batch_size=10000):
    """Collect the occurrences of ``condition`` for the given patients, in batches.

    The MRNs are split into consecutive chunks of ``batch_size``. For each chunk a
    ``Cohort`` restricted to those MRNs is built and ``get_occurrences(condition)``
    is called on it; the per-chunk results are concatenated with ``pd.concat``.

    Parameters
    ----------
    mrns : iterable
        Medical record numbers to query. Any iterable is accepted; it is
        materialised into a list before batching.
    condition
        Condition object passed unchanged to ``Cohort.get_occurrences``.
    batch_size : int, default 10000
        Maximum number of MRNs per query. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-batch results, or an empty DataFrame when
        ``mrns`` is empty (``pd.concat`` would raise on an empty list).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    mrn_list = list(mrns)
    batch_results = []
    for start in range(0, len(mrn_list), batch_size):
        # Restrict the cohort to the current chunk so each query stays bounded in size.
        batch_cohort = Cohort(MRNs(mrn_list[start:start + batch_size]))
        batch_results.append(batch_cohort.get_occurrences(condition))

    if not batch_results:
        return pd.DataFrame()
    return pd.concat(batch_results)

In [None]:
# Distinct medication names from the tab-separated file's MATERIAL_NAME column.
# `sep` is passed by keyword: every read_csv argument after the path is keyword-only
# in pandas >= 2.0, so the positional form raises TypeError there.
materials = (
    pd.read_csv('/home/dattas03/hpi_s4h/bp_meds.tsv', sep='\t')['MATERIAL_NAME']
    .drop_duplicates()
)

materials.shape

In [None]:
# Write the de-duplicated medication names to BP_Meds.csv (path is relative to the
# kernel's current working directory).
materials.to_csv('BP_Meds.csv')

In [None]:
def get_med_mrns(materials, batch_size=10):
    """Fetch drug occurrences for a collection of material names, in batches.

    The names are processed in consecutive chunks of ``batch_size``. For each chunk
    the individual ``Drug`` conditions are OR-ed together, a ``Cohort`` is built from
    the combined condition, and ``get_occurrences`` is called for that same condition.
    The per-chunk results are concatenated with ``pd.concat``.

    Parameters
    ----------
    materials : iterable of str
        Material names (list, pandas Series, ...). The input is materialised into a
        list first, so chunking is always positional regardless of a Series' index.
    batch_size : int, default 10
        Maximum number of material names combined into one query. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-batch results, or an empty DataFrame when
        ``materials`` is empty (``pd.concat`` would raise on an empty list).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    material_names = list(materials)
    batch_results = []
    for start in range(0, len(material_names), batch_size):
        batch_names = material_names[start:start + batch_size]
        # OR all drugs of this chunk into a single condition.
        drug_cond = reduce(
            Drug.__or__,
            [Drug(material_name) for material_name in batch_names]
        )
        batch_results.append(Cohort(drug_cond).get_occurrences(drug_cond))
        # Progress indicator: number of batches completed so far.
        print(len(batch_results))

    if not batch_results:
        return pd.DataFrame()
    return pd.concat(batch_results)

In [None]:
# Query the drug occurrences for every medication name.
# NOTE(review): no later cell visible in this notebook reads MRN_MEDS; the following cells
# load 'hypertension_medication_mrns.parquet' instead. The commented lines below look like
# the code that once produced that file — confirm before relying on it.
MRN_MEDS = get_med_mrns(materials)
#df_mrns_meds = pd.DataFrame(list(MRN_MEDS))
#df_mrns_meds.columns = ['MRNs']
#df_mrns_meds.to_parquet('/data2/projects/hpi_s4h/cohort/hypertension_medication_mrns.parquet')

In [None]:
# Load the cached table of patients on blood-pressure medication; a later cell reads its
# `MRNs` column.
# NOTE(review): absolute, user-specific path.
hype_med_mrns_df = pd.read_parquet('/home/dattas03/hpi_s4h/cohort/hypertension_medication_mrns.parquet', engine='pyarrow')



In [None]:
# Display the medication table for inspection.
hype_med_mrns_df

In [None]:
# Distinct values of the `MRNs` column, as a set for the overlap computations below.
hype_med_mrns = set(hype_med_mrns_df['MRNs'])

In [None]:
# Distinct MRNs that have an ICD-based onset, as a set for the overlap computations below.
# Built from MRNs_ICD_Onsets' `medical_record_number` index level rather than from MRNS_ICD
# itself: the name MRNS_ICD is rebound from a DataFrame to a set here, so the
# self-referential form fails when the cell is executed a second time.
MRNS_ICD = set(MRNs_ICD_Onsets.index.get_level_values('medical_record_number'))

In [None]:
# Number of distinct MRNs in the medication set.
len(hype_med_mrns)

In [None]:
# Size of MRNS_ICD (the number of distinct MRNs once it has been converted to a set).
len(MRNS_ICD)

In [None]:
# Number of distinct MRNs in the onset-flag set.
len(MRNs_BP_Condition)

In [None]:
#pip install matplotlib-venn

In [None]:
##!pip install matplotlib-venn
from matplotlib_venn import venn3

# Three-way Venn diagram of the medication, ICD and onset-flag MRN sets, drawn on a new
# figure that is kept in `newvenn` so a later cell can save it.
newvenn = plt.figure(figsize=(20, 10))
venn3(
    [hype_med_mrns, MRNS_ICD, MRNs_BP_Condition],
    set_labels=('BPMeds', 'ICD', 'BPVitals'),
)

In [None]:
# Save the Venn figure as a PDF in the kernel's current working directory.
newvenn.savefig('Venn_HT_Cohort.pdf', dpi=300)

In [None]:
# Case cohort: MRNs that appear in at least two of the three sets (the union of the three
# pairwise intersections).
hype_mrns = (
    (hype_med_mrns & MRNS_ICD)
    | (MRNS_ICD & MRNs_BP_Condition)
    | (hype_med_mrns & MRNs_BP_Condition)
)

print(len(hype_mrns))

In [None]:
def get_occurances(mrns, batch_size=10000):
    """Count encounter rows per patient for the given MRNs, in batches.

    The MRNs are split into consecutive chunks of ``batch_size``. For each chunk a
    ``Cohort`` restricted to those MRNs is built, ``cohort.get(Encounter())`` is
    fetched, and the rows are counted per ``medical_record_number``. The per-chunk
    counts are concatenated with ``pd.concat``.

    Parameters
    ----------
    mrns : iterable
        Medical record numbers to query. Any iterable is accepted; it is
        materialised into a list before batching.
    batch_size : int, default 10000
        Maximum number of MRNs per query. Must be at least 1.

    Returns
    -------
    pandas.Series
        Number of encounter rows indexed by ``medical_record_number``; an empty
        integer Series when ``mrns`` is empty (``pd.concat`` would raise on an
        empty list).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    mrn_list = list(mrns)
    batch_counts = []
    for start in range(0, len(mrn_list), batch_size):
        # Restrict the cohort to the current chunk so each query stays bounded in size.
        batch_cohort = Cohort(MRNs(mrn_list[start:start + batch_size]))
        encounters = batch_cohort.get(Encounter())
        batch_counts.append(encounters.groupby('medical_record_number').size())

    if not batch_counts:
        return pd.Series(dtype="int64")
    return pd.concat(batch_counts)

In [None]:
#df_hypemrns_encounters = get_occurances(hype_mrns)
#df_hypemrns_encounters.columns = ['Encounters']
#print(df_hypemrns_encounters.shape)
#df_hypemrns_encounters.to_parquet('/data2/projects/hpi_s4h/cohort/hype_encounter.parquet')

In [None]:
# Load the cached encounter table for the case cohort (presumably the per-patient counts
# written by the commented-out cell above — confirm).
# The full table is kept: the summary statistics printed in the next cell are labelled as
# describing the case cohort, so they must not be computed from a 5-row preview.
df_hypemrns_encounters = pd.read_parquet('/home/dattas03/hpi_s4h/cohort/hype_encounter.parquet', engine='pyarrow')



In [None]:
# Summary statistics of the case cohort's encounter table, one line per statistic.
for stat_name, stat_fn in (
    ("mean", np.mean),
    ("median", np.median),
    ("min", np.min),
    ("max", np.max),
):
    print(f"the {stat_name} of the encounters for the case are: {stat_fn(df_hypemrns_encounters)!s}")

In [None]:
# MRNs of the cohort built from an unrestricted Diagnosis() condition (presumably every
# patient with at least one diagnosis — confirm against fiber's semantics).
All_MRNs_WDiagnosis = Cohort(Diagnosis()).mrns()

In [None]:
# Number of MRNs with a diagnosis that are not in the case cohort.
len(All_MRNs_WDiagnosis - hype_mrns)

In [None]:
# Control pool: MRNs with a diagnosis minus the case cohort.
control_mrns = All_MRNs_WDiagnosis - hype_mrns

In [None]:
# Draw a reproducible random sample of control patients.
# random.sample() requires a sequence: passing a set was deprecated in Python 3.9 and
# raises TypeError from 3.11. Sorting the pool gives a stable order, so a fixed seed always
# selects the same patients (assumes the MRNs are mutually comparable — confirm).
CONTROL_SAMPLE_SIZE = 1000000
CONTROL_SAMPLE_SEED = 42
control_mrn_pool = sorted(control_mrns)
sample_control_mrns = random.Random(CONTROL_SAMPLE_SEED).sample(
    control_mrn_pool,
    # Never request more patients than the pool holds (random.sample would raise).
    min(CONTROL_SAMPLE_SIZE, len(control_mrn_pool)),
)

# Encounter counts per sampled control patient. The summary cells below read this name, so
# it has to be assigned here for the notebook to run on a fresh kernel.
sample_control_mrns_occurances = get_occurances(sample_control_mrns)

len(sample_control_mrns)

In [None]:
# Summary statistics of the sampled controls' encounter counts, one line per statistic.
for stat_name, stat_fn in (
    ("mean", np.mean),
    ("median", np.median),
    ("min", np.min),
    ("max", np.max),
):
    print(f"the {stat_name} of the encounters for the control are: {stat_fn(sample_control_mrns_occurances)!s}")

In [None]:
# Keep only the controls with at least 24 encounters: values below the threshold are
# masked to NaN by where() and then dropped.
sample_control_occurances_thresholded = (
    sample_control_mrns_occurances
    .where(sample_control_mrns_occurances >= 24)
    .dropna()
)

In [None]:
# Same summary statistics as for the full control sample, computed on the thresholded counts.
for stat_name, stat_fn in (
    ("mean", np.mean),
    ("median", np.median),
    ("min", np.min),
    ("max", np.max),
):
    print(f"the {stat_name} of the encounters for the control are: {stat_fn(sample_control_occurances_thresholded)!s}")

In [None]:
#d = pd.DataFrame()

#for threshold in range(1, 100, 5): 
#    length_mrns = len(sample_control_mrns_occurances.where(lambda x : x>=threshold).dropna())
#    temp = pd.DataFrame({'Threshold': threshold, 'MRNs': length_mrns}, index = [0])
#    d = pd.concat([d, temp])

#plt.figure(figsize=(10,10))     
#ax = sns.lineplot(x="Threshold", y="MRNs", data=d)
#ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
#ax.xaxis.set_major_formatter(ticker.ScalarFormatter())

In [None]:
# Scratch check of range() stepping: prints 0, 10, ..., 100.
# NOTE(review): not used by the analysis; looks like leftover experimentation.
for limit in range(0, 105, 10):
    print(limit)

In [None]:
# Ad-hoc lookup of the MRNs for ICD-9 codes 696 and 696.0.
icd9_matteo = ['696','696.0']
# This condition object is built but discarded (not assigned, and not the last expression
# of the cell); the next line constructs an identical one.
Diagnosis(icd9_matteo, 'ICD-9')
whatever = Cohort(Diagnosis(icd9_matteo, 'ICD-9')).mrns()

In [None]:
# Display the MRNs returned for the ICD-9 696 / 696.0 lookup.
whatever