# Predicting hypertension occurrence, time frame and/or its complications


In [1]:
import pandas as pd
import numpy as np
import time
import re
import os
from operator import add

In [2]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 5000
pd.options.display.width = 50000

### 1- Parse the raw data file of patient records, each of which includes all of the patient's visits

input: **patient records**: <br>
>each record has<br>
patient ID| Race, Female| list of all visits separated by #<br>
>>**each visit record  has**: DaysToEvent, age, list of icd9 codes for that visit, year<br>

output: **visit records**: <br>
>each record has<br>
patient ID, Race, Female, DaysToEvent, Age,year, list of icd9 codes for that visit



In [3]:
# Input data
# ICD-9 hypertension codes (401-405) all start with "40".
icd_code = '40'
infile = "HCUP_2003_to_2011_Patient_to_Visits_withoutLine40.csv"
outfile = f"patient_visits_Dxs_{icd_code}.csv"

# Express both data-file locations relative to the current working directory.
start = os.getcwd()
infile_path, outfile_path = (
    os.path.relpath(f'..//hcup_data//{file_name}', start)
    for file_name in (infile, outfile)
)


In [4]:
def PatientHasDx(infile, outfile, icd_code, bunchsize = 1000):
    """Write one row per visit for every patient carrying a given diagnosis.

    Each input line is parsed as ``patientID|code1,code2|visit#visit#...``,
    where every visit is a comma separated list whose first two fields are
    copied to the output, whose last field is the year, and whose middle
    fields are the visit's codes.

    A patient is kept when any code of any visit starts with ``D_`` and the
    characters right after that prefix equal ``icd_code``.  For a kept
    patient, every visit holding at least one ``D_`` code whose text after
    the first code character is numeric is written as
    ``patientID,code1,code2,field0,field1,year,<codes joined by spaces>``.

    Args:
        infile: path of the patient-level input file.
        outfile: path of the visit-level file to create (overwritten).
        icd_code: leading characters of the diagnosis code to look for.
        bunchsize: number of input lines read between flushes of the
            output buffer.

    Returns:
        None. A two-line summary (patients kept, visit rows written) is
        printed.
    """
    pending = []           # output rows not yet written to disk
    lines_seen = 0         # input lines read since the last flush
    matched_patients = 0
    visit_rows = 0
    started = time.perf_counter()

    with open(infile, "r") as src, open(outfile, "w") as dst:
        for record in src:
            # One record = one patient; visits are separated by '#'.
            lines_seen += 1
            fields = record.split("|")
            code1, code2 = fields[1].split(",")
            visits = [chunk.split(",") for chunk in fields[2].split("#")]

            # Does any code of any visit match the requested diagnosis?
            prefix_len = len(icd_code)
            carries_dx = any(
                token[:2] == 'D_' and icd_code in token[2:2 + prefix_len]
                for parts in visits
                for token in parts[2:-1]
            )
            if not carries_dx:
                continue

            matched_patients += 1

            # Turn every visit that has a usable diagnosis code into a row.
            for parts in visits:
                visit_codes = parts[2:-1]
                has_diagnosis = any(
                    code[:2] == 'D_' and code[3:].isnumeric()
                    for code in visit_codes
                )
                if not has_diagnosis:
                    continue
                # Only the last visit of a line carries the trailing newline.
                visit_year = parts[-1].replace("\n", "")
                row = (fields[0], code1, code2, parts[0], parts[1],
                       visit_year, ' '.join(visit_codes))
                pending.append(','.join(row) + '\n')
                visit_rows += 1

            if lines_seen >= bunchsize:
                dst.writelines(pending)
                pending = []
                lines_seen = 0
        dst.writelines(pending)

    elapsed = time.perf_counter() - started
    print(f"Processed {matched_patients} patients who has ICD code starts with {icd_code} in  {elapsed:0.4f} seconds")
    print("number of total visit records is: ", str(visit_rows))


In [5]:
# Extract the visit-level rows of every patient carrying the target diagnosis.
PatientHasDx(
    infile=infile_path,
    outfile=outfile_path,
    icd_code=icd_code,
    bunchsize=1000,
)


Processed 4473729 patients who has ICD code starts with 40 in  347.7967 seconds
number of total visit records is:  15583880


### 2- One Hot Encode the list of icd9 codes into 120 categories

input: **visit records**: <br>
>each record has<br>
patient ID, Race, Female, DaysToEvent, Age,year, list of icd9 codes for that visit

output: **visit records**: <br>
>each record has<br>
patient ID, Race, Female, DaysToEvent, Age,year, one hot encoding for the icd9 codes 


##### 120 categories are: 
**0**: codes start with V :Supplementary Classification Of Factors Influencing Health Status and Contact With Health Services<br>
**1-118**: second level of diagnoses<br>
**119**: Unknown codes (not listed in the second level diagnosis codes)<br>


In [34]:
# Lookup table of ICD-9 category ranges; coding_120 reads its `min` and `max`
# columns to map a 3-digit code onto a category row.
icd = pd.read_csv("icd_list.csv", header = 0)

In [35]:
# Display the loaded category lookup table as the cell output.
icd

Unnamed: 0,min,max,Label
0,1,9,Intestinal Infectious Diseases
1,10,18,Tuberculosis
2,20,27,Zoonotic Bacterial Diseases
3,30,41,Other Bacterial Diseases
4,42,42,Human Immunodeficiency Virus
5,45,49,Poliomyelitis And Other Non-Arthropod-Borne Vi...
6,50,59,Viral Diseases Accompanied By Exanthem
7,60,66,Arthropod-Borne Viral Diseases
8,70,79,Other Diseases Due To Viruses And Chlamydiae
9,80,88,Rickettsioses And Other Arthropod-Borne Diseases


In [36]:
def coding_120(i):
    """Map an ICD-9 diagnosis code string to a category index.

    Args:
        i: diagnosis code without the ``D_`` prefix, e.g. ``'4019'`` or
            ``'V1234'``.

    Returns:
        ``0`` for codes starting with ``V``; otherwise one plus the position
        of the first row of the module-level ``icd`` table whose
        ``[min, max]`` range contains the code's first three digits; ``119``
        when no row matches or when those leading characters are not a
        number.
    """
    if i[:1] == 'V':
        return 0

    try:
        prefix = int(i[:3])
    except ValueError:
        # The upstream filter only checks the characters after the first
        # one, so codes such as 'E8490' can get here; treat them as unknown
        # instead of aborting the whole run.
        return 119

    matches = np.where((prefix >= icd['min']) & (prefix <= icd['max']))[0]
    if len(matches) == 0:
        return 119
    # Row positions are 0-based, category 0 is reserved for V codes.
    return matches[0] + 1

def coding_1K(i):
    """Return 0 for a code starting with 'V', else its first three characters as an int."""
    return 0 if i[:1] == 'V' else int(i[:3])
    
def one_hot(l,width):
    """Encode a list of category indices as a fixed-width count vector.

    Args:
        l: iterable of integer indices, each valid for a list of length
            ``width``.
        width: length of the returned vector.

    Returns:
        A list of ``width`` ints where position ``k`` holds how many times
        ``k`` occurs in ``l``. Repeated indices therefore give values
        above 1, so this is a count vector rather than a strict 0/1 mask.

    Raises:
        IndexError: if an index is out of range for ``width``.
    """
    code = [0] * width
    for i in l:
        # Increment in place instead of building and summing a full
        # temporary vector for every index.
        code[i] += 1
    return code

In [61]:
def encoding_visit_dxs(infile, outfile, bunchsize = 1000):
    """Replace each visit's code list by its 120-category encoding.

    Every input line is split on commas; the first six fields are copied to
    the output unchanged and the seventh is read as a space separated list
    of codes.  Codes starting with ``D_`` whose text after the first code
    character is numeric are mapped through ``coding_120`` and folded into a
    120-wide vector by ``one_hot``.

    Args:
        infile: path of the visit-level file to read.
        outfile: path of the encoded file to create (overwritten).
        bunchsize: number of rows buffered before each write.

    Returns:
        None. The elapsed time is printed.
    """
    started = time.perf_counter()
    pending = []  # encoded rows waiting to be written

    with open(infile, "r") as src, open(outfile, "w") as dst:
        for record in src:
            fields = record.split(",")
            ID, Race, Female, DaysToEvent, Age, year = fields[0:6]
            tokens = fields[6].replace("\n", "").split(" ")

            # Keep diagnosis codes only, with the 'D_' prefix stripped.
            diagnoses = []
            for token in tokens:
                if token[:2] == 'D_' and token[3:].isnumeric():
                    diagnoses.append(token[2:])

            categories = [coding_120(dx) for dx in diagnoses]
            vector = one_hot(categories, 120)

            # str(list)[1:-1] drops the brackets, leaving ", "-separated values.
            row = (ID, Race, Female, DaysToEvent, Age, year, str(vector)[1:-1])
            pending.append(','.join(row) + '\n')

            if len(pending) >= bunchsize:
                dst.writelines(pending)
                pending = []
        dst.writelines(pending)

    elapsed = time.perf_counter() - started
    print(f"Encoding all visits in  {elapsed:0.4f} seconds")

In [62]:
# The visit-level file produced in step 1 is the input of the encoding step.
infile = f"patient_visits_Dxs_{icd_code}.csv"
outfile = f"patient_visits_Dxs_{icd_code}encoded_120.csv"

# Express both data-file locations relative to the current working directory.
start = os.getcwd()
infile_path, outfile_path = (
    os.path.relpath(f'..//hcup_data//{file_name}', start)
    for file_name in (infile, outfile)
)



In [None]:
# Encode every visit row into the 120-category representation.
encoding_visit_dxs(
    infile=infile_path,
    outfile=outfile_path,
    bunchsize=10000,
)