In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#1.0 Examine Data types for each variable

#read the d1 dataset
df = pd.read_csv('Data.csv')

#show all columns information
print(df.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51766 entries, 0 to 51765
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id              51766 non-null  int64 
 1   patient_nbr               51766 non-null  int64 
 2   race                      51766 non-null  object
 3   gender                    51766 non-null  object
 4   age                       51766 non-null  object
 5   weight                    51766 non-null  object
 6   admission_type_id         51766 non-null  int64 
 7   discharge_disposition_id  51766 non-null  int64 
 8   admission_source_id       51766 non-null  int64 
 9   time_in_hospital          51766 non-null  int64 
 10  payer_code                51766 non-null  object
 11  medical_specialty         51766 non-null  object
 12  num_lab_procedures        51766 non-null  int64 
 13  num_procedures            51766 non-null  int64 
 14  num_medications       

In [4]:
#1.0 Correct the data-types
def d1_prep():
    #read the d1 dataset
    d1 = pd.read_csv('d1.csv')
    
    #Variable data type casting
    #Change admission_type_id from interval/int to nominal/str
    d1['admission_type_id'] = d1['admission_type_id'].astype(str)
    
    #Change discharge_disposition_id from interval/int to nominal/str
    d1['discharge_disposition_id'] = d1['discharge_disposition_id'].astype(str)
    
    #Change admission_source_id from interval/int to nominal/str
    d1['admission_source_id'] = d1['admission_source_id'].astype(str)
    
    #Change 'change' to binary 0/1 variable
    change_map = {'change':0, 'no change': 1}
    d1['change'] = d1['change'].map(change_map)

    #Change diabetesMed to binary 0/1 variable
    diabetesMed_map = {'yes':0, 'no': 1}
    d1['diabetesMed'] = d1['diabetesMed'].map(diabetesMed_map)
    
    return d1

#from d1_tools import d1_prep


In [5]:
#Return dataset with updated variable types.
d1 = d1_prep()

print(df.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51766 entries, 0 to 51765
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   encounter_id              51766 non-null  int64 
 1   patient_nbr               51766 non-null  int64 
 2   race                      51766 non-null  object
 3   gender                    51766 non-null  object
 4   age                       51766 non-null  object
 5   weight                    51766 non-null  object
 6   admission_type_id         51766 non-null  int64 
 7   discharge_disposition_id  51766 non-null  int64 
 8   admission_source_id       51766 non-null  int64 
 9   time_in_hospital          51766 non-null  int64 
 10  payer_code                51766 non-null  object
 11  medical_specialty         51766 non-null  object
 12  num_lab_procedures        51766 non-null  int64 
 13  num_procedures            51766 non-null  int64 
 14  num_medications       

In [6]:
#2.1 Identify skewness in variables

#Select for numeric data
d2 = d1.select_dtypes(include=[np.number])

print(d1.skew(axis = 0, skipna = True))
print(d2.skew(axis = 0, skipna = True)) 

encounter_id                 0.828531
patient_nbr                  0.609771
admission_type_id            1.855348
discharge_disposition_id     3.237941
admission_source_id          0.155033
time_in_hospital             1.197872
num_lab_procedures          -0.342571
num_procedures               1.385591
num_medications              1.189992
number_outpatient            7.794017
number_emergency            21.279724
number_inpatient             3.494846
number_diagnoses            -1.303386
change                            NaN
diabetesMed                       NaN
dtype: float64
encounter_id           0.828531
patient_nbr            0.609771
time_in_hospital       1.197872
num_lab_procedures    -0.342571
num_procedures         1.385591
num_medications        1.189992
number_outpatient      7.794017
number_emergency      21.279724
number_inpatient       3.494846
number_diagnoses      -1.303386
change                      NaN
diabetesMed                 NaN
dtype: float64


In [7]:
#2.2 Inconsistencies in the data
print("Unique values: \n",d1['race'].unique(), "\nCount: \n", d1['race'].value_counts())
print("\n")
print("Unique values: \n",d1['gender'].unique(), "\nCount: \n", d1['gender'].value_counts())
print("\n")
print("Unique values: \n",d1['age'].unique(), "\nCount: \n", d1['age'].value_counts())
print("\n")
print("Unique values: \n",d1['weight'].unique(), "\nCount: \n", d1['weight'].value_counts())
print("\n")
print("Unique values: \n",d1['payer_code'].unique(), "\nCount: \n", d1['payer_code'].value_counts())
print("\n")
print("Unique values: \n",d1['medical_specialty'].unique(), "\nCount: \n", d1['medical_specialty'].value_counts())
print("\n")
print("Unique values: \n",d1['chlorpropamide'].unique(), "\nCount: \n", d1['chlorpropamide'].value_counts())
print("\n")


Unique values: 
 ['Caucasian' 'AfricanAmerican' 'Hispanic' '?' 'Other' 'Asian'] 
Count: 
 Caucasian          40388
AfricanAmerican     8070
Hispanic            1018
?                   1016
Other                896
Asian                378
Name: race, dtype: int64


Unique values: 
 ['Female' 'Male' 'Unknown/Invalid'] 
Count: 
 Female             27718
Male               24044
Unknown/Invalid        4
Name: gender, dtype: int64


Unique values: 
 ['[60-70)' '[80-90)' '[70-80)' '[40-50)' '[50-60)' '[90-100)' '[30-40)'
 '[20-30)' '[10-20)' '[0-10)' '?'] 
Count: 
 [70-80)     12972
[60-70)     11618
[80-90)      9671
[50-60)      8482
[40-50)      4619
[30-40)      1717
[90-100)     1616
[20-30)       815
[10-20)       223
[0-10)         23
?              10
Name: age, dtype: int64


Unique values: 
 ['?' '[100-125)' '[50-75)' '[75-100)' '[0-25)' '[125-150)' '[25-50)'
 '[150-175)' '[175-200)' '>200'] 
Count: 
 ?            50431
[75-100)       542
[50-75)        347
[100-125)      309
[12

In [9]:
#question 4.1
d2.mean()


encounter_id          2.445955e+08
patient_nbr           7.181953e+07
time_in_hospital      4.237337e+00
num_lab_procedures    4.385320e+01
num_procedures        1.308214e+00
num_medications       1.673585e+01
number_outpatient     5.128463e-01
number_emergency      2.691728e-01
number_inpatient      6.868794e-01
number_diagnoses      7.928486e+00
change                         NaN
diabetesMed                    NaN
dtype: float64

In [10]:
d2.median()


encounter_id          228334512.0
patient_nbr            70375977.0
time_in_hospital              3.0
num_lab_procedures           45.0
num_procedures                1.0
num_medications              16.0
number_outpatient             0.0
number_emergency              0.0
number_inpatient              0.0
number_diagnoses              9.0
change                        NaN
diabetesMed                   NaN
dtype: float64

In [12]:
d2.mode()[0:1]



Unnamed: 0,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,change,diabetesMed
0,150645834,88785891.0,3.0,1.0,0.0,15.0,0.0,0.0,0.0,9.0,,
