# Packages

In [229]:
import pandas as pd
import seaborn as sns
import matplotlib as ptl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# a) Load the data

In [191]:
def loaddata(x):
    dfraw = pd.read_csv(x)
    df = pd.DataFrame(dfraw)
    return df

dataset = 'C:/Users/arimi/Documents/BSE-term1/ComputingData/HW3/sample_diabetes_mellitus_data.csv'
df1 = loaddata(dataset)


# Describe the data

In [192]:
print(df1.shape)

(10000, 53)


In [193]:
print((df1.isna().sum()))


Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                             405
bmi                            1738
elective_surgery                  0
ethnicity                       242
gender                            3
height                          160
hospital_admit_source           918
icu_admit_source                 21
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                         1658
albumin_apache                 6461
apache_2_diagnosis              180
apache_3j_diagnosis              51
apache_post_operative             0
arf_apache                        0
bilirubin_apache               6592
bun_apache                     2521
creatinine_apache              2389
fio2_apache                    7087
gcs_eyes_apache                  69
gcs_motor_apache            

In [194]:
print(df1['age'].isna().sum())

405


# b) Split the data 

In [195]:
def splitdata(x):
    traindf, testdf = train_test_split(x, test_size=0.4, random_state=42)
    traindf = traindf.reset_index(drop=True)
    testdf = testdf.reset_index(drop=True)
    return traindf, testdf

traindf1, testdf1 = splitdata(df1)

In [196]:
print(traindf1.shape)


(6000, 53)


In [197]:
print(testdf1.shape)

(4000, 53)


In [198]:
print(traindf1.isna().sum())

Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                             253
bmi                            1039
elective_surgery                  0
ethnicity                       148
gender                            2
height                           95
hospital_admit_source           540
icu_admit_source                 12
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                          991
albumin_apache                 3873
apache_2_diagnosis              118
apache_3j_diagnosis              32
apache_post_operative             0
arf_apache                        0
bilirubin_apache               3952
bun_apache                     1511
creatinine_apache              1422
fio2_apache                    4242
gcs_eyes_apache                  38
gcs_motor_apache            

In [199]:
print(testdf1.isna().sum())

Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                             152
bmi                             699
elective_surgery                  0
ethnicity                        94
gender                            1
height                           65
hospital_admit_source           378
icu_admit_source                  9
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                          667
albumin_apache                 2588
apache_2_diagnosis               62
apache_3j_diagnosis              19
apache_post_operative             0
arf_apache                        0
bilirubin_apache               2640
bun_apache                     1010
creatinine_apache               967
fio2_apache                    2845
gcs_eyes_apache                  31
gcs_motor_apache            

# c) Remove those rows that contain NaN values in the columns: age, gender, ethnicity.

In [200]:
def removerows(z):
    columns_to_check = ['age', 'gender', 'ethnicity']
    df_pairwise_deleted = z.dropna(subset=columns_to_check)
    return df_pairwise_deleted

df_d = removerows(traindf1)


In [201]:
print(traindf1.shape)
print(df_d.shape)

(6000, 53)
(5608, 53)


In [202]:
print((traindf1['height'].isna().sum()))

95


In [203]:
print((df_d.isna().sum()))

Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                               0
bmi                             964
elective_surgery                  0
ethnicity                         0
gender                            0
height                           72
hospital_admit_source           504
icu_admit_source                 12
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                          927
albumin_apache                 3621
apache_2_diagnosis              116
apache_3j_diagnosis              31
apache_post_operative             0
arf_apache                        0
bilirubin_apache               3694
bun_apache                     1424
creatinine_apache              1340
fio2_apache                    3926
gcs_eyes_apache                  35
gcs_motor_apache            

# d) Fill NaN with the mean value of the column in the columns: height, weight.

In [204]:
def fillna(w):
    mean_height = w['height'].mean()
    mean_weight = w['weight'].mean()
    w['height'].fillna(mean_height, inplace=True)
    w['weight'].fillna(mean_weight, inplace=True)
    return w

imputed_df = fillna(df_d)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w['height'].fillna(mean_height, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w['weight'].fillna(mean_weight, inplace=True)


In [205]:
def fillnab(r):
    constant = 10002301023012
    r['height'].fillna(method='ffill', inplace=True)
    r['weight'].fillna(constant, inplace=True)
    return r

imputed_df1 = fillnab(traindf1)

In [206]:
print(df_d['height'].mean())

170.01553106936416


In [207]:
print(traindf1['height'].mean())
print(imputed_df['height'].mean())
print(imputed_df1['height'].mean())


169.79330000000002
170.01553106936416
169.79330000000002


# e) Generate dummies for ethnicity column (One hot encoding).

In [208]:
print(imputed_df['ethnicity'])

0              Caucasian
1              Caucasian
2       African American
3              Caucasian
4              Caucasian
              ...       
5995           Caucasian
5996    African American
5997           Caucasian
5998           Caucasian
5999           Caucasian
Name: ethnicity, Length: 5608, dtype: object


In [217]:
def encodeEthnicity(d):
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(d[['ethnicity']]).toarray()
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out (['ethnicity']))
    encoded_df1 = pd.concat([d, encoded_df], axis=1)
    return encoded_df1

    
encoded_df = encodeEthnicity(imputed_df)


In [218]:
print(encoded_df.shape)


(5976, 59)


# f) Create a binary variable for gender M/F.

In [220]:
def vargender(f):
    encoder = LabelEncoder()
    f['gender_encoded'] = encoder.fit_transform(f['gender'])
    return f

final_df = vargender(encoded_df)

In [226]:
print(final_df['gender_encoded'].head())

0    0
1    1
2    1
3    1
4    1
Name: gender_encoded, dtype: int32


In [227]:
print(final_df['gender_encoded'].describe())


count    5976.000000
mean        0.640730
std         0.594486
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         2.000000
Name: gender_encoded, dtype: float64
count     5608
unique       2
top          M
freq      3093
Name: gender, dtype: object


In [228]:
print(final_df['gender'].describe())

count     5608
unique       2
top          M
freq      3093
Name: gender, dtype: object


# Finally Train a model and get results