# Packages

In [275]:
import pandas as pd
import seaborn as sns
import matplotlib as ptl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# a) Load the data

In [191]:
def loaddata(x):
    dfraw = pd.read_csv(x)
    df = pd.DataFrame(dfraw)
    return df

dataset = 'C:/Users/arimi/Documents/BSE-term1/ComputingData/HW3/sample_diabetes_mellitus_data.csv'
df1 = loaddata(dataset)


# Describe the data

In [192]:
print(df1.shape)

(10000, 53)


In [193]:
print((df1.isna().sum()))


Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                             405
bmi                            1738
elective_surgery                  0
ethnicity                       242
gender                            3
height                          160
hospital_admit_source           918
icu_admit_source                 21
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                         1658
albumin_apache                 6461
apache_2_diagnosis              180
apache_3j_diagnosis              51
apache_post_operative             0
arf_apache                        0
bilirubin_apache               6592
bun_apache                     2521
creatinine_apache              2389
fio2_apache                    7087
gcs_eyes_apache                  69
gcs_motor_apache            

In [194]:
print(df1['age'].isna().sum())

405


# b) Split the data 

In [195]:
def splitdata(x):
    traindf, testdf = train_test_split(x, test_size=0.4, random_state=42)
    traindf = traindf.reset_index(drop=True)
    testdf = testdf.reset_index(drop=True)
    return traindf, testdf

traindf1, testdf1 = splitdata(df1)

In [196]:
print(traindf1.shape)


(6000, 53)


In [197]:
print(testdf1.shape)

(4000, 53)


In [198]:
print(traindf1.isna().sum())

Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                             253
bmi                            1039
elective_surgery                  0
ethnicity                       148
gender                            2
height                           95
hospital_admit_source           540
icu_admit_source                 12
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                          991
albumin_apache                 3873
apache_2_diagnosis              118
apache_3j_diagnosis              32
apache_post_operative             0
arf_apache                        0
bilirubin_apache               3952
bun_apache                     1511
creatinine_apache              1422
fio2_apache                    4242
gcs_eyes_apache                  38
gcs_motor_apache            

In [199]:
print(testdf1.isna().sum())

Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                             152
bmi                             699
elective_surgery                  0
ethnicity                        94
gender                            1
height                           65
hospital_admit_source           378
icu_admit_source                  9
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                          667
albumin_apache                 2588
apache_2_diagnosis               62
apache_3j_diagnosis              19
apache_post_operative             0
arf_apache                        0
bilirubin_apache               2640
bun_apache                     1010
creatinine_apache               967
fio2_apache                    2845
gcs_eyes_apache                  31
gcs_motor_apache            

# c) Remove those rows that contain NaN values in the columns: age, gender, ethnicity.

In [264]:
def removerows(z):
    columns_to_check = ['age', 'gender', 'ethnicity']
    df_pairwise_deleted = z.dropna(subset=columns_to_check)
    return df_pairwise_deleted

df_d = removerows(traindf1)
dft = removerows(testdf1)


In [201]:
print(traindf1.shape)
print(df_d.shape)

(6000, 53)
(5608, 53)


In [202]:
print((traindf1['height'].isna().sum()))

95


In [203]:
print((df_d.isna().sum()))

Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                               0
bmi                             964
elective_surgery                  0
ethnicity                         0
gender                            0
height                           72
hospital_admit_source           504
icu_admit_source                 12
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                          927
albumin_apache                 3621
apache_2_diagnosis              116
apache_3j_diagnosis              31
apache_post_operative             0
arf_apache                        0
bilirubin_apache               3694
bun_apache                     1424
creatinine_apache              1340
fio2_apache                    3926
gcs_eyes_apache                  35
gcs_motor_apache            

# d) Fill NaN with the mean value of the column in the columns: height, weight.

In [266]:
def fillna(w):
    mean_height = w['height'].mean()
    mean_weight = w['weight'].mean()
    w['height'].fillna(mean_height, inplace=True)
    w['weight'].fillna(mean_weight, inplace=True)
    return w

imputed_df = fillna(df_d)
dft1 = fillna(dft)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w['height'].fillna(mean_height, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w['weight'].fillna(mean_weight, inplace=True)


In [205]:
def fillnab(r):
    constant = 10002301023012
    r['height'].fillna(method='ffill', inplace=True)
    r['weight'].fillna(constant, inplace=True)
    return r

imputed_df1 = fillnab(traindf1)

In [206]:
print(df_d['height'].mean())

170.01553106936416


In [250]:
print(traindf1['height'].mean())
print(imputed_df['height'].mean())
print(imputed_df1['height'].mean())


169.79330000000002
170.01553106936416
169.79330000000002


In [238]:
print(imputed_df.shape)

(5608, 53)


In [251]:
print(imputed_df.isna().sum())


Unnamed: 0                        0
encounter_id                      0
hospital_id                       0
age                               0
bmi                             964
elective_surgery                  0
ethnicity                         0
gender                            0
height                            0
hospital_admit_source           504
icu_admit_source                 12
icu_id                            0
icu_stay_type                     0
icu_type                          0
pre_icu_los_days                  0
readmission_status                0
weight                            0
albumin_apache                 3621
apache_2_diagnosis              116
apache_3j_diagnosis              31
apache_post_operative             0
arf_apache                        0
bilirubin_apache               3694
bun_apache                     1424
creatinine_apache              1340
fio2_apache                    3926
gcs_eyes_apache                  35
gcs_motor_apache            

# e) Generate dummies for ethnicity column (One hot encoding).

In [208]:
print(imputed_df['ethnicity'])

0              Caucasian
1              Caucasian
2       African American
3              Caucasian
4              Caucasian
              ...       
5995           Caucasian
5996    African American
5997           Caucasian
5998           Caucasian
5999           Caucasian
Name: ethnicity, Length: 5608, dtype: object


In [246]:
def encodeEthnicity(d):
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(d[['ethnicity']]).toarray()
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out (['ethnicity']))
    encoded_df1 = pd.concat([d, encoded_df], axis=1)
    return encoded_df1

    
encoded_df1 = encodeEthnicity(imputed_df)


In [247]:
print(encoded_df1.shape)


(5976, 59)


In [248]:
print(encoded_df1.isna().sum())

Unnamed: 0                      368
encounter_id                    368
hospital_id                     368
age                             368
bmi                            1332
elective_surgery                368
ethnicity                       368
gender                          368
height                          368
hospital_admit_source           872
icu_admit_source                380
icu_id                          368
icu_stay_type                   368
icu_type                        368
pre_icu_los_days                368
readmission_status              368
weight                          368
albumin_apache                 3989
apache_2_diagnosis              484
apache_3j_diagnosis             399
apache_post_operative           368
arf_apache                      368
bilirubin_apache               4062
bun_apache                     1792
creatinine_apache              1708
fio2_apache                    4294
gcs_eyes_apache                 403
gcs_motor_apache            

# f) Create a binary variable for gender M/F.

In [220]:
def vargender(f):
    encoder = LabelEncoder()
    f['gender_encoded'] = encoder.fit_transform(f['gender'])
    return f

final_df = vargender(encoded_df)

In [226]:
print(final_df['gender_encoded'].head())

0    0
1    1
2    1
3    1
4    1
Name: gender_encoded, dtype: int32


In [227]:
print(final_df['gender_encoded'].describe())


count    5976.000000
mean        0.640730
std         0.594486
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         2.000000
Name: gender_encoded, dtype: float64
count     5608
unique       2
top          M
freq      3093
Name: gender, dtype: object


In [228]:
print(final_df['gender'].describe())

count     5608
unique       2
top          M
freq      3093
Name: gender, dtype: object


# Finally Train a model and get results

In [267]:
features1 = ['age', 'diabetes_mellitus', 'height', 'weight', 'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']
print(imputed_df[features1].isna().sum())
print(dft1[features1].isna().sum())



age                            0
diabetes_mellitus              0
height                         0
weight                         0
aids                           0
cirrhosis                      0
hepatic_failure                0
immunosuppression              0
leukemia                       0
lymphoma                       0
solid_tumor_with_metastasis    0
dtype: int64
age                            0
diabetes_mellitus              0
height                         0
weight                         0
aids                           0
cirrhosis                      0
hepatic_failure                0
immunosuppression              0
leukemia                       0
lymphoma                       0
solid_tumor_with_metastasis    0
dtype: int64


In [276]:
def model(c, d):
    features = ['age', 'height', 'weight', 'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']
    target = 'diabetes_mellitus'
    X = c[features]
    Y = c[target]
    Xt = d[features]
    Yt = d[target]
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, Y)
    train_predictions = model.predict_proba(X)[:, 1]
    test_predictions = model.predict_proba(Xt)[:, 1]
    y_train_pred = model.predict(X)
    y_test_pred = model.predict(Xt)
    c['train_predictions'] = train_predictions
    d['test_predictions'] = test_predictions
    train_accuracy = accuracy_score(Y, y_train_pred)
    test_accuracy = accuracy_score(Yt, y_test_pred)
    print("Train Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    train_roc_auc = roc_auc_score(Y, train_predictions)
    test_roc_auc = roc_auc_score(Yt, test_predictions)
    print("Train ROC AUC:", train_roc_auc)
    print("Test ROC AUC:", test_roc_auc)
    return c, d
   

modeloTra, modeloTes = model(imputed_df, dft1)



Train Accuracy: 0.9825249643366619
Test Accuracy: 0.736436170212766
Train ROC AUC: 0.9982659252128632
Test ROC AUC: 0.6136113120253893


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c['train_predictions'] = train_predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['test_predictions'] = test_predictions


In [274]:
modeloTra.describe()


Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,...,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,train_predictions
count,5608.0,5608.0,5608.0,5608.0,4644.0,5608.0,5608.0,5608.0,5608.0,5608.0,...,4357.0,5608.0,5608.0,5608.0,5608.0,5608.0,5608.0,5608.0,5608.0,5608.0
mean,5009.226106,212770.081491,103.480563,62.096113,30.086589,0.229672,170.006221,105.151034,0.664731,0.0,...,11.525763,0.000535,0.019437,0.0148,0.045827,0.006598,0.005171,0.026213,0.233595,0.235649
std,2861.906749,38093.048325,30.693209,16.657339,8.538092,0.420659,10.596279,16.960298,2.193111,0.0,...,6.659038,0.023125,0.138066,0.120764,0.209129,0.080965,0.071731,0.159781,0.423155,0.28784
min,1.0,147009.0,4.0,16.0,14.844926,0.0,137.2,82.0,-0.224306,0.0,...,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2503.75,179559.5,83.0,52.0,24.20811,0.0,162.6,92.0,0.002778,0.0,...,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03
50%,5074.5,212536.0,118.0,64.0,28.548361,0.0,170.2,99.0,0.011111,0.0,...,9.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09
75%,7429.5,245997.75,118.0,75.0,34.107878,0.0,177.8,114.0,0.175868,0.0,...,14.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32625
max,9999.0,278974.0,198.0,89.0,67.81499,1.0,195.59,170.0,38.052083,0.0,...,45.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99
