In [1]:
#Data Manipulation
import pandas as pd
import numpy as np

#Data Visualization
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df_norm = pd.read_csv("healthcare-dataset-stroke-data.csv", index_col="id")
df_synt = pd.read_csv("synthetic_train.csv", index_col="id")

pd.set_option('display.max_columns', None)

print(f"Normal dataset unique columns: {list(df_norm.columns)}")
print(f"Normal dataset number of rows and columns: {df_norm.shape}")
print("----------------------------------------------------------------------------")
print(f"Synthetic dataset unique columns: {list(df_synt.columns)}")
print(f"Synthetic dataset number of rows and columns: {df_synt.shape}")

Normal dataset unique columns: ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']
Normal dataset number of rows and columns: (5110, 11)
----------------------------------------------------------------------------
Synthetic dataset unique columns: ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']
Synthetic dataset number of rows and columns: (15304, 11)


In [3]:
df_total = pd.concat([df_norm, df_synt])

In [4]:
print(df_total.shape)
df_total

(20414, 11)


Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
15299,Female,22.0,0,0,No,Govt_job,Urban,72.63,19.5,never smoked,0
15300,Female,46.0,1,0,Yes,Private,Urban,101.19,32.1,never smoked,0
15301,Female,75.0,0,0,Yes,Self-employed,Urban,87.69,26.2,never smoked,0
15302,Male,46.0,0,0,Yes,Private,Rural,101.13,22.5,Unknown,0


In [5]:
df_total.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [6]:
df_total = df_total.dropna()

In [7]:
df = df_total.copy()
df

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
15299,Female,22.0,0,0,No,Govt_job,Urban,72.63,19.5,never smoked,0
15300,Female,46.0,1,0,Yes,Private,Urban,101.19,32.1,never smoked,0
15301,Female,75.0,0,0,Yes,Self-employed,Urban,87.69,26.2,never smoked,0
15302,Male,46.0,0,0,Yes,Private,Rural,101.13,22.5,Unknown,0


In [8]:
#Label Encoding

df.loc[df.gender == "Male", "gender"] = 0
df.loc[df.gender != 0, "gender"] = 1
df["gender"] = df["gender"].astype(int)

df.loc[df.ever_married == "Yes", "ever_married"] = 0
df.loc[df.ever_married != 0, "ever_married"] = 1
df["ever_married"] = df["ever_married"].astype(int)

df.loc[df.Residence_type == "Urban", "Residence_type"] = 0
df.loc[df.Residence_type != 0, "Residence_type"] = 1
df["Residence_type"] = df["Residence_type"].astype(int)

# Standartization

df["avg_glucose_level"] = (df.avg_glucose_level - df.avg_glucose_level.mean()) / df.avg_glucose_level.std()
df["age"] = (df.age - df.age.mean()) / df.age.std()
df["bmi"] = (df.bmi - df.bmi.mean()) / df.bmi.std()

In [9]:
df = pd.get_dummies(data=df, columns=["work_type", "smoking_status"])

In [10]:
print(df.shape)
df

(20213, 18)


Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9046,0,1.161213,0,1,0,0,4.250449,1.181714,1,0,0,1,0,0,0,1,0,0
31112,0,1.759523,0,1,0,1,0.404996,0.597816,1,0,0,1,0,0,0,0,1,0
60182,1,0.332785,0,0,0,0,2.450663,0.868403,1,0,0,1,0,0,0,0,0,1
1665,1,1.713499,1,0,0,1,2.541185,-0.612706,1,0,0,0,1,0,0,0,1,0
56669,0,1.805547,0,0,0,0,2.919873,0.099365,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15299,1,-0.909858,0,0,1,0,-0.637728,-1.253571,0,1,0,0,0,0,0,0,1,0
15300,1,0.194713,1,0,0,0,0.256841,0.540850,0,0,0,1,0,0,0,0,1,0
15301,1,1.529404,0,0,0,0,-0.166012,-0.299395,0,0,0,0,1,0,0,0,1,0
15302,0,0.194713,0,0,0,1,0.254961,-0.826328,0,0,0,1,0,0,1,0,0,0


In [11]:
features = list(df.columns)
features.remove("stroke")
print(features)

['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type', 'avg_glucose_level', 'bmi', 'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'smoking_status_Unknown', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes']


In [12]:
X = df.copy()
y = X.pop("stroke")

#print(X.head(1))
print(X.dtypes)

gender                              int32
age                               float64
hypertension                        int64
heart_disease                       int64
ever_married                        int32
Residence_type                      int32
avg_glucose_level                 float64
bmi                               float64
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object


In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

"""
for n in range(50, 1050, 50):
    XGBmodel = XGBRegressor(n_estimators=n, learning_rate=0.01, random_state=1)
    XGBmodel.fit(X, y)
    scores = -1 * cross_val_score(XGBmodel, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')
    
    print("Used: ", n, " estimators")
    print("Avarage MAE score:", sum(scores) / len(scores))
"""

XGBmodel = XGBRegressor(n_estimators=800, learning_rate=0.01, random_state=1)
XGBmodel.fit(X, y)

In [14]:
df_test = pd.read_csv("test.csv", index_col="id")
df_test

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15304,Female,57.0,0,0,Yes,Private,Rural,82.54,33.4,Unknown
15305,Male,70.0,1,0,Yes,Private,Urban,72.06,28.5,Unknown
15306,Female,5.0,0,0,No,children,Urban,103.72,19.5,Unknown
15307,Female,56.0,0,0,Yes,Govt_job,Urban,69.24,41.4,smokes
15308,Male,32.0,0,0,Yes,Private,Rural,111.15,30.1,smokes
...,...,...,...,...,...,...,...,...,...,...
25503,Female,27.0,0,0,No,Private,Urban,75.77,17.6,never smoked
25504,Male,49.0,0,0,Yes,Private,Urban,102.91,26.7,Unknown
25505,Female,3.0,0,0,No,children,Rural,104.04,18.3,Unknown
25506,Male,31.0,0,0,Yes,Private,Urban,82.41,28.7,never smoked


In [15]:
#Label Encoding

df_test.loc[df_test.gender == "Male", "gender"] = 0
df_test.loc[df_test.gender != 0, "gender"] = 1
df_test["gender"] = df_test["gender"].astype(int)

df_test.loc[df_test.ever_married == "Yes", "ever_married"] = 0
df_test.loc[df_test.ever_married != 0, "ever_married"] = 1
df_test["ever_married"] = df_test["ever_married"].astype(int)

df_test.loc[df_test.Residence_type == "Urban", "Residence_type"] = 0
df_test.loc[df_test.Residence_type != 0, "Residence_type"] = 1
df_test["Residence_type"] = df_test["Residence_type"].astype(int)

# Standartization

df_test["avg_glucose_level"] = (df_test.avg_glucose_level - df_test.avg_glucose_level.mean()) / df_test.avg_glucose_level.std()
df_test["age"] = (df_test.age - df_test.age.mean()) / df_test.age.std()
df_test["bmi"] = (df_test.bmi - df_test.bmi.mean()) / df_test.bmi.std()

In [16]:
df_test = pd.get_dummies(data=df_test, columns=["work_type", "smoking_status"])

In [17]:
df_test

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
15304,1,0.714727,0,0,0,1,-0.256914,0.765927,0,0,1,0,0,1,0,0,0
15305,0,1.330901,1,0,0,0,-0.661323,0.038262,0,0,1,0,0,1,0,0,0
15306,1,-1.749966,0,0,1,0,0.560393,-1.298267,0,0,0,0,1,1,0,0,0
15307,1,0.667329,0,0,0,0,-0.770143,1.953953,1,0,0,0,0,0,0,0,1
15308,0,-0.470222,0,0,0,1,0.847107,0.275867,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25503,1,-0.707211,0,0,1,0,-0.518159,-1.580423,0,0,1,0,0,0,0,1,0
25504,0,0.335544,0,0,0,0,0.529136,-0.229044,0,0,1,0,0,1,0,0,0
25505,1,-1.844762,0,0,1,1,0.572742,-1.476471,0,0,0,0,1,1,0,0,0
25506,0,-0.517619,0,0,0,0,-0.261931,0.067962,0,0,1,0,0,0,0,1,0


In [22]:
predictions = XGBmodel.predict(df_test)
for pred in predictions:
    print(pred)
    
print("----------------------------------------------")
normalized_predictions = (predictions-min(predictions))/(max(predictions)-min(predictions))    
for norm_pred in normalized_predictions:
    print(norm_pred)
    
outputs = normalized_predictions

0.040952265
0.14419848
0.000995423
0.067969635
0.0073934617
0.009058136
0.0019058187
0.06680179
0.00054294954
0.016813658
0.017500805
0.1585933
0.002089532
0.0018066949
0.01912923
0.0011975607
0.0016039651
0.0008789531
0.032205928
0.073286526
0.029249296
0.0021115234
0.00054294954
0.004790959
0.030798279
0.0152879
0.001629619
0.04722293
-0.00020054895
0.008572344
0.17646107
-0.00015860237
0.012782811
0.12477351
0.055985447
0.003501701
0.16735597
0.033434868
0.004999078
0.0016355995
0.0029069772
-0.00019743996
-0.00013112208
0.062564924
0.060009334
-0.00020054895
0.035969097
-0.0015754041
0.16998674
0.0038981626
9.046531e-05
-0.00020054895
-0.0006741741
0.0070763724
0.1183514
-0.0001818676
0.03315034
0.0041429927
0.022959614
0.000995423
0.007881152
0.023775645
0.00454488
0.029248456
0.15083966
-0.0006741741
0.046552
-0.0017334232
0.0015989388
0.011321271
0.011856357
0.002084506
0.0035020357
0.0672594
-0.00020054895
0.47743377
0.024837771
0.00072472595
-0.041269097
-0.0007314578
0.009691

In [30]:
sub_file = open("subXGBoost.csv", "w")

ID = 15304
run = 0

sub_file.write("id,stroke\n")
for output in outputs:
    sub_file.write(f"{ID},{output}\n")
    ID += 1
    run +=1
    
print(run)

10204
