## Import package

In [1]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import mean, std
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE

## Read dataset

In [2]:
raw_df = pd.read_csv('./Dataset/train.csv')
raw_df_test = pd.read_csv('./Dataset/public_test.csv')
raw_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,height,stroke
0,77,Female,,0.0,0.0,No,children,Unknown,85.81,18.6,Unknown,O,,0
1,84,Male,55.0,0.0,0.0,Yes,Private,Urban,89.17,,never smoked,B,1.59,0
2,91,Female,42.0,0.0,0.0,No,Private,Unknown,98.53,18.5,never smoked,B,,0
3,99,Female,31.0,0.0,0.0,No,Private,Urban,108.89,52.3,Unknown,O,1.24,0
4,121,Female,38.0,0.0,0.0,Yes,Private,Unknown,91.44,,Unknown,O,,0


Drop `id` feature because it's useless

In [3]:
df = raw_df.copy()
df_test = raw_df_test.copy()
df.drop(columns=['id', 'height'], inplace=True)
df_test.drop(columns=['height'], inplace=True)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke
0,Female,,0.0,0.0,No,children,Unknown,85.81,18.6,Unknown,O,0
1,Male,55.0,0.0,0.0,Yes,Private,Urban,89.17,,never smoked,B,0
2,Female,42.0,0.0,0.0,No,Private,Unknown,98.53,18.5,never smoked,B,0
3,Female,31.0,0.0,0.0,No,Private,Urban,108.89,52.3,Unknown,O,0
4,Female,38.0,0.0,0.0,Yes,Private,Unknown,91.44,,Unknown,O,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,16.0,0.0,0.0,No,Private,Unknown,99.49,22.0,Unknown,B,0
3996,Male,47.0,0.0,0.0,Yes,Private,Rural,75.30,25.0,formerly smoked,B,0
3997,Other,57.0,1.0,0.0,Yes,Private,Rural,129.54,60.9,smokes,AB,0
3998,Female,45.0,0.0,0.0,Yes,Private,Unknown,172.33,45.3,formerly smoked,O,0


In [4]:
def draw_missing_data_table(data):
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data

In [5]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
bmi,1116,0.279
hypertension,230,0.0575
avg_glucose_level,171,0.04275
age,107,0.02675
heart_disease,90,0.0225
blood,10,0.0025
gender,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0


In [6]:
draw_missing_data_table(df_test)

Unnamed: 0,Total,Percent
heart_disease,134,0.268
hypertension,45,0.09
bmi,43,0.086
age,11,0.022
avg_glucose_level,6,0.012
id,0,0.0
gender,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0


In [7]:
df['Residence_type'].unique()

array(['Unknown', 'Urban', 'Rural'], dtype=object)

In [8]:
Residence_type_mapping = {'Unknown': float("nan"), 'Urban': 0, 'Rural': 1}
df['Residence_type'] = df['Residence_type'].map(Residence_type_mapping)
df['Residence_type'].unique()

array([nan,  0.,  1.])

In [9]:
df_test['Residence_type'].unique()

array(['Urban', 'Rural', 'Unknown'], dtype=object)

In [10]:
Residence_type_mapping = {'Unknown': float("nan"), 'Urban': 0, 'Rural': 1}
df_test['Residence_type'] = df_test['Residence_type'].map(Residence_type_mapping)
df_test['Residence_type'].unique()

array([ 0.,  1., nan])

In [11]:
df['smoking_status'].unique()

array(['Unknown', 'never smoked', 'smokes', 'formerly smoked'],
      dtype=object)

In [12]:
smoking_status_mapping = {'Unknown': float("nan"), 'never smoked': 0, 'smokes': 2, 'formerly smoked': 1}
df['smoking_status'] = df['smoking_status'].map(smoking_status_mapping)
df['smoking_status'].unique()

array([nan,  0.,  2.,  1.])

In [13]:
df_test['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'Unknown', 'smokes'],
      dtype=object)

In [14]:
smoking_status_mapping = {'Unknown': float("nan"), 'never smoked': 0, 'smokes': 2, 'formerly smoked': 1}
df_test['smoking_status'] = df_test['smoking_status'].map(smoking_status_mapping)
df_test['smoking_status'].unique()

array([ 1.,  0., nan,  2.])

In [15]:
df['ever_married'].unique()

array(['No', 'Yes'], dtype=object)

In [16]:
marriage_mapping = {'No': 0, 'Yes': 1}
df['ever_married'] = df['ever_married'].map(marriage_mapping)
df['ever_married'].unique()

array([0, 1])

In [17]:
df_test['ever_married'].unique()

array(['No', 'Yes'], dtype=object)

In [18]:
marriage_mapping = {'No': 0, 'Yes': 1}
df_test['ever_married'] = df_test['ever_married'].map(marriage_mapping)
df_test['ever_married'].unique()

array([0, 1])

In [19]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
Residence_type,1987,0.49675
smoking_status,1226,0.3065
bmi,1116,0.279
hypertension,230,0.0575
avg_glucose_level,171,0.04275
age,107,0.02675
heart_disease,90,0.0225
blood,10,0.0025
gender,0,0.0
ever_married,0,0.0


In [20]:
# perform a robust scaler transform of the dataset
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler()
columns = ['avg_glucose_level','bmi','age']
scaled = std_scale.fit_transform(df[columns])
scaled = pd.DataFrame(scaled,columns=columns)
df=df.drop(columns=columns,axis=1)

df = df.reset_index(drop=True)
df = df.merge(scaled, left_index=True, right_index=True, how = "left")
df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,blood,stroke,avg_glucose_level,bmi,age
0,Female,0.0,0.0,0,children,,,O,0,-0.452194,-1.301803,
1,Male,0.0,0.0,1,Private,0.0,0.0,B,0,-0.377598,,0.531985
2,Female,0.0,0.0,0,Private,,0.0,B,0,-0.169796,-1.314457,-0.047775
3,Female,0.0,0.0,0,Private,0.0,,O,0,0.060208,2.962504,-0.538341
4,Female,0.0,0.0,1,Private,,,O,0,-0.327201,,-0.226163
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,0.0,0.0,0,Private,,,B,0,-0.148482,-0.871576,-1.207295
3996,Male,0.0,0.0,1,Private,1.0,1.0,B,0,-0.685527,-0.491964,0.175210
3997,Other,1.0,0.0,1,Private,1.0,2.0,AB,0,0.518661,4.050725,0.621179
3998,Female,0.0,0.0,1,Private,,1.0,O,0,1.468647,2.076743,0.086016


In [21]:
# perform a robust scaler transform of the dataset
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler()
columns = ['avg_glucose_level','bmi','age']
scaled = std_scale.fit_transform(df_test[columns])
scaled = pd.DataFrame(scaled,columns=columns)
df_test=df_test.drop(columns=columns,axis=1)

df_test = df_test.reset_index(drop=True)
df_test = df_test.merge(scaled, left_index=True, right_index=True, how = "left")
df_test

Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,blood,avg_glucose_level,bmi,age
0,67,Female,0.0,0.0,0,Private,0.0,1.0,AB,-0.286363,,-1.149266
1,210,Male,0.0,0.0,1,Self-employed,1.0,0.0,A,-0.316685,0.281740,1.692220
2,242,Male,,0.0,0,children,0.0,,O,-0.167832,-1.399360,-1.726443
3,711,Male,0.0,,1,Private,1.0,0.0,AB,-0.286575,-0.830930,1.692220
4,724,Male,0.0,0.0,0,Private,1.0,0.0,O,-0.523851,1.890275,-1.149266
...,...,...,...,...,...,...,...,...,...,...,...,...
495,72393,Female,0.0,,1,Govt_job,1.0,0.0,A,-0.456845,-0.589045,-0.882876
496,72491,Male,0.0,0.0,1,Private,0.0,2.0,O,-0.674613,0.015667,0.449070
497,72562,Female,0.0,,1,Private,1.0,0.0,AB,-0.892804,0.451060,0.626663
498,72792,Female,,0.0,1,Private,1.0,0.0,AB,-0.605063,,0.449070


In [22]:
le = LabelEncoder()
le_feature = ['gender', 'work_type', 'blood']
for i in le_feature:
    df[i] = le.fit_transform(df[i])
df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,blood,stroke,avg_glucose_level,bmi,age
0,0,0.0,0.0,0,4,,,3,0,-0.452194,-1.301803,
1,1,0.0,0.0,1,2,0.0,0.0,2,0,-0.377598,,0.531985
2,0,0.0,0.0,0,2,,0.0,2,0,-0.169796,-1.314457,-0.047775
3,0,0.0,0.0,0,2,0.0,,3,0,0.060208,2.962504,-0.538341
4,0,0.0,0.0,1,2,,,3,0,-0.327201,,-0.226163
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1,0.0,0.0,0,2,,,2,0,-0.148482,-0.871576,-1.207295
3996,1,0.0,0.0,1,2,1.0,1.0,2,0,-0.685527,-0.491964,0.175210
3997,2,1.0,0.0,1,2,1.0,2.0,1,0,0.518661,4.050725,0.621179
3998,0,0.0,0.0,1,2,,1.0,3,0,1.468647,2.076743,0.086016


In [23]:
le = LabelEncoder()
le_feature = ['gender', 'work_type', 'blood']
for i in le_feature:
    df_test[i] = le.fit_transform(df_test[i])
df_test

Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,blood,avg_glucose_level,bmi,age
0,67,0,0.0,0.0,0,2,0.0,1.0,1,-0.286363,,-1.149266
1,210,1,0.0,0.0,1,3,1.0,0.0,0,-0.316685,0.281740,1.692220
2,242,1,,0.0,0,4,0.0,,3,-0.167832,-1.399360,-1.726443
3,711,1,0.0,,1,2,1.0,0.0,1,-0.286575,-0.830930,1.692220
4,724,1,0.0,0.0,0,2,1.0,0.0,3,-0.523851,1.890275,-1.149266
...,...,...,...,...,...,...,...,...,...,...,...,...
495,72393,0,0.0,,1,0,1.0,0.0,0,-0.456845,-0.589045,-0.882876
496,72491,1,0.0,0.0,1,2,0.0,2.0,3,-0.674613,0.015667,0.449070
497,72562,0,0.0,,1,2,1.0,0.0,1,-0.892804,0.451060,0.626663
498,72792,0,,0.0,1,2,1.0,0.0,1,-0.605063,,0.449070


In [24]:
df.to_csv('./Dataset/processed_train_le.csv', index=False)

In [25]:
X = df.copy()
X.drop(columns='stroke', inplace=True)
y = df['stroke']
print(len(X))
print(len(y))

4000
4000


In [26]:
imputer = KNNImputer()
X = imputer.fit_transform(X)
X

array([[ 0.        ,  0.        ,  0.        , ..., -0.45219368,
        -1.30180283, -1.57299009],
       [ 1.        ,  0.        ,  0.        , ..., -0.37759794,
         0.97839914,  0.53198547],
       [ 0.        ,  0.        ,  0.        , ..., -0.1697955 ,
        -1.31445656, -0.04777474],
       ...,
       [ 2.        ,  1.        ,  0.        , ...,  0.51866106,
         4.05072455,  0.62117935],
       [ 0.        ,  0.        ,  0.        , ...,  1.46864675,
         2.07674282,  0.08601608],
       [ 0.        ,  1.        ,  0.        , ..., -0.96859161,
         0.17868347,  0.44279159]])

In [27]:
X.shape

(4000, 11)

In [28]:
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(X, y)

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size= 0.2, random_state= 42)

In [30]:
X_Test_public = df_test.copy()
X_Test_public.drop(columns='id', inplace=True)
id_test = df_test['id']

In [33]:
imputer = KNNImputer()
X_Test_public = imputer.fit_transform(X_Test_public)
X_Test_public

array([[ 0.        ,  0.        ,  0.        , ..., -0.28636338,
        -0.23105564, -1.14926566],
       [ 1.        ,  0.        ,  0.        , ..., -0.31668541,
         0.28174032,  1.6922205 ],
       [ 1.        ,  0.        ,  0.        , ..., -0.16783181,
        -1.39935965, -1.72644254],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.89280398,
         0.45105974,  0.62666319],
       [ 0.        ,  0.        ,  0.        , ..., -0.60506276,
        -0.05689852,  0.4490703 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.07538142,
        -1.3872654 , -1.81523898]])

In [35]:
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
model_xgb.fit(x_train, y_train)
y_xgb_pred = model_xgb.predict(x_test)
f1_score(y_test, y_xgb_pred)



0.9863370201691607

In [36]:
y_pred = model_xgb.predict(X_Test_public)
len(y_pred)

500

In [37]:
data_test_submit = {'id': id_test, 'stroke': y_pred}
df_test_submit = pd.DataFrame(data= data_test_submit)
df_test_submit

Unnamed: 0,id,stroke
0,67,0
1,210,0
2,242,0
3,711,0
4,724,0
...,...,...
495,72393,0
496,72491,1
497,72562,0
498,72792,0


In [38]:
df_test_submit.to_csv('./Dataset/Submission/xgboost_knn.csv', index= False)

In [33]:
# # evaluate each strategy on the dataset
# from xgboost import XGBClassifier
# results = list()
# strategies = [str(i) for i in [1,3]]
# for s in strategies:
# 	# create the modeling pipeline
# 	pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', XGBClassifier())])
# 	# evaluate the model
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# 	# store results
# 	results.append(scores)
# 	print('>%s %.3f (%.3f)' % (s, mean(scores), std(scores)))
# # plot model performance for comparison
# plt.boxplot(results, labels=strategies, showmeans=True)
# plt.show()


In [55]:
# # create the modeling pipeline
# pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=1)), ('m', XGBClassifier())])
# # Fit the model
# pipeline.fit(x_train, y_train)
# # Make prediction
# y_pred = pipeline.predict(X_Test)



In [28]:
len(y_pred)

500

In [29]:
data_test_submit = {'id': id_test, 'stroke': y_pred}
df_test_submit = pd.DataFrame(data= data_test_submit)
df_test_submit

Unnamed: 0,id,stroke
0,67,0
1,210,0
2,242,0
3,711,0
4,724,0
...,...,...
495,72393,0
496,72491,0
497,72562,0
498,72792,0


In [30]:
df_test_submit.to_csv('./Dataset/Submission_XGBoost_with_KNN_Imputer_with_1.csv', index= False)