In [98]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
#import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [99]:
 def process_data(path, mean_children, mean_age, mean_bmi, mean_avg):
    df = pd.read_csv(path)
    df = df.drop(["height"], axis=1)

    #age
    for index, r in df.iterrows():
        if np.isnan(r["age"]) == True:
            if r["work_type"] == 'children':
                df["age"][index] = mean_children
            else: df["age"][index] = mean_age
    
    #bmi
    for i in range(len(df['bmi'])):
        if df['bmi'][i] >= 45:
            df['bmi'][i] = 30.0
    df['bmi'] = df['bmi'].fillna(mean_bmi)
    
    #avg_glicose
    df['avg_glucose_level'] = df['avg_glucose_level'].fillna(mean_avg)
    df['hypertension'] = df['hypertension'].fillna(0.0)
    df['heart_disease'] = df['heart_disease'].fillna(0.0)
    df['blood'] = df['blood'].fillna('AB')

    #marriage
    marriage_mapping = {'No': 0, 'Yes': 1}
    df['ever_married'] = df['ever_married'].map(marriage_mapping)

    #one_hot encoder for ...
    categorical_features_oh = ['gender', 'blood', 'smoking_status', 'Residence_type', 'work_type']
    for i in categorical_features_oh:
        df[i] = pd.Categorical(df[i])
        dfDummies = pd.get_dummies(df[i], prefix= i)
        df = pd.concat([df, dfDummies], axis=1)
    for i in categorical_features_oh:
        df.drop(columns=i, axis=1, inplace=True)
    
    #normalize
    norm_feature = ['age', 'avg_glucose_level', 'bmi']
    for i in norm_feature:
        max = df[i].to_numpy().max()
        min = df[i].to_numpy().min()
        n = max - min
        for j in range(len(df[i])):
            df[i][j] = (df[i][j] - min)/n
    
    return df

In [100]:
df_train = pd.read_csv('./Dataset/train.csv')

In [101]:
for i in range(len(df_train['bmi'])):
    if df_train['bmi'][i] >= 45:
        df_train['bmi'][i] = 30.0
mean_bmi = np.mean(df_train['bmi'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['bmi'][i] = 30.0


In [102]:
mean_age = np.mean(df_train["age"])

In [103]:
a = df_train["age"].tolist()
b = df_train["work_type"].tolist()
c = []
for i in range(len(a)):
    if b[i] == 'children' and np.isnan(a[i]) == False:
        c.append(a[i])

mean_children = np.mean(c)

In [104]:
mean_avg = np.mean(df_train['avg_glucose_level'])

In [112]:
df_train = process_data('./Dataset/train.csv', mean_children, mean_age, mean_bmi, mean_avg)
df_train = df_train.drop(['id'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["age"][index] = mean_children
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  else: df["age"][index] = mean_age
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bmi'][i] = 30.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i][j] = (df[i][j] - min)/n


In [113]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             4000 non-null   float64
 1   hypertension                    4000 non-null   float64
 2   heart_disease                   4000 non-null   float64
 3   ever_married                    4000 non-null   int64  
 4   avg_glucose_level               4000 non-null   float64
 5   bmi                             4000 non-null   float64
 6   stroke                          4000 non-null   int64  
 7   gender_Female                   4000 non-null   uint8  
 8   gender_Male                     4000 non-null   uint8  
 9   gender_Other                    4000 non-null   uint8  
 10  blood_A                         4000 non-null   uint8  
 11  blood_AB                        4000 non-null   uint8  
 12  blood_B                         40

MODEL

In [124]:
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
import xgboost as xgb

In [125]:
x = df_train.drop(["stroke"], axis=1).to_numpy()
y = df_train["stroke"].to_numpy()
# smote = SMOTE()
# x_smote, y_smote = smote.fit_resample(x, y)
# x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size= 0.2, random_state= 42)

In [120]:
# model_xgb = xgb.XGBClassifier()
# model_xgb.fit(x_train, y_train)
# y_xgb_pred = model_xgb.predict(x_test)
# f1_score(y_test, y_xgb_pred)

In [126]:
model_xgb = xgb.XGBClassifier()
resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
pipeline = Pipeline(steps=[('r', resample), ('m', model_xgb)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scoring = ['f1']
scores2 = cross_validate(pipeline, x, y, scoring=scoring, cv=cv, n_jobs=-1)
pipeline.fit(x, y)



Pipeline(steps=[('r',
                 SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))),
                ('m',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=4, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [127]:
scores2

{'fit_time': array([5.2034266 , 5.30127406, 5.2533648 , 5.16150308, 5.44162488,
        5.41216016, 5.34426498, 5.43726707, 8.50427842, 8.39324737,
        8.33821011, 8.77958846, 7.28124809, 7.24023581, 7.16623354,
        6.54308009, 5.6106863 , 5.65062785, 5.55370688, 5.93752384,
        5.57645726, 5.59646368, 5.55638003, 5.67765903, 6.84096742,
        6.84094405, 6.8419342 , 6.54191113, 3.4212122 , 3.40806746]),
 'score_time': array([0.0159564 , 0.01200438, 0.01994658, 0.01296639, 0.00897622,
        0.01296759, 0.01296544, 0.01396441, 0.00900984, 0.00700617,
        0.0100019 , 0.3704927 , 0.00800848, 0.01399493, 0.01248646,
        0.01247787, 0.01196885, 0.03813767, 0.01097155, 0.01695418,
        0.01000428, 0.01203871, 0.01399136, 0.01403141, 0.01897645,
        0.02000356, 0.01499581, 0.0100224 , 0.00897622, 0.00897646]),
 'test_f1': array([0.21052632, 0.5       , 0.4       , 0.41666667, 0.72      ,
        0.46153846, 0.41666667, 0.27272727, 0.4       , 0.4       ,
       

In [128]:
df_test = process_data('./Dataset/public_test.csv', mean_children, mean_age, mean_bmi, mean_avg)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  else: df["age"][index] = mean_age
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["age"][index] = mean_children
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bmi'][i] = 30.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i][j] = (df[i][j] - min)/n


In [129]:
id_test = df_test['id'].to_numpy()
x_test = df_test.drop(['id'], axis=1).to_numpy()
x_test = np.array(x_test)

In [132]:
y_xgb_pred_test = pipeline.predict(x_test)
data_test_submit = {'id': id_test, 'stroke': y_xgb_pred_test}
df_test_submit = pd.DataFrame(data= data_test_submit)
df_test_submit
df_test_submit.to_csv('./Dataset/Submission.csv', index= False)

In [123]:
x_test.shape

(500, 25)