In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [23]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
import sklearn as sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV


import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as reg
from sklearn.linear_model import LogisticRegression, LassoCV
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Read the four CSV inputs in one pass: competition train/test, the sample
# submission, and the stroke-prediction dataset stored in ogdata_df.
train_df, test_df, ss, ogdata_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e2/train.csv',
        '/kaggle/input/playground-series-s3e2/test.csv',
        '/kaggle/input/playground-series-s3e2/sample_submission.csv',
        '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv',
    )
)

In [None]:
# Remove the row identifier from both training sources.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once 'id' is gone.
train_df = train_df.drop(columns='id', errors='ignore')
ogdata_df = ogdata_df.drop(columns='id', errors='ignore')

In [None]:
# Keep only the rows of ogdata_df whose 'stroke' value is 1.
ogdata_df = ogdata_df[ogdata_df['stroke'] == 1]

# Preview goes last: only a cell's final expression is rendered, so placing
# head() before the assignment discarded its output.
train_df.head()

In [None]:
# Column dtypes and non-null counts for train_df.
train_df.info()

In [None]:
# Column dtypes and non-null counts for ogdata_df (already filtered to stroke == 1 above).
ogdata_df.info()

In [None]:
# Summary statistics of train_df using describe()'s default settings.
train_df.describe()

In [None]:
# Stack the ogdata_df rows underneath train_df and renumber the index.
# NOTE: train_df is rebound to the combined frame, so re-running this cell
# appends ogdata_df a second time.
train_df = pd.concat((train_df, ogdata_df), ignore_index=True)
train_df.info()

In [None]:
# Count of each target class; the plot shows that the classes are imbalanced.
sns.countplot(data=train_df, x='stroke')
plt.show()

In [None]:
# Count of rows per heart_disease value.
sns.countplot(data=train_df, x='heart_disease')

In [None]:
# Correlation heatmap of the numeric columns.
# train_df also holds string-valued categorical columns; recent pandas versions
# raise when corr() meets them, so select the numeric columns explicitly.
plt.figure(figsize=(10, 8))
numeric_corr = train_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='viridis')

In [None]:
# Age against BMI, coloured by the stroke label; alpha softens overplotting.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=train_df, x='age', y='bmi', hue='stroke', alpha=0.5, ax=plt.gca())

In [None]:
# Smoking-status counts split by gender; the legend is placed outside the axes.
plt.figure(figsize=(12, 8))
sns.countplot(data=train_df, x='smoking_status', hue='gender')
legend_position = (1.1, 0.5)
plt.legend(loc=legend_position)
plt.show()

In [None]:
# Distribution of avg_glucose_level with a KDE overlay.
sns.displot(data=train_df, x='avg_glucose_level', kde=True)
plt.show()

In [None]:
# Mean age for each hypertension value, with one bar per gender.
sns.barplot(
    data=train_df,
    x='hypertension',
    y='age',
    hue='gender',
    estimator=np.mean,
)
plt.title("Avg age of people with hypertension")
plt.show()

In [None]:
# Separate the target from the features without touching train_df itself.
y_full = train_df['stroke'].to_numpy()
x_full = train_df.drop(columns='stroke')

# Every feature column that is not listed as numeric is treated as categorical.
num_cols = ["age", "avg_glucose_level", "bmi"]
cat_cols = x_full.columns.difference(num_cols)
print(cat_cols)

# Numeric columns: fill missing values with the median, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode, dropping the first level of each column.
tr = ColumnTransformer(transformers=[
    ("num", num_pipe, num_cols),
    ("cat", OneHotEncoder(drop="first"), cat_cols),
])

# Fit on the training features only, then apply the fitted transform to the test frame.
x_full = tr.fit_transform(x_full)
x_test = tr.transform(test_df)

In [None]:
def get_model():
    """Build and compile the dense binary classifier trained on each CV fold.

    The network is three ReLU Dense layers (256, 128, 64 units), each followed
    by Dropout(0.1), and a single sigmoid output unit. It is compiled with the
    Adam optimizer and a sigmoid focal cross-entropy loss (alpha=0.80,
    gamma=2.0).

    Returns:
        A new compiled ``tf.keras`` Sequential model.
    """
    dropout_rate = 0.1
    model = tf.keras.models.Sequential([
        layers.Dense(256, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(128, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(64, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tfa.losses.SigmoidFocalCrossEntropy(alpha=0.80, gamma=2.0),
        # Name the metric explicitly. When given as the string "AUC", Keras
        # auto-numbers it per model ("auc", "auc_1", ...), so callbacks that
        # monitor "val_auc" would only match the first model built in a session.
        metrics=[tf.keras.metrics.AUC(name="auc")],
    )
    return model

In [None]:
# Callbacks used for every fold; both track validation AUC (higher is better).
# The learning-rate scheduler is intentionally NOT named `plt`: the import cell
# binds that name to matplotlib.pyplot, and rebinding it breaks any plotting
# cell that is (re-)run afterwards.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_auc", mode='max', patience=3, factor=0.1, min_lr=1e-6, min_delta=0.0001)
es = tf.keras.callbacks.EarlyStopping(monitor="val_auc", mode='max', patience=7, min_delta=0.0001, restore_best_weights = True)

# `models` collects the fitted models of every model family in training order;
# later cells append to it and slice it by position.
models = []
scores = []

# Fixed seed so the shuffled folds are reproducible across runs. `skf` is
# reused by the later cross-validation cells.
skf = StratifiedKFold(n_splits=12, shuffle=True, random_state=42)


for train_index, val_index in skf.split(x_full, y_full):
    x_train, x_val = x_full[train_index], x_full[val_index]
    y_train, y_val = y_full[train_index], y_full[val_index]

    model = get_model()
    model.fit(
        x_train, y_train,
        validation_data = (x_val, y_val),
        epochs = 100,
        batch_size = 64,
        callbacks = [reduce_lr, es],
        # Up-weight the positive class to counter the class imbalance.
        class_weight = { 0: 1.0, 1: 10.0, },
        verbose=0
    )

    # Flatten the prediction array to 1-D before scoring.
    val_pred = np.asarray(model.predict(x_val)).ravel()
    s = roc_auc_score(y_val, val_pred)
    print(f"val auc: {s:.4f}")
    scores.append(s)
    models.append(model)

print(f'mean scores:  {np.mean(scores):.4f}')

In [None]:
import catboost as cb

# Fresh per-fold score list for CatBoost; fitted models are appended to the
# shared `models` list that already holds the earlier models.
scores = []

# Training parameters passed unchanged to cb.train for every fold.
cb_params = dict(
    depth=3,
    learning_rate=0.01,
    rsm=0.5,
    subsample=0.931,
    l2_leaf_reg=69,
    min_data_in_leaf=20,
    random_strength=0.175,
    use_best_model=True,
    task_type='CPU',
    bootstrap_type='Bernoulli',
    grow_policy='SymmetricTree',
    loss_function='Logloss',
    eval_metric='AUC',
)

for train_index, val_index in skf.split(x_full, y_full):
    # Wrap the fold's train and validation slices in CatBoost pools.
    cb_train = cb.Pool(data=x_full[train_index], label=y_full[train_index])
    cb_valid = cb.Pool(data=x_full[val_index], label=y_full[val_index])

    # Train with early stopping evaluated on the validation pool.
    model = cb.train(
        params=cb_params,
        dtrain=cb_train,
        num_boost_round=10000,
        evals=cb_valid,
        early_stopping_rounds=500,
        verbose=False,
    )

    valid_pred = model.predict(cb_valid)
    s = roc_auc_score(y_full[val_index], valid_pred)
    print(f"Best val auc: {s:.4f}")
    scores.append(s)
    models.append(model)

print(f'mean scores:  {np.mean(scores):.4f}')

In [None]:
# Fresh per-fold score list for LassoCV; fitted models are appended to the
# shared `models` list after the earlier models.
scores = []

for train_index, val_index in skf.split(x_full, y_full):
    x_train, y_train = x_full[train_index], y_full[train_index]
    x_val, y_val = x_full[val_index], y_full[val_index]

    # LassoCV is a regressor: its continuous output is used directly as a
    # ranking score for the AUC below.
    model = LassoCV(
        eps=1e-04,
        n_alphas=1000,
        precompute="auto",
        fit_intercept=True,
        max_iter=1000,
        verbose=False,
        n_jobs=8,
    )
    model.fit(x_train, y_train)

    val_pred = model.predict(x_val)
    s = roc_auc_score(y_val, val_pred)
    print(f"Best val auc: {s:.4f}")
    scores.append(s)
    models.append(model)

print(f'mean scores:  {np.mean(scores):.4f}')

In [None]:
# `models` holds one model per fold for each family, in training order:
# neural nets first, then CatBoost, then LassoCV. Derive the slice width from
# the splitter instead of hard-coding the fold count.
# NOTE: this positional split assumes each training cell was run exactly once.
n_folds = skf.get_n_splits()
nn_models = models[:n_folds]
cb_models = models[n_folds:2 * n_folds]
lasso_models = models[2 * n_folds:]

nn_preds = [model.predict(x_test) for model in nn_models]

# CatBoost.predict returns raw log-odds by default. Ask for probabilities so the
# values are on the same scale as the other models when they are blended.
cb_preds = []
for model in cb_models:
    proba = np.asarray(model.predict(x_test, prediction_type='Probability'))
    # For a binary model the probability output has one column per class;
    # keep the positive-class column.
    cb_preds.append(proba[:, 1] if proba.ndim == 2 else proba)

lasso_preds = [model.predict(x_test) for model in lasso_models]

In [None]:
# Collapse each family's list of per-fold predictions into its fold-wise mean.
cb_preds = np.mean(np.array(cb_preds), axis=0)
nn_preds = np.mean(np.array(nn_preds), axis=0)
lasso_preds = np.mean(np.array(lasso_preds), axis=0)

In [None]:
# Flatten the averaged network predictions to 1-D. Using -1 lets NumPy infer
# the length instead of hard-coding the number of test rows, so the cell keeps
# working if the test set size changes.
nn_preds = nn_preds.reshape(-1)
nn_preds

In [None]:
# Blend: CatBoost and the network are averaged first, then that average is
# averaged with the Lasso predictions (effective weights 0.25 / 0.25 / 0.5).
tree_nn_blend = cb_preds * 0.5 + nn_preds * 0.5
ss['stroke'] = tree_nn_blend * 0.5 + lasso_preds * 0.5

# Write the submission file and read it back as a quick check.
ss.to_csv("submission.csv", index=False)
pd.read_csv("submission.csv").head()