In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

In [5]:
# Load datasets
train_df = pd.read_csv('/kaggle/input/data-set-stroke/stroke_train_set.csv')
test_df = pd.read_csv('/kaggle/input/data-set-stroke/stroke_test_set_nogt.csv')

In [6]:
# Splitting train data into features and target
X = train_df.drop('stroke', axis=1)
y = train_df['stroke']

In [2]:
# Preprocessing for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [8]:
# Identifying numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns


In [10]:
# Combine preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [11]:
# Define models
model_rf = RandomForestClassifier(random_state=42)
model_gb = GradientBoostingClassifier(random_state=42)
model_et = ExtraTreesClassifier(random_state=42)
model_lr = LogisticRegression(random_state=42, max_iter=2000)
model_svc = SVC(probability=True, random_state=42)
model_xgb = XGBClassifier(random_state=42)

In [26]:
# Create a preprocessing and modeling pipeline for each model
pipeline_rf = make_pipeline_imb(preprocessor, SMOTE(), SelectFromModel(model_rf), model_rf)
pipeline_gb = make_pipeline_imb(preprocessor, BorderlineSMOTE(), SelectFromModel(model_gb), model_gb)
pipeline_et = make_pipeline_imb(preprocessor, SMOTE(), SelectFromModel(model_et), model_et)
pipeline_lr = make_pipeline_imb(preprocessor, SMOTE(), SelectFromModel(model_lr), model_lr)
pipeline_svc = make_pipeline_imb(
    preprocessor,  # Assuming 'preprocessor' is already defined as before
    SMOTE(),
    SVC(probability=True, random_state=42)
)
pipeline_xgb = make_pipeline_imb(preprocessor, BorderlineSMOTE(), SelectFromModel(model_xgb), model_xgb)

In [27]:
# Parameter grids for hyperparameter tuning
param_grid_rf = {
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [10, 20, None],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'gradientboostingclassifier__n_estimators': [100, 200, 300],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [3, 5, 7],
    'gradientboostingclassifier__min_samples_split': [2, 5, 10],
    'gradientboostingclassifier__min_samples_leaf': [1, 2, 4]
}

param_grid_et = {
    'extratreesclassifier__n_estimators': [100, 200, 300],
    'extratreesclassifier__max_depth': [10, 20, None],
    'extratreesclassifier__min_samples_split': [2, 5, 10],
    'extratreesclassifier__min_samples_leaf': [1, 2, 4]
}

param_grid_lr = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__solver': ['lbfgs', 'liblinear']
}

param_grid_svc = {
    'svc__C': [1, 10, 100],
    'svc__gamma': ['scale', 'auto'],
    'svc__kernel': ['rbf', 'poly', 'sigmoid']  # Example kernels
}

param_grid_xgb = {
    'xgbclassifier__n_estimators': [100, 200, 300],
    'xgbclassifier__learning_rate': [0.01, 0.1, 0.2],
    'xgbclassifier__max_depth': [3, 5, 7],
    'xgbclassifier__min_child_weight': [1, 2, 5],
    'xgbclassifier__subsample': [0.6, 0.8, 1.0]
}

In [14]:
# Fit and tune RandomForestClassifier using GridSearchCV
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='f1', n_jobs=-1)
grid_search_rf.fit(X, y)
best_rf = grid_search_rf.best_estimator_

# Fit and tune GradientBoostingClassifier using GridSearchCV
#grid_search_gb = GridSearchCV(pipeline_gb, param_grid_gb, cv=5, scoring='f1', n_jobs=-1)
#grid_search_gb.fit(X, y)
#best_gb = grid_search_gb.best_estimator_

# Fit and tune ExtraTreesClassifier using GridSearchCV
#grid_search_et = GridSearchCV(pipeline_et, param_grid_et, cv=5, scoring='f1', n_jobs=-1)
#grid_search_et.fit(X, y)
#best_et = grid_search_et.best_estimator_

# Fit and tune LogisticRegression using GridSearchCV
#grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='f1', n_jobs=-1)
#grid_search_lr.fit(X, y)
#best_lr = grid_search_lr.best_estimator_

# Fit and tune SVC using GridSearchCV
#grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=5, scoring='f1', n_jobs=-1)
#grid_search_svc.fit(X, y)
#best_svc = grid_search_svc.best_estimator_

# Fit and tune XGBClassifier using GridSearchCV
#grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='f1', n_jobs=-1)
#grid_search_xgb.fit(X, y)
#best_xgb = grid_search_xgb.best_estimator_

In [15]:
# Fit and tune GradientBoostingClassifier using GridSearchCV
grid_search_gb = GridSearchCV(pipeline_gb, param_grid_gb, cv=5, scoring='f1', n_jobs=-1)
grid_search_gb.fit(X, y)
best_gb = grid_search_gb.best_estimator_

In [16]:
# Fit and tune ExtraTreesClassifier using GridSearchCV
grid_search_et = GridSearchCV(pipeline_et, param_grid_et, cv=5, scoring='f1', n_jobs=-1)
grid_search_et.fit(X, y)
best_et = grid_search_et.best_estimator_

In [17]:

# Fit and tune LogisticRegression using GridSearchCV
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='f1', n_jobs=-1)
grid_search_lr.fit(X, y)
best_lr = grid_search_lr.best_estimator_

In [28]:
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=5, scoring='f1', n_jobs=-1)
grid_search_svc.fit(X, y)  
best_svc = grid_search_svc.best_estimator_

In [31]:
# Fit and tune XGBClassifier using GridSearchCV
grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='f1', n_jobs=-1)
grid_search_xgb.fit(X, y)
best_xgb = grid_search_xgb.best_estimator_

In [32]:
# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb), ('et', best_et),
                ('lr', best_lr), ('svc', best_svc), ('xgb', best_xgb)],
    voting='soft'
)
voting_clf.fit(X, y)

In [33]:
# Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb), ('et', best_et),
                ('lr', best_lr), ('svc', best_svc), ('xgb', best_xgb)],
    final_estimator=LogisticRegression(random_state=42)
)
stacking_clf.fit(X, y)


In [35]:
# Predict and Evaluate
y_pred = voting_clf.predict(X)  # Replace with voting_clf.predict(X) for voting classifier
print("F1 Score:", f1_score(y, y_pred))
print(classification_report(y, y_pred))

F1 Score: 0.3187560738581146
              precision    recall  f1-score   support

           0       0.99      0.83      0.90      3888
           1       0.20      0.82      0.32       200

    accuracy                           0.83      4088
   macro avg       0.59      0.82      0.61      4088
weighted avg       0.95      0.83      0.87      4088

