In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import TomekLinks

from xgboost import XGBClassifier

In [2]:
# Load datasets
# Directory that holds both competition CSVs. Kept in one constant so the notebook
# can be pointed at a different location (e.g. a local copy) by editing a single line.
DATA_DIR = '/kaggle/input/ft-xboost-imb-stroke'

# Labelled training rows; the 'stroke' column is used as the target further down.
train_df = pd.read_csv(f'{DATA_DIR}/stroke_train_set.csv')
# Rows to predict for the submission (presumably no ground truth, per the "nogt" file name).
test_df = pd.read_csv(f'{DATA_DIR}/stroke_test_set_nogt.csv')

In [9]:
# Separate the target column from the predictors.
# `y` is the 'stroke' label; `X` is every other column of the training frame.
y = train_df['stroke']
X = train_df.drop(columns=['stroke'])

In [7]:
# Partition the feature columns by dtype so each group can get its own preprocessing.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'bool']

numerical_cols = X.select_dtypes(include=numeric_dtypes).columns
categorical_cols = X.select_dtypes(include=categorical_dtypes).columns

In [11]:
# Per-dtype preprocessing recipes.

# Numerical columns: fill gaps with the median, standardise, then add degree-2
# polynomial/interaction terms (no constant bias column).
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
]
numerical_transformer = Pipeline(numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fitting are ignored instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [15]:
# Route each column group through its own transformer and concatenate the results.
column_blocks = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_blocks)

In [16]:
# Candidate classifiers. Every one is created with the same seed.
RANDOM_STATE = 42

model_rf = RandomForestClassifier(random_state=RANDOM_STATE)
model_gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
model_et = ExtraTreesClassifier(random_state=RANDOM_STATE)
# Raised iteration cap for the logistic-regression solver.
model_lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=2000)
# probability=True exposes predict_proba, which the soft-voting ensemble below relies on.
model_svc = SVC(probability=True, random_state=RANDOM_STATE)
model_xgb = XGBClassifier(random_state=RANDOM_STATE)

In [17]:
# Create a preprocessing and modeling pipeline for each model
def _make_resampled_pipeline(model):
    """Chain the shared preprocessor, SMOTE oversampling and ``model`` into one pipeline.

    SMOTE is seeded so that the synthetic minority samples -- and therefore the
    grid-search scores, the selected hyperparameters and the final predictions --
    are reproducible from run to run. Each call builds its own SMOTE instance.

    Parameters
    ----------
    model : estimator
        The classifier to place as the final pipeline step.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Steps are auto-named after their lower-cased class names
        (e.g. ``randomforestclassifier``), which the parameter grids rely on.
    """
    return make_pipeline_imb(preprocessor, SMOTE(random_state=42), model)


pipeline_rf = _make_resampled_pipeline(model_rf)
pipeline_gb = _make_resampled_pipeline(model_gb)
pipeline_et = _make_resampled_pipeline(model_et)
pipeline_lr = _make_resampled_pipeline(model_lr)
pipeline_svc = _make_resampled_pipeline(model_svc)
pipeline_xgb = _make_resampled_pipeline(model_xgb)

In [18]:
# Hyperparameter grids for GridSearchCV.
# Keys follow the "<pipeline step name>__<parameter>" convention, where the step
# name is the lower-cased estimator class name.

# Value ranges shared by several grids; each grid receives its own copy.
_N_ESTIMATORS = [100, 200, 300]
_MIN_SAMPLES_SPLIT = [2, 5, 10]
_LEARNING_RATES = [0.01, 0.1, 0.2]
_BOOSTING_DEPTHS = [3, 5, 7]

# Random forest (max_depth=None lets trees grow until leaves are pure).
param_grid_rf = {
    'randomforestclassifier__n_estimators': list(_N_ESTIMATORS),
    'randomforestclassifier__max_depth': [10, 15, 20, None],
    'randomforestclassifier__min_samples_split': list(_MIN_SAMPLES_SPLIT),
}

# Gradient boosting.
param_grid_gb = {
    'gradientboostingclassifier__n_estimators': list(_N_ESTIMATORS),
    'gradientboostingclassifier__learning_rate': list(_LEARNING_RATES),
    'gradientboostingclassifier__max_depth': list(_BOOSTING_DEPTHS),
}

# Extra trees.
param_grid_et = {
    'extratreesclassifier__n_estimators': list(_N_ESTIMATORS),
    'extratreesclassifier__max_depth': [10, 15, 20],
    'extratreesclassifier__min_samples_split': list(_MIN_SAMPLES_SPLIT),
}

# Logistic regression: regularisation parameter C only.
param_grid_lr = {
    'logisticregression__C': [0.1, 1, 10],
}

# Support vector classifier.
param_grid_svc = {
    'svc__C': [1, 10, 100],
    'svc__gamma': ['scale', 'auto'],
}

# XGBoost.
param_grid_xgb = {
    'xgbclassifier__n_estimators': list(_N_ESTIMATORS),
    'xgbclassifier__learning_rate': list(_LEARNING_RATES),
    'xgbclassifier__max_depth': list(_BOOSTING_DEPTHS),
}



In [19]:
# Hyperparameter search: one exhaustive grid search per pipeline.
def _run_grid_search(pipeline, param_grid):
    """Fit and return a GridSearchCV for ``pipeline`` over ``param_grid`` on (X, y).

    Every search uses 5-fold cross-validation, F1 scoring and all available
    cores (``n_jobs=-1``).
    """
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
    search.fit(X, y)
    return search


grid_search_rf = _run_grid_search(pipeline_rf, param_grid_rf)
grid_search_gb = _run_grid_search(pipeline_gb, param_grid_gb)
grid_search_et = _run_grid_search(pipeline_et, param_grid_et)
grid_search_lr = _run_grid_search(pipeline_lr, param_grid_lr)
grid_search_svc = _run_grid_search(pipeline_svc, param_grid_svc)
grid_search_xgb = _run_grid_search(pipeline_xgb, param_grid_xgb)

# Leave the last fitted search as the cell's displayed value.
grid_search_xgb



In [21]:
# Pull the refitted best pipeline out of each finished grid search.
best_rf, best_gb, best_et, best_lr, best_svc, best_xgb = (
    search.best_estimator_
    for search in (
        grid_search_rf,
        grid_search_gb,
        grid_search_et,
        grid_search_lr,
        grid_search_svc,
        grid_search_xgb,
    )
)



In [22]:
# Soft-voting ensemble over the six tuned pipelines: the predicted class is the
# one with the highest summed predicted probability across members.
tuned_pipelines = {
    'rf': best_rf,
    'gb': best_gb,
    'et': best_et,
    'lr': best_lr,
    'svc': best_svc,
    'xgb': best_xgb,
}
voting_clf = VotingClassifier(estimators=list(tuned_pipelines.items()), voting='soft')
voting_clf.fit(X, y)

In [23]:
# Stacking ensemble (optional alternative to the voting ensemble): a logistic
# regression meta-learner is trained on the outputs of the six tuned pipelines.
base_learners = [
    ('rf', best_rf),
    ('gb', best_gb),
    ('et', best_et),
    ('lr', best_lr),
    ('svc', best_svc),
    ('xgb', best_xgb),
]
meta_learner = LogisticRegression(random_state=42)
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)
stacking_clf.fit(X, y)

In [24]:
# Evaluation
# In-sample check: voting_clf was fitted on exactly this (X, y), so the numbers
# printed here are optimistic. They only confirm that the ensemble trained and
# predicts; they are NOT an estimate of performance on unseen data.
y_pred = voting_clf.predict(X)  # or y_pred = stacking_clf.predict(X)
print("F1 Score:", f1_score(y, y_pred))
print(classification_report(y, y_pred))

# Out-of-fold estimate: every row is predicted by a clone of the ensemble that was
# fitted without that row's fold, so no prediction is scored on its own training data.
# Caveat: the hyperparameters were tuned on the full training set, so this is still
# slightly optimistic; a held-out split or nested CV would remove that as well.
eval_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_oof = cross_val_predict(voting_clf, X, y, cv=eval_cv, n_jobs=-1)
print("Cross-validated F1 Score:", f1_score(y, y_pred_oof))
print(classification_report(y, y_pred_oof))

F1 Score: 0.3283877349159248
              precision    recall  f1-score   support

           0       0.99      0.83      0.91      3888
           1       0.20      0.83      0.33       200

    accuracy                           0.83      4088
   macro avg       0.60      0.83      0.62      4088
weighted avg       0.95      0.83      0.88      4088



In [25]:
# Predict labels for the unlabelled test rows with the model chosen for submission.
# Set submission_model = stacking_clf to submit the stacked ensemble instead.
submission_model = voting_clf
final_predictions = submission_model.predict(test_df)

In [26]:
# Write the submission CSV: one row per test record, with a 0-based positional ID
# and the predicted 'stroke' label. The DataFrame index is not written.
submission_path = 'final_submission.csv'
submission_df = pd.DataFrame(
    {
        'ID': range(len(test_df)),
        'stroke': final_predictions,
    }
)
submission_df.to_csv(submission_path, index=False)