In [25]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [26]:
# Read the training and test CSV files from the Kaggle input directory.
# NOTE(review): the "_nogt" suffix presumably means the test file has no
# ground-truth 'stroke' column — confirm against the dataset.
train_df, test_df = map(
    pd.read_csv,
    (
        '/kaggle/input/datasets-train-test-stroke/stroke_train_set.csv',
        '/kaggle/input/datasets-train-test-stroke/stroke_test_set_nogt.csv',
    ),
)


In [27]:
# Separate the 'stroke' target column from the remaining feature columns
y = train_df['stroke']
X = train_df.drop(columns=['stroke'])

In [28]:
# Partition the feature columns by dtype: the first group is treated as
# numerical, the second as categorical, by the preprocessing step.
numerical_cols, categorical_cols = (
    X.select_dtypes(include=dtype_group).columns
    for dtype_group in (['int64', 'float64'], ['object', 'bool'])
)


In [29]:
# Preprocessing pipelines for numerical and categorical data.
# `Pipeline` is scikit-learn's sklearn.pipeline.Pipeline (imported in the import
# cell); it is distinct from imblearn's `ImbPipeline`, which is only needed
# where a resampling step such as SMOTE is part of the pipeline.

# Numerical columns: fill missing values with the column median, standardise,
# then expand into degree-2 polynomial terms without the constant bias column.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' avoids an error when a category
# not seen during fit shows up at transform time.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer.
# NOTE(review): numerical_cols is selected purely by dtype, so any integer
# identifier or 0/1 flag columns would also be scaled and polynomial-expanded
# — confirm this is intended for this dataset.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols)
])

In [33]:
# Candidate classifiers, keyed by the name that is also used to look up each
# model's hyperparameter grid. Every estimator is seeded with random_state=42.
models = dict(
    RandomForest=RandomForestClassifier(random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    ExtraTrees=ExtraTreesClassifier(random_state=42),
    LogisticRegression=LogisticRegression(random_state=42, max_iter=2000),
    # probability=True is required so SVC exposes predict_proba for soft voting
    SVC=SVC(probability=True, random_state=42),
    XGBoost=XGBClassifier(random_state=42),
)


In [34]:
# Per-model search spaces (deliberately small). Parameter names carry the
# 'classifier__' prefix because each model is tuned as the 'classifier' step
# of a pipeline.
param_grids = {
    'RandomForest': {
        'classifier__n_estimators': [100, 200],
    },
    'GradientBoosting': {
        'classifier__learning_rate': [0.01, 0.1],
    },
    'ExtraTrees': {
        'classifier__n_estimators': [100, 200],
    },
    'LogisticRegression': {
        'classifier__C': [0.1, 1],
    },
    'SVC': {
        'classifier__C': [1, 10],
    },
    'XGBoost': {
        'classifier__learning_rate': [0.01, 0.1],
    },
}

# Tune every candidate inside a preprocessing -> SMOTE -> classifier pipeline
# and keep the refitted best pipeline for each model name.
# SMOTE is placed inside the imblearn pipeline so that, within each CV split,
# oversampling is done on the training portion only (imblearn samplers are
# applied during fit, not during predict).
best_models = {}
for model_name, model in models.items():
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        # Seeded so the synthetic minority samples, and therefore the CV
        # scores and the selected hyperparameters, are reproducible.
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])
    # 5-fold CV scored on F1 of the positive class; n_jobs=-1 uses all cores.
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X, y)
    best_models[model_name] = grid_search.best_estimator_

In [35]:
# Soft-voting ensemble built from the tuned pipelines, fitted on the full
# training data.
tuned_estimators = list(best_models.items())
voting_clf = VotingClassifier(estimators=tuned_estimators, voting='soft')
voting_clf.fit(X, y)

In [36]:
# Resubstitution metrics: the ensemble is scored on the same rows it was
# fitted on, so these numbers are optimistic and only serve as a sanity check.
y_pred = voting_clf.predict(X)
print("F1 Score:", f1_score(y, y_pred))
print(classification_report(y, y_pred))

# Out-of-fold estimate: for each of 5 folds the ensemble is refitted on the
# other folds and predicts the held-out rows, so every prediction comes from a
# model that never saw that row. Hyperparameters were still chosen using all of
# X/y, so treat this as a slightly optimistic estimate of generalisation.
y_pred_cv = cross_val_predict(voting_clf, X, y, cv=5, n_jobs=-1)
print("Cross-validated F1 Score:", f1_score(y, y_pred_cv))
print(classification_report(y, y_pred_cv))

F1 Score: 0.7465069860279441
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      3888
           1       0.62      0.94      0.75       200

    accuracy                           0.97      4088
   macro avg       0.81      0.95      0.86      4088
weighted avg       0.98      0.97      0.97      4088



In [39]:
# Predict on the test features. The label column is dropped defensively in
# case the test file includes it; errors='ignore' makes this a no-op when the
# column is absent.
X_test = test_df.drop(columns=['stroke'], errors='ignore')
predictions = voting_clf.predict(X_test)

In [41]:
# Build the submission table (a 0-based positional ID plus the predicted
# label for each test row) and write it to CSV without the index column.
# NOTE(review): IDs are generated from row position, not read from the test
# file — confirm this matches the expected submission format.
submission_path = '24_final_submission.csv'
row_ids = range(len(test_df))
submission_df = pd.DataFrame({'ID': row_ids, 'stroke': predictions})
submission_df.to_csv(submission_path, index=False)