In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames (with column names)
# instead of NumPy arrays. The OneHotEncoder configured later in this notebook
# uses sparse_output=False, which this output mode requires.
set_config(transform_output='pandas')

In [19]:
# Read both competition CSVs in one go: the labelled file used for modelling and
# the file that the final predictions are made for.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('heart_train.csv', 'heart_test.csv')
)

In [20]:
# Inspect dtypes and non-null counts to decide which columns need encoding or imputation.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 818 entries, 0 to 817
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PatientId       818 non-null    int64  
 1   Age             818 non-null    int64  
 2   Sex             818 non-null    object 
 3   ChestPainType   818 non-null    object 
 4   RestingBP       818 non-null    int64  
 5   Cholesterol     818 non-null    int64  
 6   FastingBS       818 non-null    int64  
 7   RestingECG      818 non-null    object 
 8   MaxHR           818 non-null    int64  
 9   ExerciseAngina  818 non-null    object 
 10  Oldpeak         818 non-null    float64
 11  ST_Slope        818 non-null    object 
 12  HeartDisease    818 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 83.2+ KB


In [21]:
# Preview the first rows to see how the raw values are encoded.
df_train.head()

Unnamed: 0,PatientId,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,34,M,ATA,150,214,0,ST,168,N,0.0,Up,0
1,1,59,M,ASY,178,0,1,LVH,120,Y,0.0,Flat,1
2,2,58,M,ASY,115,0,1,Normal,138,N,0.5,Up,1
3,3,60,M,ASY,130,253,0,Normal,144,Y,1.4,Up,1
4,4,52,M,ASY,165,0,1,Normal,122,Y,1.0,Up,1


In [22]:
# Target first, then the feature frame: everything except the label and the
# row identifier (PatientId carries no predictive signal we want the model to use).
y = df_train['HeartDisease']
X = df_train.drop(columns=['HeartDisease', 'PatientId'])

In [23]:
# Hold out 20% of the labelled rows for the final model comparison.
# stratify=y keeps the HeartDisease class ratio the same in both parts, so the
# hold-out accuracy is not skewed by an unlucky class mix in a small test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [24]:
# Columns routed through the numeric branch of the preprocessor (imputed + scaled).
num_features = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'FastingBS',
    'MaxHR',
    'Oldpeak',
]

# Columns routed through the categorical branch (imputed + one-hot encoded).
cat_features = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
]

In [25]:
# Numeric branch: fill missing values with the column median, then standardise.
# df_train.info() reports no nulls, so the imputer only matters for unseen data.
# NOTE(review): df_train.head() shows Cholesterol == 0 for several rows; if 0 means
# "not measured", the imputer will not touch those values (it only fills NaN) — confirm.
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [26]:
# Categorical branch: missing values become the literal category 'Unknown',
# then every category is one-hot encoded into dense columns.
category_encoder = OneHotEncoder(
    min_frequency=0.01,                    # levels seen in <1% of rows are pooled as "infrequent"
    handle_unknown='infrequent_if_exist',  # unseen levels at predict time go to that pooled column
    sparse_output=False,
)
cat_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('encoder', category_encoder),
    ]
)

In [27]:
# Route each column group to its own sub-pipeline. Columns listed in neither
# group are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num-pipe', num_pipe, num_features),
        ('cat-pipe', cat_pipe, cat_features),
    ]
)

In [28]:
# Candidate classifiers keyed by display name; all seeded for reproducibility.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
])

In [29]:
# Baseline comparison of the candidate models.
# Each model is wrapped in the shared preprocessing pipeline and scored with
# 5-fold cross-validated accuracy on the training split only, so the hold-out
# split stays untouched. `results` maps model name -> mean/std of the fold scores.
results = {}
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    # cross_val_score fits clones of the pipeline, one per fold; `pipeline`
    # itself is left unfitted.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    results[name] = {'cv_mean': cv_scores.mean(), 'cv_std': cv_scores.std()}
    print(f"{name}: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")

In [30]:
# `pipeline` is whatever the loop in the previous cell assigned last, i.e. the
# pipeline for the final entry of `models`; fit it on the training split.
pipeline.fit(X_train, y_train)

In [31]:
# Random Forest: exhaustive grid search over tree count, depth and split/leaf sizes.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Keys use the `<step>__<param>` form so they reach the classifier inside the pipeline.
rf_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 15, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# 5-fold CV on the training split, ranked by accuracy, using all CPU cores.
rf_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rf_search.fit(X_train, y_train)

print(f"\nBest Random Forest params: {rf_search.best_params_}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits

Best Random Forest params: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


In [32]:
# Gradient Boosting: exhaustive grid search over tree count, depth, learning rate
# and row subsampling.
gb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(random_state=42)),
    ]
)

# Keys use the `<step>__<param>` form so they reach the classifier inside the pipeline.
gb_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.8, 1.0],
}

# NOTE(review): 3-fold CV here, while the Random Forest search uses 5 folds —
# presumably to limit runtime; confirm the difference is intentional.
gb_search = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=gb_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gb_search.fit(X_train, y_train)

print(f"\nBest Gradient Boosting params: {gb_search.best_params_}")


Fitting 3 folds for each of 54 candidates, totalling 162 fits

Best Gradient Boosting params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 1.0}


In [33]:
# Hold-out accuracy of each search's refitted best pipeline (the searches were
# configured with scoring='accuracy', which is what .score() reports).
best_rf_score, best_gb_score = (
    search.score(X_test, y_test) for search in (rf_search, gb_search)
)

In [34]:
# Keep whichever tuned pipeline scored higher on the hold-out split.
# Random Forest must be strictly better to win; a tie goes to Gradient Boosting.
final_model, model_name, best_score = (
    (rf_search.best_estimator_, "Random Forest", best_rf_score)
    if best_rf_score > best_gb_score
    else (gb_search.best_estimator_, "Gradient Boosting", best_gb_score)
)

In [35]:
# Refit the chosen pipeline on all labelled rows (training + hold-out) before
# predicting for the competition file.
# final_model is the same object as the winning search's best_estimator_, so
# this refit also changes what that search's predict()/score() will use.
final_model.fit(X, y)

In [36]:
# Competition features: drop the identifier only. The preprocessor picks the
# columns it needs by name.
X_competition = df_test.drop(columns='PatientId')
preds = final_model.predict(X_competition)

In [37]:
# Two-column submission: the original identifiers alongside the predicted label,
# written without the DataFrame index.
submission = df_test[['PatientId']].assign(HeartDisease=preds)

submission.to_csv('competition_submission.csv', index=False)