In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV


In [37]:
#%% Data Loading and Initial Analysis
# Forward slashes are accepted by pandas on Windows and POSIX alike. The former
# backslash path depended on "\d" and "\h" being left alone as invalid escape
# sequences, which Python warns about and which only resolves on Windows.
heart = pd.read_csv("../dataset/heart.csv")

# `heart` stays as the untouched raw frame; `heart_data` is the working copy
# that the following cells read. It was previously referenced here without
# ever being assigned, so the cell failed on a fresh kernel.
heart_data = heart.copy()

print("Class Distribution:\n", heart_data["target"].value_counts())
print("\nDataset Description:\n", heart_data.describe())

Class Distribution:
 target
1    165
0    138
Name: count, dtype: int64

Dataset Description:
               age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.366337    0.683168    0.966997  131.623762  246.264026    0.148515   
std      9.082101    0.466011    1.032052   17.538143   51.830751    0.356198   
min     29.000000    0.000000    0.000000   94.000000  126.000000    0.000000   
25%     47.500000    0.000000    0.000000  120.000000  211.000000    0.000000   
50%     55.000000    1.000000    1.000000  130.000000  240.000000    0.000000   
75%     61.000000    1.000000    2.000000  140.000000  274.500000    0.000000   
max     77.000000    1.000000    3.000000  200.000000  564.000000    1.000000   

          restecg     thalach       exang     oldpeak       slope          ca  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean     0.52

In [38]:
def preprocess_heart_data(data):
    """Return ``data`` with every row that contains a missing value removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified; ``dropna`` builds a new frame.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that have no missing values, with their original
        index labels preserved.
    """
    # Handle missing values. The result must be returned: rebinding the local
    # name alone (as before) left the caller with None.
    return data.dropna()

In [39]:
def convert_categorical_variables(data):
    """One-hot encode the categorical columns of ``data``.

    The columns 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca' and
    'thal' are each replaced by indicator columns named ``<column>_<value>``.
    All other columns are passed through unchanged. The input frame is not
    modified; a new frame is returned.
    """
    to_encode = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
    encoded = pd.get_dummies(data, columns=to_encode)
    return encoded

# One-hot encode starting from the raw `heart` frame rather than from
# `heart_data` itself, so this cell gives the same result every time it is run.
# Feeding the already-encoded `heart_data` back in raised a KeyError on a
# second run because the categorical columns no longer exist.
heart_data = convert_categorical_variables(heart)

In [40]:
# Run the missing-value preprocessing on the raw `heart` frame.
# NOTE(review): `heart_processed` is not read by any later cell shown here;
# the features and target are taken from `heart_data` instead. Confirm whether
# this step was meant to feed into `heart_data`.
heart_processed = preprocess_heart_data(data=heart)

In [41]:
#%% Data Preprocessing
# Pull out the label column first, then keep every remaining column as a feature.
y = heart_data["target"]
X = heart_data.drop(columns=["target"])

In [42]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [43]:
# Modelling pipeline: standardise the features, oversample with SMOTE, then
# fit a random forest. The step names are used later through `named_steps`
# and as the `classifier__` prefix of the search space.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [44]:
# Candidate random-forest settings, keyed by the bare estimator parameter name.
_rf_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', 'balanced_subsample'],
}

# Prefix every key with the pipeline step name so the search routes each value
# to the 'classifier' step.
param_dist = {f'classifier__{name}': values for name, values in _rf_grid.items()}


In [45]:
# Randomised hyperparameter search: 50 sampled configurations, each scored by
# ROC AUC under 5-fold cross-validation, using all available cores.
search_settings = dict(
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(pipeline, **search_settings)

In [46]:

# Run the search on the training split only; the test split stays unseen.
random_search.fit(X=X_train, y=y_train)

In [47]:
# Get the best model: the estimator exposed by the search as `best_estimator_`.
# Later cells use it for prediction, for feature importances and for pickling.
best_model = random_search.best_estimator_

In [48]:
# Predict class labels for the held-out test rows.
y_pred = best_model.predict(X=X_test)

In [49]:
# Compute the test-set metrics first, then emit the report line by line.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

report_lines = (
    "Heart Disease Prediction Results:",
    "================================",
    f"Best Parameters: {random_search.best_params_}",
    f"\nAccuracy Score: {test_accuracy:.4f}",
    "\nClassification Report:",
    report_text,
)
for line in report_lines:
    print(line)

Heart Disease Prediction Results:
Best Parameters: {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 30, 'classifier__class_weight': 'balanced'}

Accuracy Score: 0.8033

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.68      0.76        28
           1       0.77      0.91      0.83        33

    accuracy                           0.80        61
   macro avg       0.82      0.79      0.80        61
weighted avg       0.81      0.80      0.80        61



In [50]:
# Rank the features by the importance scores of the fitted forest step.
rf_step = best_model.named_steps['classifier']
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_step.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
    Feature  Importance
28   thal_2    0.139490
7      cp_0    0.120133
29   thal_3    0.103696
4   oldpeak    0.086865
21     ca_0    0.072050
3   thalach    0.071733
17  exang_1    0.052436
16  exang_0    0.051307
0       age    0.045901
2      chol    0.044299


In [52]:
# Persist the tuned heart-disease pipeline.
# The file was previously named 'diabetes_model.sav', which mislabels this
# heart-disease model and can overwrite an unrelated diabetes model stored
# under that name. Any loader that reads the old name must be updated too.
#
# 'model' is the complete pipeline and already contains the scaler step, so
# callers should pass unscaled (one-hot encoded) features to model.predict().
# 'scaler' is the same fitted object, kept for backward compatibility with
# consumers that read that key.
#
# Only load this file with pickle.load when it comes from a trusted location;
# unpickling can execute arbitrary code.
with open('heart_disease_model.sav', 'wb') as f:
    pickle.dump(
        {
            'model': best_model,
            'scaler': best_model.named_steps['scaler'],
        },
        f,
    )

In [53]:
#%% Verification of Balanced Predictions
# Share of each predicted class among the test-set predictions.
pred_share = pd.Series(y_pred).value_counts(normalize=True)
print("\nPrediction Distribution:")
print(pred_share)


Prediction Distribution:
1    0.639344
0    0.360656
Name: proportion, dtype: float64
