In [10]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import seaborn as sns
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
  
# metadata 
print(heart_disease.metadata) 
  
# variable information 
print(heart_disease.variables) 


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sa

In [3]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0


In [4]:
y.head()

Unnamed: 0,num
0,0
1,2
2,1
3,0
4,0


In [6]:
combined_df = pd.concat([X, y], axis=1)

# Export to CSV
csv_file_path = './heart_disease.csv'
combined_df.to_csv(csv_file_path, index=False)

csv_file_path

'./heart_disease.csv'

In [13]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load your data
heart_disease_data = pd.read_csv('heart_disease.csv')



In [14]:
# Define a dictionary for renaming the columns
renaming_dict = {
    'age': 'age',
    'sex': 'gender',
    'cp': 'chest_pain_type',
    'trestbps': 'resting_blood_pressure',
    'chol': 'serum_cholesterol_mg_per_dl',
    'fbs': 'fasting_blood_sugar_gt_120_mg_per_dl',
    'restecg': 'resting_ecg_results',
    'thalach': 'maximum_heart_rate_achieved',
    'exang': 'exercise_induced_angina',
    'oldpeak': 'st_depression_induced_by_exercise_relative_to_rest',
    'slope': 'slope_of_peak_exercise_st_segment',
    'ca': 'number_of_major_vessels_colored_by_flourosopy',
    'thal': 'thalassemia',
    'num': 'diagnosis_of_heart_disease'
}

# Code to rename the columns
heart_disease_data_renamed = heart_disease_data.rename(columns=renaming_dict)

In [15]:
from sklearn.impute import KNNImputer

# Initialize the KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Impute the missing values using K-Nearest Neighbors
heart_disease_data_imputed = pd.DataFrame(imputer.fit_transform(heart_disease_data_renamed),
                                          columns=heart_disease_data_renamed.columns)

# Check if there are any missing values left
missing_values_after_imputation = heart_disease_data_imputed.isnull().sum()

heart_disease_data_imputed.head(), missing_values_after_imputation


(    age  gender  chest_pain_type  resting_blood_pressure  \
 0  63.0     1.0              1.0                   145.0   
 1  67.0     1.0              4.0                   160.0   
 2  67.0     1.0              4.0                   120.0   
 3  37.0     1.0              3.0                   130.0   
 4  41.0     0.0              2.0                   130.0   
 
    serum_cholesterol_mg_per_dl  fasting_blood_sugar_gt_120_mg_per_dl  \
 0                        233.0                                   1.0   
 1                        286.0                                   0.0   
 2                        229.0                                   0.0   
 3                        250.0                                   0.0   
 4                        204.0                                   0.0   
 
    resting_ecg_results  maximum_heart_rate_achieved  exercise_induced_angina  \
 0                  2.0                        150.0                      0.0   
 1                  2.0       

## Preprocessing

In [20]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify categorical and numerical columns
categorical_cols = heart_disease_data_imputed.select_dtypes(include=['object', 'category']).columns
numerical_cols = heart_disease_data_imputed.select_dtypes(include=['int64', 'float64']).columns

# Create a transformer for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create a transformer for categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Apply transformations to the dataset
heart_disease_data_preprocessed = preprocessor.fit_transform(heart_disease_data_imputed)

array([[ 0.94872647,  0.68620244, -2.25177456, ..., -0.72166849,
         0.6558948 , -0.76419779],
       [ 1.39200191,  0.68620244,  0.87798549, ...,  2.49715712,
        -0.89754025,  0.86644961],
       [ 1.39200191,  0.68620244,  0.87798549, ...,  1.42421525,
         1.17370648,  0.05112591],
       ...,
       [ 0.28381332,  0.68620244,  0.87798549, ...,  0.35127338,
         1.17370648,  1.68177331],
       [ 0.28381332, -1.4572959 , -1.20852121, ...,  0.35127338,
        -0.89754025,  0.05112591],
       [-1.82174501,  0.68620244, -0.16526786, ..., -0.72166849,
        -0.89754025, -0.76419779]])

## Modeling

In [24]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

target = heart_disease_data_imputed['diagnosis_of_heart_disease'].values

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(heart_disease_data_preprocessed, target, test_size=0.2, random_state=42)

# Initialize the classifier
rf = RandomForestClassifier(random_state=42)

# Define the grid of parameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    # Add other parameters here
}

# Initialize the grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Predict on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.9344262295081968