In [32]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
import warnings
warnings.filterwarnings("ignore")



In [33]:
import zipfile
zip_path = r'C:\Users\vivek\OneDrive\Desktop\Machine Learning(IA)\ML_Assignments\Datasets\statlog+heart.zip'

# Open the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # List all files in the zip
    print(zip_ref.namelist())



['heart.dat', 'heart.doc', 'Index']


In [34]:
import zipfile
import os

# Define the path to the ZIP file
zip_path = r'C:\Users\vivek\OneDrive\Desktop\Machine Learning(IA)\ML_Assignments\Datasets\statlog+heart.zip'

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(r'C:\Users\vivek\OneDrive\Desktop\Machine Learning(IA)\ML_Assignments\Datasets\statlog+heart')

# Define the path to the extracted dataset
file_path = r'C:\Users\vivek\OneDrive\Desktop\Machine Learning(IA)\ML_Assignments\Datasets\statlog+heart\heart.dat'

# Load the dataset
data = pd.read_csv(file_path, delimiter=' ')  # Change the delimiter if needed
print(data.head())


   70.0  1.0  4.0  130.0  322.0  0.0  2.0  109.0  0.0.1  2.4  2.0.1  3.0  \
0  67.0  0.0  3.0  115.0  564.0  0.0  2.0  160.0    0.0  1.6    2.0  0.0   
1  57.0  1.0  2.0  124.0  261.0  0.0  0.0  141.0    0.0  0.3    1.0  0.0   
2  64.0  1.0  4.0  128.0  263.0  0.0  0.0  105.0    1.0  0.2    2.0  1.0   
3  74.0  0.0  2.0  120.0  269.0  0.0  2.0  121.0    1.0  0.2    1.0  1.0   
4  65.0  1.0  4.0  120.0  177.0  0.0  0.0  140.0    0.0  0.4    1.0  0.0   

   3.0.1  2  
0    7.0  1  
1    7.0  2  
2    7.0  1  
3    3.0  1  
4    7.0  1  


In [35]:
# Assign column names based on the description you provided
columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'serum_cholesterol',
    'fasting_blood_sugar', 'resting_ecg', 'max_heart_rate', 'exercise_induced_angina',
    'oldpeak', 'slope', 'num_major_vessels', 'thal', 'target'
]
data.columns = columns

# Show the first few rows of the dataset
print(data.head())

    age  sex  chest_pain_type  resting_blood_pressure  serum_cholesterol  \
0  67.0  0.0              3.0                   115.0              564.0   
1  57.0  1.0              2.0                   124.0              261.0   
2  64.0  1.0              4.0                   128.0              263.0   
3  74.0  0.0              2.0                   120.0              269.0   
4  65.0  1.0              4.0                   120.0              177.0   

   fasting_blood_sugar  resting_ecg  max_heart_rate  exercise_induced_angina  \
0                  0.0          2.0           160.0                      0.0   
1                  0.0          0.0           141.0                      0.0   
2                  0.0          0.0           105.0                      1.0   
3                  0.0          2.0           121.0                      1.0   
4                  0.0          0.0           140.0                      0.0   

   oldpeak  slope  num_major_vessels  thal  target  
0      1.

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      269 non-null    float64
 1   sex                      269 non-null    float64
 2   chest_pain_type          269 non-null    float64
 3   resting_blood_pressure   269 non-null    float64
 4   serum_cholesterol        269 non-null    float64
 5   fasting_blood_sugar      269 non-null    float64
 6   resting_ecg              269 non-null    float64
 7   max_heart_rate           269 non-null    float64
 8   exercise_induced_angina  269 non-null    float64
 9   oldpeak                  269 non-null    float64
 10  slope                    269 non-null    float64
 11  num_major_vessels        269 non-null    float64
 12  thal                     269 non-null    float64
 13  target                   269 non-null    int64  
dtypes: float64(13), int64(1)
m

In [37]:
data.isnull().sum()

age                        0
sex                        0
chest_pain_type            0
resting_blood_pressure     0
serum_cholesterol          0
fasting_blood_sugar        0
resting_ecg                0
max_heart_rate             0
exercise_induced_angina    0
oldpeak                    0
slope                      0
num_major_vessels          0
thal                       0
target                     0
dtype: int64

In [38]:
continuous_features = ['age', 'resting_blood_pressure', 'serum_cholesterol', 
                       'max_heart_rate', 'oldpeak', 'num_major_vessels']
categorical_features = ['sex', 'chest_pain_type', 'resting_ecg', 'exercise_induced_angina', 'thal']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),  # Scaling continuous features
        ('cat', OneHotEncoder(), categorical_features)   # One-hot encoding categorical features
    ])

In [39]:
X = data.drop('target', axis=1)  # Features
y = data['target'].replace({1: 0, 2: 1})

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [42]:
# Models to apply
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(silent=True)
}

# Training and evaluation for each model
for model_name, model in models.items():
    print(f"Training and Evaluating {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("-" * 50)


Training and Evaluating Logistic Regression...
Accuracy: 0.8333333333333334
Confusion Matrix:
[[23  3]
 [ 6 22]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.88      0.84        26
           1       0.88      0.79      0.83        28

    accuracy                           0.83        54
   macro avg       0.84      0.84      0.83        54
weighted avg       0.84      0.83      0.83        54

--------------------------------------------------
Training and Evaluating SVM...
Accuracy: 0.8518518518518519
Confusion Matrix:
[[25  1]
 [ 7 21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.96      0.86        26
           1       0.95      0.75      0.84        28

    accuracy                           0.85        54
   macro avg       0.87      0.86      0.85        54
weighted avg       0.87      0.85      0.85        54

---------------------------------------

Accuracy: 0.7407407407407407
Confusion Matrix:
[[22  4]
 [10 18]]
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.85      0.76        26
           1       0.82      0.64      0.72        28

    accuracy                           0.74        54
   macro avg       0.75      0.74      0.74        54
weighted avg       0.76      0.74      0.74        54

--------------------------------------------------
Training and Evaluating XGBoost...
Accuracy: 0.7777777777777778
Confusion Matrix:
[[21  5]
 [ 7 21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.81      0.78        26
           1       0.81      0.75      0.78        28

    accuracy                           0.78        54
   macro avg       0.78      0.78      0.78        54
weighted avg       0.78      0.78      0.78        54

--------------------------------------------------
Training and Evaluating CatBoos

In [43]:
# Initialize lists to store results
results = []

# Training and evaluation for each model
for model_name, model in models.items():
    start_time = time.time()  # Start the timer
    
    print(f"Training and Evaluating {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=True)
    
    # End the timer and calculate runtime
    end_time = time.time()
    runtime = end_time - start_time
    
    # Store results for comparison
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision (Class 0)': class_report['0']['precision'],
        'Precision (Class 1)': class_report['1']['precision'],
        'Recall (Class 0)': class_report['0']['recall'],
        'Recall (Class 1)': class_report['1']['recall'],
        'F1-Score (Class 0)': class_report['0']['f1-score'],
        'F1-Score (Class 1)': class_report['1']['f1-score'],
        'Confusion Matrix': conf_matrix,
        'Runtime (seconds)': runtime
    })

# Create a DataFrame for comparison
comparison_df = pd.DataFrame(results)

# Display the results
print(comparison_df)


Training and Evaluating Logistic Regression...
Training and Evaluating SVM...
Training and Evaluating Decision Tree...
Training and Evaluating AdaBoost...
Training and Evaluating XGBoost...
Training and Evaluating CatBoost...
                 Model  Accuracy  Precision (Class 0)  Precision (Class 1)  \
0  Logistic Regression  0.833333             0.793103             0.880000   
1                  SVM  0.851852             0.781250             0.954545   
2        Decision Tree  0.703704             0.692308             0.714286   
3             AdaBoost  0.740741             0.687500             0.818182   
4              XGBoost  0.777778             0.750000             0.807692   
5             CatBoost  0.796296             0.741935             0.869565   

   Recall (Class 0)  Recall (Class 1)  F1-Score (Class 0)  F1-Score (Class 1)  \
0          0.884615          0.785714            0.836364            0.830189   
1          0.961538          0.750000            0.862069        