In [2]:
# Heart Disease Prediction - Classification

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Load dataset
df = pd.read_csv("heart.csv")  # Make sure 'heart.csv' is in your working directory
print(df.head())

# Basic Info
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Target variable is 'target'; every other column is used as a feature.
X = df.drop("target", axis=1)
y = df["target"]

# Identify categorical and numerical columns
# These three columns are one-hot encoded; every remaining feature column is
# treated as numeric and standardised.
categorical_cols = ["cp", "thal", "slope"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
numeric_transformer = StandardScaler()
# handle_unknown="ignore" encodes a category that was absent from the training
# split as an all-zero row instead of raising at predict time (the default
# "error" mode would abort on any unseen label).
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)


# Pipelines
# NOTE: both pipelines wrap the *same* `preprocessor` object rather than
# independent copies, so fitting either pipeline fits that shared instance.
svm_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    # probability=True adds an internal, shuffled cross-validation step;
    # random_state pins it so repeated runs produce the same fitted model,
    # matching the seeded RandomForest below.
    ("classifier", SVC(kernel="rbf", probability=True, random_state=42)),
])

rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Train-test split (80/20, fixed seed).
# stratify=y keeps the class ratio of the target identical in the training and
# test partitions, so the reports below are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train SVM and report per-class precision/recall/F1 on the held-out split
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

# Train Random Forest and report the same metrics on the same held-out split
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Persist each classifier on its own, without the preprocessing step of its
# pipeline. Whoever loads these files must run the inputs through the saved
# preprocessor first.
for model_path, fitted_pipeline in (
    ("rf_model.pkl", rf_pipeline),
    ("svm_model.pkl", svm_pipeline),
):
    joblib.dump(fitted_pipeline.named_steps["classifier"], model_path)
# Persist the shared preprocessor. Kept as the final bare expression so a
# notebook cell still echoes the list of written file names.
joblib.dump(preprocessor, "preprocessor.pkl")



   age  sex              cp  trestbps  chol  fbs  restecg  thalach  exang  \
0   52    1  typical_angina       125   212    0        1      168      0   
1   53    1  typical_angina       140   203    1        0      155      1   
2   70    1  typical_angina       145   174    0        1      125      1   
3   61    1  typical_angina       148   203    0        1      161      0   
4   62    0  typical_angina       138   294    1        1      106      0   

   oldpeak        slope  ca               thal  target  
0      1.0  downsloping   2  reversible_defect       0  
1      3.1    upsloping   0  reversible_defect       0  
2      2.6    upsloping   0  reversible_defect       0  
3      0.0  downsloping   1  reversible_defect       0  
4      1.9         flat   3       fixed_defect       0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age  

['preprocessor.pkl']