 Load the Datasets
 

In [3]:
import pandas as pd
import numpy as np

# Read the heart-disease CSV into a DataFrame.
# The file name matches the local download; adjust it if yours differs.
heart_csv_name = 'heart (1).csv'
heart_df = pd.read_csv(heart_csv_name)

# Preview the first rows of the dataset under a heading.
heart_preview = heart_df.head()
print("\nHeart Disease Dataset:", heart_preview, sep="\n")




Heart Disease Dataset:
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [4]:
# Count missing entries per column; a column of zeros means nothing to impute.
missing_per_column = heart_df.isna().sum()
print("Missing values in Heart Disease Dataset:")
print(missing_per_column)


Missing values in Heart Disease Dataset:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [5]:
# Encode Categorical Variables (One-Hot Encoding)
# Each listed text column is replaced by indicator columns; drop_first=True
# omits the first level of every feature so the remaining indicators are not
# redundant with each other.
# NOTE(review): heart_df is overwritten here, so this cell can only run once
# per load -- a second run would no longer find these columns.
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
heart_df = pd.get_dummies(heart_df, columns=categorical_features, drop_first=True)



In [6]:
#Scale Numerical Features (Standardization)
from sklearn.preprocessing import StandardScaler

# Continuous columns that get standardized; the remaining columns are left as-is.
numeric_features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

scaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row, before the train/test split
# done in a later cell, so test-row statistics influence the training features.
# Consider fitting on the training rows only and transforming the test rows.
heart_df[numeric_features] = scaler.fit_transform(heart_df[numeric_features])


In [7]:
#Split the Data into Train & Test Sets
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
target_column = 'HeartDisease'  # Adjust column name if needed
y_heart = heart_df[target_column]
X_heart = heart_df.drop(columns=target_column)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
(
    X_train_heart,
    X_test_heart,
    y_train_heart,
    y_test_heart,
) = train_test_split(X_heart, y_heart, test_size=0.2, random_state=42)


Train Logistic Regression Models

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def train_logistic_regression(X_train, X_test, y_train, y_test, disease_name):
    """Fit a logistic-regression classifier and print its test-set metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for the printed accuracy and
        classification report.
    disease_name
        Label inserted into the printed heading.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    # max_iter=1000 lets the solver run up to 1000 iterations.
    classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    print(f"📌 **{disease_name} - Logistic Regression Performance**")
    test_accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {test_accuracy:.4f}")
    report_text = classification_report(y_test, predictions)
    print(report_text)

    return classifier

# Fit and evaluate the logistic-regression model on the heart-disease split.
logreg_heart = train_logistic_regression(
    X_train=X_train_heart,
    X_test=X_test_heart,
    y_train=y_train_heart,
    y_test=y_test_heart,
    disease_name="Heart Disease",
)



📌 **Heart Disease - Logistic Regression Performance**
Accuracy: 0.8533
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184



Train Random Forest Classifier (Better Performance)

In [9]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, X_test, y_train, y_test, disease_name):
    """Fit a random-forest classifier and print its test-set metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for the printed accuracy and
        classification report.
    disease_name
        Label inserted into the printed heading.

    Returns
    -------
    The fitted ``RandomForestClassifier`` estimator.
    """
    # 100 trees with a fixed seed so repeated runs build the same forest.
    forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
    predictions = forest.predict(X_test)

    print(f"📌 **{disease_name} - Random Forest Performance**")
    test_accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {test_accuracy:.4f}")
    report_text = classification_report(y_test, predictions)
    print(report_text)

    return forest

# Fit and evaluate the random-forest model on the heart-disease split.
rf_heart = train_random_forest(
    X_train=X_train_heart,
    X_test=X_test_heart,
    y_train=y_train_heart,
    y_test=y_test_heart,
    disease_name="Heart Disease",
)


📌 **Heart Disease - Random Forest Performance**
Accuracy: 0.8750
              precision    recall  f1-score   support

           0       0.85      0.86      0.85        77
           1       0.90      0.89      0.89       107

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.88      0.88      0.88       184



In [10]:
# Class balance of the target before any resampling (for the heart model).
heart_class_counts = y_heart.value_counts()
print(heart_class_counts)


HeartDisease
1    508
0    410
Name: count, dtype: int64


In [11]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed so the synthetic rows are reproducible.
smote = SMOTE(random_state=42)

# Resample the heart-disease features/labels so both classes end up equally sized.
# NOTE(review): this resamples the full X_heart / y_heart, i.e. the rows that
# were held out for testing are included as well -- confirm that is intended.
X_heart_balanced, y_heart_balanced = smote.fit_resample(X_heart, y_heart)

# Verify the class distribution after resampling.
balanced_counts = y_heart_balanced.value_counts()
print("Balanced Heart Disease Class Distribution:", balanced_counts)


Balanced Heart Disease Class Distribution: HeartDisease
0    508
1    508
Name: count, dtype: int64


In [12]:
# Refit a fresh Random Forest on the SMOTE-balanced heart dataset.
# NOTE(review): this rebinds rf_heart, replacing the forest that was evaluated
# on the test split earlier; the balanced model is not scored in this cell.
rf_heart = RandomForestClassifier(random_state=42, n_estimators=100)
rf_heart.fit(X=X_heart_balanced, y=y_heart_balanced)


In [13]:
# Test prediction on one sample patient.
#
# The forest was trained on the preprocessed frame: categorical columns one-hot
# encoded and the numeric columns standardized by `scaler`. A hand-written
# vector of raw values in an assumed order does not match that layout (the
# fourth training column is FastingBS, not MaxHR, and the numeric columns are
# in standardized units), so the sample is written as a raw patient record and
# pushed through the same preprocessing before predict().
sample_patient = pd.DataFrame([{
    'Age': 45,
    'Sex': 'M',
    'ChestPainType': 'NAP',
    'RestingBP': 130,
    'Cholesterol': 250,
    'FastingBS': 0,
    'RestingECG': 'Normal',
    'MaxHR': 140,
    'ExerciseAngina': 'N',
    'Oldpeak': 1.5,
    'ST_Slope': 'Flat',
}])

# One-hot encode without drop_first (a single row has only one level per
# feature), then align to the training columns: indicator columns the model
# never saw are discarded and indicators missing from this row become 0.
# The result has exactly the column names and order of X_heart.
sample_input = (
    pd.get_dummies(sample_patient)
    .reindex(columns=X_heart.columns, fill_value=0)
    .astype(float)
)

# Apply the already-fitted scaler (transform only -- never refit on a sample),
# using the same columns, in the same order, that it was fitted on.
scaled_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
sample_input[scaled_columns] = scaler.transform(sample_input[scaled_columns])

print("Heart Disease Prediction:", rf_heart.predict(sample_input))


Heart Disease Prediction: [0]




In [14]:
from joblib import dump

# Persist the balanced Random Forest to disk with joblib.
# NOTE(review): only the model is saved; the fitted `scaler` and the training
# column order are not, so a consumer of this file cannot preprocess raw input
# from the file alone -- confirm whether they are stored elsewhere.
dump(value=rf_heart, filename="rf_heart_balanced.joblib")



['rf_heart_balanced.joblib']