In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve

# Additional utilities
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

In [None]:
# Load the dataset
import io

# Read from here when nothing is uploaded (upload cancelled, or the notebook
# is running outside Google Colab).
DATA_PATH = 'heart.csv'

try:
    from google.colab import files
except ImportError:
    # Not running in Colab: there is no upload widget, use the local file.
    uploaded = {}
else:
    # files.upload() returns a dict mapping each uploaded file name to its bytes.
    uploaded = files.upload()

# Read the CSV file.
# Prefer the bytes that were just uploaded over a fixed path: Colab stores a
# re-uploaded file under a new name (e.g. "heart (1).csv"), so a hard-coded
# path could silently load a stale copy or fail for a differently named file.
uploaded_csv_names = [name for name in uploaded if name.lower().endswith('.csv')]
if uploaded_csv_names:
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_csv_names[0]]))
else:
    df = pd.read_csv(DATA_PATH)

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

In [None]:
# Dataset overview: schema, missing values per column, summary statistics
print("Dataset Info:")
df.info()

# isna() is the canonical spelling of isnull(); the per-column counts are the same
missing_per_column = df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

print("\nBasic statistics:")
# Left as the last expression so the notebook renders the table
df.describe()

In [None]:
# Class balance of the target column, as a bar chart and as raw counts
target_fig, target_ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='target', ax=target_ax)
target_ax.set_title('Distribution of Heart Disease Cases')
target_ax.set_xlabel('Target (0 = No Disease, 1 = Disease)')
target_ax.set_ylabel('Count')
plt.show()

print("Target value counts:")
print(df['target'].value_counts())

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap
correlation_matrix = df.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Distribution of key features on a 2x3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# One entry per histogram panel:
# (grid position, column, bar colour, panel title, x-axis label)
histogram_panels = [
    ((0, 0), 'age', 'skyblue', 'Age Distribution', 'Age'),
    ((0, 1), 'chol', 'lightgreen', 'Cholesterol Distribution', 'Cholesterol'),
    ((0, 2), 'trestbps', 'lightcoral', 'Resting Blood Pressure', 'Blood Pressure'),
    ((1, 0), 'thalach', 'gold', 'Maximum Heart Rate', 'Max Heart Rate'),
    ((1, 1), 'oldpeak', 'violet', 'ST Depression', 'Oldpeak'),
]
for panel_slot, panel_column, panel_colour, panel_title, panel_xlabel in histogram_panels:
    panel_ax = axes[panel_slot]
    panel_ax.hist(df[panel_column], bins=20, color=panel_colour, edgecolor='black')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)

# Only the first panel carries a y-axis label
axes[0, 0].set_ylabel('Frequency')

# The last panel is a bar chart of the sex counts rather than a histogram
sex_counts = df['sex'].value_counts()
sex_counts.plot.bar(ax=axes[1, 2], color=['pink', 'lightblue'])
axes[1, 2].set_title('Sex Distribution (0=Female, 1=Male)')
axes[1, 2].set_xlabel('Sex')

plt.tight_layout()
plt.show()

In [None]:
# Split the frame into the label column (y) and everything else (X)
y = df['target']
X = df.drop(columns='target')

print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

In [None]:
# Feature scaling (optional for Random Forest, but good practice)
# The scaler is fitted on the training rows only; the fitted scaler is then
# applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")

In [None]:
# Hyperparameters of the baseline Random Forest
rf_params = {
    'n_estimators': 100,       # Number of trees
    'max_depth': 10,           # Maximum depth of trees
    'min_samples_split': 5,    # Minimum samples required to split
    'min_samples_leaf': 2,     # Minimum samples required at leaf node
    'random_state': 42,        # For reproducibility
}

# Build the classifier and train it on the scaled training data
# (fit() returns the estimator itself)
rf_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)

print("Random Forest model trained successfully!")

In [None]:
# Hard class predictions for the test rows
y_pred = rf_model.predict(X_test_scaled)

# predict_proba returns one column per class; keep column index 1
# (probabilities for class 1)
class_probabilities = rf_model.predict_proba(X_test_scaled)
y_pred_proba = class_probabilities[:, 1]

print("Predictions completed!")

In [None]:
# Accuracy is computed from the hard predictions
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# AUC-ROC is computed from the class-1 probabilities
auc_roc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC Score: {auc_roc:.4f}")

# Per-class precision / recall / F1 summary
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

In [None]:
# Confusion matrix of the baseline model, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
class_labels = ['No Disease', 'Disease']

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')
plt.show()

In [None]:
# ROC curve of the baseline model against the chance diagonal
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_roc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()

In [None]:
# Pair every feature name with the importance the baseline forest assigned to it
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
})

# Most important feature first
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Feature Importance Ranking:")
print(feature_importance)

In [None]:
# Horizontal bar chart of the importance ranking
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    data=feature_importance,
    x='importance',
    y='feature',
    palette='viridis',
    ax=importance_ax,
)
importance_ax.set_title('Random Forest Feature Importance')
importance_ax.set_xlabel('Importance Score')
importance_fig.tight_layout()
plt.show()

In [None]:
# Recommended: RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space. randint(low, high) draws integers from the half-open range
# [low, high).
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [5, 10, 15, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4),
    # 'auto' is intentionally not listed: for classifiers it was only an alias
    # of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3,
    # where every candidate using it fails to fit.
    'max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,           # Number of sampled candidates (kept small for speed)
    cv=3,                # 3-fold cross-validation per candidate
    scoring='accuracy',
    random_state=42,     # Makes the sampled candidates reproducible
    n_jobs=-1,           # Use all available cores
    verbose=1
)

print("Starting Fast Randomized Search...")
random_search.fit(X_train_scaled, y_train)

print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

In [None]:
# Use the tuned Random Forest selected by the randomized search.
# Re-creating a forest with the baseline's hyperparameters and seed would only
# reproduce rf_model, which makes the "Base" vs "Optimized" comparison
# meaningless.
from sklearn.metrics import accuracy_score, roc_auc_score

# RandomizedSearchCV refits the best candidate on the data passed to fit()
# (refit=True is its default), so best_estimator_ is already trained.
best_rf_model = random_search.best_estimator_

print("Using the best estimator found by the randomized search:")
print(random_search.best_params_)

# Make predictions
y_pred_optimized = best_rf_model.predict(X_test_scaled)
y_pred_proba_optimized = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate the model
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
optimized_auc = roc_auc_score(y_test, y_pred_proba_optimized)

print(f"Model Accuracy: {optimized_accuracy:.4f}")
print(f"AUC-ROC Score: {optimized_auc:.4f}")

In [None]:
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """
    Score a fitted classifier on held-out data and plot the outcome.

    Prints the accuracy, the AUC-ROC and the classification report, then shows
    a single figure with the confusion matrix on the left and the ROC curve on
    the right.

    Parameters
    ----------
    model : object providing ``predict`` and ``predict_proba``
    X_test : feature rows passed to the model as-is
    y_test : true labels for ``X_test``
    model_name : label used in the printed header and the plot titles

    Returns
    -------
    tuple
        ``(accuracy, auc_roc)``
    """
    # Hard labels, and the probability column at index 1
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    auc_roc = roc_auc_score(y_test, positive_scores)

    print(f"=== {model_name} Evaluation ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predicted_labels))

    # One figure, two side-by-side panels
    eval_fig, (cm_ax, roc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: confusion matrix
    class_names = ['No Disease', 'Disease']
    sns.heatmap(
        confusion_matrix(y_test, predicted_labels),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=cm_ax,
    )
    cm_ax.set_title(f'{model_name} - Confusion Matrix')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_xlabel('Predicted')

    # Right panel: ROC curve with the chance diagonal for reference
    false_pos_rate, true_pos_rate, _unused_thresholds = roc_curve(y_test, positive_scores)
    roc_ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {auc_roc:.4f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'{model_name} - ROC Curve')
    roc_ax.legend(loc="lower right")
    roc_ax.grid(True)

    eval_fig.tight_layout()
    plt.show()

    return accuracy, auc_roc

# Run the shared evaluation on the baseline and on the optimized forest
print("Evaluating models...")
base_accuracy, base_auc = evaluate_model(
    model=rf_model, X_test=X_test_scaled, y_test=y_test, model_name="Base Random Forest"
)
optimized_accuracy, optimized_auc = evaluate_model(
    model=best_rf_model, X_test=X_test_scaled, y_test=y_test, model_name="Optimized Random Forest"
)

In [None]:
import joblib

# Save the trained model together with the scaler it expects its input to
# have been transformed with
joblib.dump(best_rf_model, 'random_forest_heart_disease_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model and scaler saved successfully!")

# Save feature importance to CSV.
# The importances are taken from best_rf_model (the model persisted above),
# not from the baseline rf_model, so the CSV always describes the saved model.
saved_model_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_rf_model.feature_importances_
}).sort_values('importance', ascending=False)
saved_model_importance.to_csv('feature_importance.csv', index=False)
print("Feature importance saved to CSV!")

In [None]:
def _prompt_float(prompt):
    """Ask the user for a number until a finite numeric value is entered."""
    while True:
        raw_value = input(prompt)
        try:
            value = float(raw_value)
        except ValueError:
            print(f"  Invalid input {raw_value!r}: please enter a number.")
            continue
        # float() also accepts "nan" and "inf", which are not usable as features
        if not np.isfinite(value):
            print(f"  Invalid input {raw_value!r}: please enter a finite number.")
            continue
        return value


def predict_heart_disease(model, scaler):
    """
    Predict heart disease for a new patient with user input

    Prompts for each feature on standard input, scales the values with
    ``scaler``, runs ``model`` on the scaled row and prints a summary.
    A non-numeric (or non-finite) entry is rejected and the same question is
    asked again instead of aborting the whole questionnaire.

    Parameters
    ----------
    model : object providing ``predict`` and ``predict_proba``
    scaler : object providing ``transform``

    Returns
    -------
    tuple
        ``(prediction, probability, patient_data)``: the first predicted
        label, the value at row 0 / column 1 of ``predict_proba``, and the
        list of raw (unscaled) inputs in the order they were asked.
    """
    print("=== Heart Disease Prediction System ===")
    print("Please enter the following patient information:\n")

    # Get user input for each feature.
    # The values are passed to the scaler in exactly this order.
    # NOTE(review): this assumes the order matches the columns the scaler and
    # model were fitted on - verify against X.columns.
    prompts = [
        "Age: ",
        "Sex (0 = Female, 1 = Male): ",
        "Chest Pain Type (0-3): ",
        "Resting Blood Pressure: ",
        "Serum Cholesterol (mg/dl): ",
        "Fasting Blood Sugar > 120 mg/dl (0 = No, 1 = Yes): ",
        "Resting ECG Results (0-2): ",
        "Maximum Heart Rate Achieved: ",
        "Exercise Induced Angina (0 = No, 1 = Yes): ",
        "ST Depression induced by exercise: ",
        "Slope of the peak exercise ST segment (0-2): ",
        "Number of major vessels (0-3) colored by fluoroscopy: ",
        "Thalassemia (1 = normal, 2 = fixed defect, 3 = reversible defect): ",
    ]

    # Create patient data array
    patient_data = [_prompt_float(prompt) for prompt in prompts]

    # Scale the patient data (the scaler expects a 2-D input: one row here)
    patient_data_scaled = scaler.transform([patient_data])

    # Make prediction
    prediction = model.predict(patient_data_scaled)
    probability = model.predict_proba(patient_data_scaled)[0, 1]

    result = "Heart Disease Detected" if prediction[0] == 1 else "No Heart Disease"
    # Risk bands: above 0.7 is High, above 0.3 is Medium, everything else Low
    risk_level = "High" if probability > 0.7 else "Medium" if probability > 0.3 else "Low"

    print("\n" + "="*50)
    print("PREDICTION RESULTS:")
    print("="*50)
    print(f"Result: {result}")
    print(f"Probability of Heart Disease: {probability:.4f} ({probability*100:.2f}%)")
    print(f"Risk Level: {risk_level}")

    if prediction[0] == 1:
        print("⚠️  Recommendation: Consult a cardiologist for further evaluation.")
    else:
        print("✅  Recommendation: Continue with regular health checkups.")

    return prediction[0], probability, patient_data

# Interactive run: ask for one patient's data and score it with the saved-model pair
print("Starting Heart Disease Prediction...")
prediction_outcome = predict_heart_disease(model=best_rf_model, scaler=scaler)
prediction, probability, patient_data = prediction_outcome