In [None]:
!pip install lazypredict
!pip install pandas numpy scikit-learn seaborn matplotlib

In [None]:
# Load common libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    RocCurveDisplay
)

# Load the CSV file
df = pd.read_csv('heart.csv')

# Display basic info.
# DataFrame.info() writes its report (column types, non-null counts) to stdout
# and returns None, so it is called directly; print(df.info()) would append a
# stray "None" line to the output.
df.info()
print(df.head())

In [None]:
# Emma's Code section
# Basic dataset overview: dtypes, summary statistics, missing values and the
# balance of the target and of the Sex column.
print("----- INFO -----")
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would add a stray "None" line.
df.info()

print("\n----- DESCRIBE -----")
display(df.describe())

print("\n----- MISSING VALUES -----")
print(df.isnull().sum())

print("\n----- TARGET BALANCE (HeartDisease) -----")
print(df["HeartDisease"].value_counts())

print("\n----- GENDER BALANCE -----")
print(df["Sex"].value_counts())

In [None]:
# String-valued features are one-hot encoded so logistic regression receives
# numeric inputs; drop_first=True drops one indicator column per feature.
categorical_cols = [
    "Sex",
    "ChestPainType",
    "RestingECG",
    "ExerciseAngina",
    "ST_Slope",
]

df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Show how many columns the encoding added.
print(f"Shape before encoding: {df.shape}")
print(f"Shape after encoding: {df_encoded.shape}")

df_encoded.head()

In [None]:
# Target column versus every other (encoded) column.
y = df_encoded["HeartDisease"]
X = df_encoded.drop(columns=["HeartDisease"])

# Seeded 80/20 hold-out split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape

In [None]:
# Scale the features, then fit a logistic regression whose class weights are
# balanced; both steps live in one pipeline so the scaler is fitted on the
# training split only.
log_reg_balanced = Pipeline([
    ("scaler", StandardScaler()),
    ("log_reg", LogisticRegression(class_weight="balanced", max_iter=1000)),
])
log_reg_balanced.fit(X_train, y_train)

# Hard labels and positive-class probabilities for the held-out split.
y_pred = log_reg_balanced.predict(X_test)
y_prob = log_reg_balanced.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the labels; ROC-AUC uses the probabilities.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("Logistic Regression with class_weight='balanced'")
for metric_label, metric_value in (
    ("Accuracy ", acc),
    ("Precision", prec),
    ("Recall   ", rec),
    ("F1 Score ", f1),
    ("ROC-AUC  ", auc),
):
    print(f"{metric_label}: {metric_value:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the class-weighted logistic regression on the test split,
# drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix – Logistic Regression (Class-Weighted)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Create the axes first and pass them to RocCurveDisplay. Called without `ax`,
# from_estimator opens a figure of its own, which leaves a figure created
# beforehand empty and ignores the requested size.
fig, ax = plt.subplots(figsize=(8, 6))
RocCurveDisplay.from_estimator(log_reg_balanced, X_test, y_test, ax=ax)
ax.set_title("ROC Curve – Logistic Regression (Class-Weighted)")
plt.show()

In [None]:
# Overlay the predicted-probability distributions of the two true classes.
fig, ax = plt.subplots(figsize=(8, 6))
for true_class, legend_label in (
    (0, "No Heart Disease (0)"),
    (1, "Heart Disease (1)"),
):
    ax.hist(y_prob[y_test == true_class], bins=20, alpha=0.6, label=legend_label)
ax.set_xlabel("Predicted Probability")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Predicted Probabilities\n(Logistic Regression, Class-Weighted)")
ax.legend()
plt.show()

In [None]:
# Column names of the training matrix, in the order the model saw them.
feature_names = X_train.columns

# Coefficient vector of the logistic-regression step of the fitted pipeline.
coefficients = log_reg_balanced.named_steps["log_reg"].coef_[0]

# One row per feature, ordered by absolute coefficient (largest first).
coef_df = (
    pd.DataFrame({"Feature": feature_names, "Coefficient": coefficients})
    .assign(AbsCoeff=lambda frame: frame["Coefficient"].abs())
    .sort_values(by="AbsCoeff", ascending=False)
)

coef_df.head(15)

In [None]:
# Horizontal bar chart of the signed coefficients; the y-axis is inverted so
# the first row of coef_df appears at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(coef_df["Feature"], coef_df["Coefficient"])
ax.set_xlabel("Coefficient Value")
ax.set_title("Logistic Regression Feature Importance")
ax.axvline(0, color="black", linewidth=0.8)
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# exp(coefficient) gives the odds ratio for each feature.
coef_df = coef_df.assign(OddsRatio=np.exp(coef_df["Coefficient"]))
coef_df.loc[:, ["Feature", "Coefficient", "OddsRatio"]].head(15)

In [None]:
# Use original df (before encoding) for age/sex analysis.
df_bins = df.copy()

# Ten equal-width age bins spanning the observed age range.
age_min, age_max = df_bins["Age"].min(), df_bins["Age"].max()
bins = np.linspace(age_min, age_max, 11)   # 10 bins → 11 edges
labels = [f"{int(bins[i])}-{int(bins[i+1])}" for i in range(10)]

df_bins["AgeBin"] = pd.cut(
    df_bins["Age"],
    bins=bins,
    labels=labels,
    include_lowest=True  # keep the youngest age inside the first bin
)

# Row count and mean of HeartDisease (the disease rate) per Sex x AgeBin.
# AgeBin is categorical, so `observed` is passed explicitly: observed=False
# keeps empty Sex/AgeBin combinations in the result, and stating it avoids the
# pandas FutureWarning about the changing default.
age_sex_summary = (
    df_bins
    .groupby(["Sex", "AgeBin"], observed=False)["HeartDisease"]
    .agg(count="count", heart_disease_rate="mean")
    .reset_index()
)

age_sex_summary.head()

In [None]:
# Grouped bars: heart-disease rate per age bin, one bar colour per Sex value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=age_sex_summary,
    x="AgeBin",
    y="heart_disease_rate",
    hue="Sex",
    ax=ax,
)
plt.xticks(rotation=45)
ax.set_ylabel("Heart Disease Rate")
ax.set_xlabel("Age Range")
ax.set_title("Heart Disease Rate by Sex and Age Bin")
fig.tight_layout()
plt.show()

In [None]:
# Nicholson's Code Section

# For ML purposes, all strings are converted to int.
# NOTE: this overwrites the categorical columns of the notebook-level `df`.
category_codes = {
    'Sex': {'M': 0, 'F': 1},
    'ChestPainType': {'ATA': 0, 'NAP': 1, 'ASY': 2, 'TA': 3},
    'RestingECG': {'Normal': 0, 'ST': 1, 'LVH': 2},
    'ExerciseAngina': {'N': 0, 'Y': 1},
    'ST_Slope': {'Up': 0, 'Down': 1, 'Flat': 2},
}

for column, codes in category_codes.items():
    # Look each value up in its code table and leave anything that is not a key
    # unchanged, so re-running the cell on already-encoded data is a no-op.
    # Series.replace is avoided because turning the string column into integers
    # that way relies on pandas' deprecated silent downcasting (FutureWarning).
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

print(df.head())

In [None]:
# Rows with Sex == 0 go to the male frame, rows with Sex == 1 to the female frame.
male_df = df.loc[df['Sex'].eq(0)]
female_df = df.loc[df['Sex'].eq(1)]

# Sex is constant within each subset, so it is dropped from the modelling frames.
male = male_df.drop(columns='Sex')
female = female_df.drop(columns='Sex')


In [None]:
# Library needed
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y)
X_male = male.drop("HeartDisease", axis=1)
y_male = male["HeartDisease"]

X_female = female.drop("HeartDisease", axis=1)
y_female = female["HeartDisease"]

# Split each cohort into training and testing sets (80%-20%).
# stratify keeps the HeartDisease class ratio the same in the train and test
# parts of each cohort, matching the stratified split used for the combined
# data earlier in this notebook.
X_male_train, X_male_test, y_male_train, y_male_test = train_test_split(
    X_male, y_male, test_size=0.2, random_state=42, stratify=y_male
)

X_female_train, X_female_test, y_female_train, y_female_test = train_test_split(
    X_female, y_female, test_size=0.2, random_state=42, stratify=y_female
)


In [None]:
# Library needed
from sklearn.preprocessing import StandardScaler

# Standardize numerical features with one scaler per cohort. Each scaler is
# fitted on its own training split only and then applied to the matching test
# split. Refitting a single shared scaler on the female data would discard the
# male statistics, leaving nothing to scale new male data with.
scaler_male = StandardScaler()
X_male_train_scaled = scaler_male.fit_transform(X_male_train)
X_male_test_scaled = scaler_male.transform(X_male_test)

scaler_female = StandardScaler()
X_female_train_scaled = scaler_female.fit_transform(X_female_train)
X_female_test_scaled = scaler_female.transform(X_female_test)

# Backward-compatible alias: the previously shared `scaler` ended this cell
# fitted on the female training data.
scaler = scaler_female

In [None]:
# Library needed
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest classifier
model_male = RandomForestClassifier(n_estimators=100, random_state=42)
model_male.fit(X_male_train_scaled, y_male_train)



In [None]:
model_female = RandomForestClassifier(n_estimators=100, random_state=42)
model_female.fit(X_female_train_scaled, y_female_train)

In [None]:
# Make predictions
y_male_pred = model_male.predict(X_male_test_scaled)
y_female_pred = model_female.predict(X_female_test_scaled)

In [None]:
# Library needed
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the model (male)
accuracy_male = accuracy_score(y_male_test, y_male_pred)
print(f"Model Accuracy: {accuracy_male:.4f}")

print("\nClassification Report:")
print(classification_report(y_male_test, y_male_pred))

In [None]:
# Evaluate the model
accuracy_female = accuracy_score(y_female_test, y_female_pred)
print(f"Model Accuracy: {accuracy_female:.4f}")

print("\nClassification Report:")
print(classification_report(y_female_test, y_female_pred))

In [None]:
# Get feature importance from the trained model
feature_importances = model_male.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({'Feature': X_male.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(8, 5))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.title('Feature Importance in Random Forest Model')
plt.gca().invert_yaxis()  # Invert y-axis for better readability
plt.savefig('male_importance.png') 
plt.show()

In [None]:
# Get feature importance from the trained model
feature_importances = model_female.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({'Feature': X_female.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(8, 5))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.title('Feature Importance in Random Forest Model')
plt.gca().invert_yaxis()  # Invert y-axis for better readability
plt.savefig('female_importance.png') 

plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hyperparameter grid shared by the male and the female search.
param_grid = dict(
    n_estimators=[50, 100, 200],      # number of trees
    max_depth=[None, 10, 20, 30],     # tree depth
    min_samples_split=[2, 5, 10],     # min samples to split
    min_samples_leaf=[1, 2, 4],       # min samples in leaf
)

# Search settings shared by both cohorts: 5-fold CV scored on accuracy,
# all processors, per-fit progress logging.
search_settings = dict(cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# ----- Male cohort: search on the unscaled training split -----
rf_male = RandomForestClassifier(random_state=42)
grid_search_male = GridSearchCV(rf_male, param_grid, **search_settings)
grid_search_male.fit(X_male_train, y_male_train)
print("Best hyperparameters (male):", grid_search_male.best_params_)

# Score the best estimator found by the search on the male test split.
best_model_male = grid_search_male.best_estimator_
y_pred_male_opt = best_model_male.predict(X_male_test)
male_opt_acc = accuracy_score(y_male_test, y_pred_male_opt)
print("Optimized male model accuracy:", male_opt_acc)

# ----- Female cohort: same procedure on the female splits -----
rf_female = RandomForestClassifier(random_state=42)
grid_search_female = GridSearchCV(rf_female, param_grid, **search_settings)
grid_search_female.fit(X_female_train, y_female_train)
print("Best hyperparameters (female):", grid_search_female.best_params_)

best_model_female = grid_search_female.best_estimator_
y_pred_female_opt = best_model_female.predict(X_female_test)
female_opt_acc = accuracy_score(y_female_test, y_pred_female_opt)
print("Optimized female model accuracy:", female_opt_acc)


In [None]:
import joblib

# Persist each tuned forest under its cohort-specific file name.
for tuned_model, pkl_name in (
    (best_model_male, "rf_male_optimized.pkl"),
    (best_model_female, "rf_female_optimized.pkl"),
):
    joblib.dump(tuned_model, pkl_name)

print("Optimized male and female Random Forest models saved successfully!")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10,8))
sns.heatmap(
    male.corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm'
)
plt.title("Male Feature Correlation Heatmap")
plt.savefig('male_heatmap.png') 

plt.show()


In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(
    female.corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm'
)
plt.title("Female Feature Correlation Heatmap")
plt.savefig('female_heatmap.png') 

plt.show()


In [None]:
from lazypredict.Supervised import LazyClassifier

# Fit LazyClassifier on the current train/test split and show the first ten
# rows of the resulting model table.
clf = LazyClassifier(ignore_warnings=True, verbose=0)
lazy_results = clf.fit(X_train, X_test, y_train, y_test)
models, predictions = lazy_results
models.head(10)


In [None]:
from sklearn.tree import plot_tree

plt.figure(figsize=(50,35))
plot_tree(
    model_male.estimators_[0],
    feature_names=X_male.columns,
    class_names=["No Disease", "Disease"],
    filled=True,
    max_depth=111
)
plt.savefig('male.png') 

plt.show()


In [None]:
plt.figure(figsize=(50,40))
plot_tree(
    model_female.estimators_[0],
    feature_names=X_female.columns,
    class_names=["No Disease", "Disease"],
    filled=True,
    max_depth=8
)
plt.savefig('female.png') 
plt.show()


In [None]:
# Jeffery's Code Section

# Read the dataset through the same relative path as the first load cell of
# this notebook; the absolute "/content/..." location only exists on Colab.
heart_df = pd.read_csv("heart.csv")
heart_df.head()

In [None]:
OneHotEncoder = OneHotEncoder(sparse_output=False)

target_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


for i in target_columns:
  heart_df[i] = OneHotEncoder.fit_transform(heart_df[[i]])
  print(heart_df.value_counts([i]), '\n\n------')

In [None]:
heart_df.isnull().sum()

In [None]:
from random import randrange, uniform
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier(random_state=42)

param_grid = {
      'max_depth': [3, 5, 7],
      'min_samples_split': [2, 5, 10],
      'criterion': ['gini', 'entropy']
      }

In [None]:
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy', # 'accuracy', 'f1', 'roc_auc', etc
                           n_jobs=-1
                           ) # Use all available processors

In [None]:
y = heart_df['HeartDisease']
X = heart_df.drop(['HeartDisease'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search.fit(X_train, y_train)

In [None]:
print(y.value_counts())
print("Best parameters found:", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_: .2f}")

In [None]:
from sklearn.metrics import classification_report

In [None]:


# Histogram-based gradient boosting on the (un-resampled) training split.
# HistGradientBoostingClassifier comes from sklearn.ensemble and must be
# imported in the notebook's import cell. random_state is fixed like for the
# other models in this notebook.
hist_model = HistGradientBoostingClassifier(max_bins=12, max_iter=100, random_state=42)
hist_model.fit(X_train, y_train)  # fit on the training split
y_pred = hist_model.predict(X_test)  # predictions for the held-out test split
accuracy = accuracy_score(y_test, y_pred)


print(f'Accuracy: {accuracy:.3f}')
# The report starts on its own line so its header row is not pushed to the
# right by the label.
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
X.describe()

In [None]:
y.head()

In [None]:
df = X.copy()
df['target'] = y

# Mapping for the target variable (0 and 1) to descriptive names
target_names_map = {0: 'No Heart Disease', 1: 'Heart Disease'}
df['type'] = df['target'].map(target_names_map)

sns.pairplot(df, hue="type")
plt.suptitle("Pairplot of Heart Cancer Dataset", y=1.02, fontsize=20)
plt.show()

In [None]:
print(y.value_counts())

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE (seeded). Only the training split is resampled.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_resample, y_resample = resampled

# Class counts after resampling.
print("Resampled target variable distribution:\n", y_resample.value_counts())

In [None]:
# Second gradient-boosting model: depth-limited and trained on the
# SMOTE-resampled training data.
model_refit = HistGradientBoostingClassifier(max_iter=100, max_depth=4, max_bins=12)
model_refit.fit(X_resample, y_resample)

# Predict the original test split.
y_pred = model_refit.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.3f}'.format(accuracy))


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing
import pandas as pd

In [None]:
# Work on a copy of the SMOTE-resampled features.
X_resample_copy = X_resample.copy()

# Standardize the features; the scaler is fitted on the resampled training
# data only.
scaler = StandardScaler()
scaler.fit(X_resample_copy)
X_resample_scaled = scaler.transform(X_resample_copy)

# Logistic regression on the scaled, resampled data (seeded, liblinear solver).
model = LogisticRegression(solver='liblinear', random_state=42)
model.fit(X_resample_scaled, y_resample)

# Absolute coefficient per feature, largest first.
importances = pd.Series(np.abs(model.coef_[0]), index=X_resample.columns)
print(importances.sort_values(ascending=False))

# Apply the fitted scaler to the test split before predicting.
X_test_scaled = scaler.transform(X_test)
y_pred_lr = model.predict(X_test_scaled)


accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Logistic Regression Accuracy on test set after SMOTE and Scaling: {accuracy_lr:.3f}')
print("Classification Report for Logistic Regression after SMOTE and Scaling:\n", classification_report(y_test, y_pred_lr))

In [None]:
# Fresh seeded decision tree for the search on the resampled data.
model = DecisionTreeClassifier(random_state=42)

# Search space: leaf-count cap, depth, split size and split criterion.
param_grid = dict(
    max_leaf_nodes=[3, 5, 7, 10],
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

In [None]:
# 5-fold accuracy grid search for the decision tree, using all processors.
grid_search_resam = GridSearchCV(estimator=model,
                                 param_grid=param_grid,
                                 cv=5,
                                 scoring='accuracy',
                                 n_jobs=-1)

# Fit on the SMOTE-resampled data as it is. Taking abs() of the features first
# would alter any negative feature value in the training data while X_test is
# used unchanged at prediction time, i.e. the tree would be trained and
# evaluated on differently transformed inputs.
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print().
X_resample.info()

grid_search_resam.fit(X_resample, y_resample)

In [None]:
# Best tree found by the search on the resampled data, scored on the test split.
best_grid_search_resam = grid_search_resam.best_estimator_
y_pred_dt_resampled = best_grid_search_resam.predict(X_test)
accuracy_dt_resampled = accuracy_score(y_true=y_test, y_pred=y_pred_dt_resampled)

print('Decision Tree Accuracy on test set after SMOTE: {:.3f}'.format(accuracy_dt_resampled))

In [None]:
from sklearn.metrics import classification_report

In [None]:

# Report on the decision tree's own predictions. `y_pred` holds the output of
# the gradient-boosting model trained earlier, not of the tuned tree.
print("Classification Report for Decision Tree after SMOTE:\n", classification_report(y_test, y_pred_dt_resampled))

In [None]:
df.head()

In [None]:
males_df = heart_df[heart_df['Sex'] == 0.0]
females_df = heart_df[heart_df['Sex'] == 1.0]

print("Males DataFrame head:")
print(males_df.head())
print("\nFemales DataFrame head:")
print(females_df.head())

In [None]:
males_df.head()

In [None]:
plt.figure(figsize=(8, 6))
plt.hist(males_df['Age'], bins=10, edgecolor='black') # Plotting histogram for 'Age' column
plt.title('Histogram of Age for Males')
plt.xlabel('Age') # Corrected xlabel to a string
plt.ylabel('Frequency') # Corrected ylabel to a string
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
plt.hist(females_df['Age'], bins=10, edgecolor='black') # Plotting histogram for 'Age' column
plt.title('Histogram of Age for Females')
plt.xlabel('Age') # Corrected xlabel to a string
plt.ylabel('Frequency') # Corrected ylabel to a string
plt.show()

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [None]:
# Logistic regression on the combined (male and female) data.
# The model is fitted on the training split only: fitting on all of X, y would
# include the test rows that permutation importance is evaluated on below.
# Features are standardized inside the pipeline so the coefficient magnitudes
# are on a comparable scale, and max_iter is raised above the default.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("log_reg", LogisticRegression(max_iter=1000)),
])
model.fit(X_train, y_train)

# Absolute coefficient per feature, largest first.
importances = pd.Series(abs(model.named_steps["log_reg"].coef_[0]), index=X_train.columns)
print(importances.sort_values(ascending=False))

# Permutation importance of the fitted pipeline on the held-out test split
# (10 shuffles per feature, seeded), largest mean importance first.
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
sorted_idx = result.importances_mean.argsort()
importances = pd.Series(result.importances_mean[sorted_idx], index=X_test.columns[sorted_idx])
print(importances.sort_values(ascending=False))