# 9428 Aromin - Encarnacion

## 1. Data Exploration and Preprocessing

### Data Exploration

In [6]:
import pandas as pd

#Load the dataset
df = pd.read_csv('heart_disease_prediction.csv')

#Display the first few rows
print("First 5 rows of the dataset:")
print(df.head())

#Check the shape of the dataset
print("\nDataset shape (rows, columns):", df.shape)

#Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

#Check for duplicates
print("\nNumber of duplicate rows:", df.duplicated().sum())
print(df.iloc[:, -1].value_counts())


First 5 rows of the dataset:
   gender  age  educationLevel  currentSmoker  cigsPerDay  BPMeds  \
0       1   39             4.0              0         0.0     0.0   
1       0   46             2.0              0         0.0     0.0   
2       1   48             1.0              1        20.0     0.0   
3       0   61             3.0              1        30.0     0.0   
4       0   46             3.0              1        23.0     0.0   

   prevalentStroke  prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  \
0                0             0         0    195.0  106.0   70.0  26.97   
1                0             0         0    250.0  121.0   81.0  28.73   
2                0             0         0    245.0  127.5   80.0  25.34   
3                0             1         0    225.0  150.0   95.0  28.58   
4                0             0         0    285.0  130.0   84.0  23.10   

   heartRate  glucose  tenYearCHD  
0       80.0     77.0           0  
1       95.0     76.0      

### Data Preprocessing

In [3]:
# STEP 1: DATA PREPROCESSING
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('heart_disease_prediction.csv')

# 1. Drop constant columns
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
if constant_cols:
    print(f"Dropping constant columns: {constant_cols}")
    df = df.drop(columns=constant_cols)

# 2. Handle missing values with imputation
continuous_vars = ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']
categorical_vars = ['BPMeds', 'educationLevel']

# Impute continuous variables with median
for col in continuous_vars:
    if col in df.columns:  # Check if column exists after removing constants
        df[col] = df[col].fillna(df[col].median())

# Impute categorical variables with mode
for col in categorical_vars:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])

# 3. Handle outliers
def handle_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return np.clip(series, lower_bound, upper_bound)

outlier_cols = ['cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
for col in outlier_cols:
    if col in df.columns:
        df[col] = handle_outliers(df[col])

# 4. Identify binary columns (won't normalize these)
binary_cols = [col for col in df.columns if
               df[col].nunique() == 2 and
               df[col].dtype in [np.int64, np.float64] and
               col != 'tenYearCHD']

# 5. Normalize non-binary continuous columns
non_binary_continuous = [col for col in df.columns if
                         col not in binary_cols and
                         col != 'tenYearCHD' and
                         df[col].dtype in [np.int64, np.float64]]

scaler = StandardScaler()
df[non_binary_continuous] = scaler.fit_transform(df[non_binary_continuous])

print("\nNormalized columns:", non_binary_continuous)
print("Binary columns (not normalized):", binary_cols)

# 6. Save processed data
df.to_csv('processed_heart_data.csv', index=False)
print("\nPreprocessing complete. Data saved to 'processed_heart_data.csv'")

# Show final dataset info
print("\nFinal dataset information:")
print(df.info())
print("\nFirst 5 rows:")
print(df.head())


Normalized columns: ['age', 'educationLevel', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
Binary columns (not normalized): ['gender', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']

Preprocessing complete. Data saved to 'processed_heart_data.csv'

Final dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           4238 non-null   int64  
 1   age              4238 non-null   float64
 2   educationLevel   4238 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4238 non-null   float64
 5   BPMeds           4238 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4238 non-null   float64
 10  sys

## 2. Dataset Splitting and Model Training

In [None]:
# FEATURE SELECTION AND MODEL TRAINING
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import joblib

# Load processed data
df = pd.read_csv('processed_heart_data.csv')

# Feature selection using ANOVA F-test (alternative to p-values)
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']

# Calculate F-scores and p-values
f_scores, p_values = f_classif(X, y)

# Create feature importance DataFrame
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'F_Score': f_scores,
    'P_Value': p_values
}).sort_values('P_Value')

# Select features with p-value < 0.05
significant_features = feature_importance[feature_importance['P_Value'] < 0.05]['Feature'].tolist()
print(f"Selected features: {significant_features}")

# Data splitting
# Split into 90% development and 10% unseen
dev_df, unseen_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['tenYearCHD']
)

# Split development data into train and test (80-20)
X_dev = dev_df[significant_features]
y_dev = dev_df['tenYearCHD']
X_train, X_test, y_train, y_test = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=42, stratify=y_dev
)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression model
model = LogisticRegression(
    max_iter=1000,
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Save artifacts
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
unseen_df.to_csv('unseen_data.csv', index=False)
print("\nModel trained and artifacts saved")

# Display coefficients
coeff_df = pd.DataFrame({
    'Feature': significant_features,
    'Coefficient': model.coef_[0],
    'Odds_Ratio': np.exp(model.coef_[0])
}).sort_values('Odds_Ratio', ascending=False)

print("\nModel Coefficients and Odds Ratios:")
print(coeff_df)

Selected features: ['age', 'sysBP', 'prevalentHyp', 'diaBP', 'diabetes', 'gender', 'BPMeds', 'totChol', 'BMI', 'glucose', 'prevalentStroke', 'cigsPerDay', 'educationLevel']

Model trained and artifacts saved

Model Coefficients and Odds Ratios:
            Feature  Coefficient  Odds_Ratio
0               age     0.559102    1.749101
11       cigsPerDay     0.318429    1.374966
1             sysBP     0.296949    1.345747
5            gender     0.196778    1.217474
7           totChol     0.111710    1.118189
10  prevalentStroke     0.094802    1.099441
4          diabetes     0.094601    1.099220
2      prevalentHyp     0.084798    1.088497
6            BPMeds     0.058416    1.060156
9           glucose     0.057447    1.059130
8               BMI     0.009458    1.009502
3             diaBP     0.004723    1.004735
12   educationLevel    -0.006002    0.994016


## 3. Model Evaluation

In [None]:
# MODEL EVALUATION
import pandas as pd
import joblib
from sklearn.metrics import (confusion_matrix, classification_report,
                           accuracy_score, precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import seaborn as sns

SELECTED_FEATURES = ['age', 'sysBP', 'prevalentHyp', 'diaBP', 'diabetes', 'gender', 'BPMeds', 'totChol', 'BMI', 'glucose', 'prevalentStroke', 'cigsPerDay', 'educationLevel']

# Load artifacts
model = joblib.load('logistic_regression_model.pkl')
scaler = joblib.load('feature_scaler.pkl')

# Load and prepare test data
test_df = pd.read_csv('processed_heart_data.csv')
unseen_indices = pd.read_csv('unseen_data.csv').index
X_test = test_df[~test_df.index.isin(unseen_indices)].copy()
y_test = X_test['tenYearCHD']

# Select features using predefined list
X_test = X_test[SELECTED_FEATURES]

# Scale features
X_test_scaled = scaler.transform(X_test)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No CHD', 'CHD'],
            yticklabels=['No CHD', 'CHD'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Test Data')
plt.savefig('confusion_matrix.png')
plt.close()

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Key metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Interpretation
print("\nPERFORMANCE INTERPRETATION:")
print(f"- Model detects {recall*100:.1f}% of actual CHD cases (Recall)")
print(f"- {precision*100:.1f}% of CHD predictions are correct (Precision)")
print(f"- Overall balanced performance: {f1*100:.1f}% (F1 Score)")

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      3248
           1       0.66      0.08      0.14       566

    accuracy                           0.86      3814
   macro avg       0.76      0.54      0.53      3814
weighted avg       0.83      0.86      0.81      3814

Accuracy: 0.8571
Precision: 0.6567
Recall: 0.0777
F1 Score: 0.1390

PERFORMANCE INTERPRETATION:
- Model detects 7.8% of actual CHD cases (Recall)
- 65.7% of CHD predictions are correct (Precision)
- Overall balanced performance: 13.9% (F1 Score)


## 4. Prediction on Unseen Data

In [None]:
# PREDICTION ON UNSEEN DATA
import pandas as pd
import joblib

# Define selected features
SELECTED_FEATURES = ['age', 'sysBP', 'prevalentHyp', 'diaBP', 'diabetes', 'gender', 'BPMeds', 'totChol', 'BMI', 'glucose', 'prevalentStroke', 'cigsPerDay', 'educationLevel']

# Load artifacts
model = joblib.load('logistic_regression_model.pkl')
scaler = joblib.load('feature_scaler.pkl')
unseen_df = pd.read_csv('unseen_data.csv')

# Prepare unseen data
X_unseen = unseen_df[SELECTED_FEATURES]
y_unseen = unseen_df['tenYearCHD']
X_unseen_scaled = scaler.transform(X_unseen)

# Make predictions
y_unseen_pred = model.predict(X_unseen_scaled)
y_unseen_prob = model.predict_proba(X_unseen_scaled)[:, 1]  # Probability of CHD

# Create results dataframe
results_df = unseen_df.copy()
results_df['Predicted_CHD'] = y_unseen_pred
results_df['CHD_Probability'] = y_unseen_prob
results_df['Prediction_Correct'] = results_df['tenYearCHD'] == results_df['Predicted_CHD']

# Save results
results_df.to_csv('final_predictions.csv', index=False)

# Print results
print("Unseen Data Predictions:")
print(f"Number of cases: {len(unseen_df)}")
print(f"Actual CHD cases: {y_unseen.sum()} (1 = Yes)")
print(f"Predicted CHD cases: {y_unseen_pred.sum()} (1 = Yes)")
print(f"Accuracy: {(results_df['Prediction_Correct'].mean()*100):.1f}%")

# Create display-friendly version for sample output
display_df = results_df[['age', 'sysBP', 'glucose', 'tenYearCHD', 'Predicted_CHD', 'CHD_Probability']].copy()
display_df['Actual_CHD_Label'] = results_df['tenYearCHD'].map({0: 'No', 1: 'Yes'})
display_df['Predicted_CHD_Label'] = results_df['Predicted_CHD'].map({0: 'No', 1: 'Yes'})

print("\nSample predictions (0 = No, 1 = Yes):")
print(display_df[['age', 'sysBP', 'glucose', 'tenYearCHD', 'Actual_CHD_Label',
                 'Predicted_CHD', 'Predicted_CHD_Label', 'CHD_Probability']].head(10))

print("\nFinal predictions saved to 'final_predictions.csv'")

Unseen Data Predictions:
Number of cases: 424
Actual CHD cases: 64 (1 = Yes)
Predicted CHD cases: 8 (1 = Yes)
Accuracy: 84.4%

Sample predictions (0 = No, 1 = Yes):
        age     sysBP   glucose  tenYearCHD Actual_CHD_Label  Predicted_CHD  \
0 -0.534928 -0.698155 -1.087146           1              Yes              0   
1 -1.001610 -0.431744 -0.126777           0               No              0   
2 -0.068246  2.038609 -0.126777           0               No              0   
3  0.165095 -0.480183 -0.126777           0               No              0   
4 -0.068246 -1.158319  1.968573           0               No              0   
5 -1.118280  0.488583 -0.214084           0               No              0   
6 -1.351621 -0.698155 -0.126777           0               No              0   
7  0.981788  0.004200 -0.825227           1              Yes              0   
8 -0.651598 -0.770813  0.658979           0               No              0   
9 -1.001610 -1.206757  0.658979           0  