In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.express as px

from sqlalchemy import create_engine, Column, Integer, Float, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Stage the raw CSV in a local SQLite database, then read it back through SQL
# so the rest of the notebook works from the database copy.
heart_data = pd.read_csv('heart_disease_uci.csv')
engine = create_engine('sqlite:///heart_disease.db')
heart_data.to_sql(name='heart_disease', con=engine, if_exists='replace', index=False)

query = "SELECT * FROM heart_disease"
df = pd.read_sql(sql=query, con=engine)

# Quick size/schema summary of the table that was read back.
print(f"Number of rows: {len(df)}")
print(f"Number of columns: {len(df.columns)}")
print(f"Columns: {df.columns.tolist()}")

FileNotFoundError: [Errno 2] No such file or directory: 'heart_disease_uci.csv'

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# Peek at the first five rows.
df.head(5)

In [None]:
# Column dtypes and non-null counts.
df.info()

## Handling Missing values

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Partition the column names by dtype: numeric columns vs. everything else.
# Both results are pandas Index objects, not lists.
numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(exclude=[np.number]).columns
# Display the numeric column names as the cell output.
numeric_cols

In [None]:
# Display the non-numeric column names.
categorical_cols

In [None]:
# Impute every numeric column with its own column mean.
numeric_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(value=numeric_means)

In [None]:
# Remaining missing values per numeric column after imputation.
df[numeric_cols].isnull().sum()

In [None]:
# Impute each non-numeric column with its most frequent value.
for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when the column holds no non-null values; indexing its
    # first element would then raise, so such a column is left untouched.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Remaining missing values per non-numeric column after imputation.
df[categorical_cols].isnull().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

## Handling outliers

In [None]:
# One interactive box plot per numeric column, used to eyeball outliers.
for col in numeric_cols:
    fig = px.box(data_frame=df, y=col, title=f'Box Plot of {col}')
    fig.show()

In [None]:
# Keep only rows whose resting blood pressure lies in [92, 170] (bounds inclusive).
df = df.loc[df['trestbps'].between(92, 170)]

In [None]:
# Re-draw the trestbps box plot on the filtered frame.
fig = px.box(data_frame=df, y='trestbps', title='Box Plot of trestbps')
fig.show()

In [None]:
# Keep only rows whose cholesterol lies in [117, 369] (bounds inclusive).
df = df.loc[df['chol'].between(117, 369)]

In [None]:
# Re-draw the chol box plot on the filtered frame.
fig = px.box(data_frame=df, y='chol', title='Box Plot of chol')
fig.show()

In [None]:
# Keep only rows with thalch of at least 80 (lower bound only).
df = df.loc[df['thalch'].ge(80)]

In [None]:
# Re-draw the thalch box plot on the filtered frame.
fig = px.box(data_frame=df, y='thalch', title='Box Plot of thalch')
fig.show()

In [None]:
# Keep only rows whose oldpeak lies in [-2, 3.7] (bounds inclusive).
df = df.loc[df['oldpeak'].between(-2, 3.7)]

In [None]:
# Re-draw the oldpeak box plot on the filtered frame. The title follows the
# same 'Box Plot of <column>' pattern as the other box plots in this section.
fig = px.box(df, y='oldpeak', title='Box Plot of oldpeak')
fig.show()

In [None]:
# Summary statistics of the numeric columns after the outlier filters.
df.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns (per-column zero mean and unit variance).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numeric_cols])  # numeric_cols: Index of numeric column names
# Reuse df's index: rows were filtered above without resetting the index, so a
# default RangeIndex here would no longer line up with df row-for-row.
df_scaled = pd.DataFrame(scaled_data, columns=numeric_cols, index=df.index)

In [None]:
# Number of fully duplicated rows remaining after filtering.
df.duplicated().sum()

## Analyze schema

In [None]:
# df.info() prints its report directly and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Grid of histograms, one panel per numeric column.
numeric_frame = df.loc[:, numeric_cols]
numeric_frame.hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay for each numeric column, one figure per column.
for col in numeric_cols:
    # Skip names that are no longer present in the frame.
    if col not in df.columns:
        continue
    hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, bins=20, color='skyblue', ax=hist_ax)
    hist_ax.set_title(f'Distribution of {col}')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')
    plt.tight_layout()
    plt.show()

In [None]:
# Columns treated as categorical from here on (replaces the dtype-derived list).
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Bar chart of category frequencies, one figure per categorical column.
for col in categorical_cols:
    count_fig, count_ax = plt.subplots(figsize=(5, 4))
    sns.countplot(x=df[col], palette='pastel', ax=count_ax)
    count_ax.set_title(f'Count of Categories in {col}')
    count_ax.set_xlabel(col)
    count_ax.set_ylabel('Count')
    plt.tight_layout()
    plt.show()

In [None]:
# Average every int64/float64 feature within each 'num' group. 'num' is the
# grouping key, so it is left out of the averaged columns: its per-group mean
# is just the group label itself and would only add a meaningless bar.
feature_cols = [
    name
    for name in df.select_dtypes(include=['int64', 'float64']).columns
    if name != 'num'
]
grouped = df.groupby('num')[feature_cols].mean().T
grouped.plot(kind='bar', figsize=(10, 6), colormap='viridis')
plt.title('Mean Values of Features by Heart Disease Presence (num)')
plt.ylabel('Mean Value')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Collapse the target to binary: 1 for any value above zero, otherwise 0.
df['num'] = (df['num'] > 0).astype('int64')

## 2. QA & visualizations 

### Question 1: What is the distribution of heart disease cases in the dataset?

In [None]:
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")

# Create pie chart of heart disease distribution.
# value_counts() orders its result by frequency, not by class value, so the
# counts are pinned to the order [0, 1]; otherwise the labels, colors and
# explode below would attach to the wrong slice whenever class 1 is the
# more frequent one. Assumes 'num' has already been binarized to 0/1.
labels = ['No Heart Disease', 'Heart Disease']
sizes = df['num'].value_counts().reindex([0, 1], fill_value=0)
colors = ['#66b3ff', '#ff9999']
explode = (0, 0.1)  # explode the 2nd slice (Heart Disease)

plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
plt.title('Distribution of Heart Disease Cases', fontsize=16)
plt.tight_layout()
plt.savefig('heart_disease_distribution.png')
plt.show()

### Question 2: How does heart disease prevalence differ between males and females?

In [None]:
gender_fig, gender_ax = plt.subplots(figsize=(10, 6))

# Bars: number of patients per sex, split by diagnosis
sns.countplot(data=df, x='sex', hue='num', palette='pastel', ax=gender_ax)

# Titles, axis labels and legend
gender_ax.set_title('Heart Disease Cases by Gender', fontsize=16)
gender_ax.set_xlabel('Gender (0 = Female, 1 = Male)', fontsize=14)
gender_ax.set_ylabel('Count', fontsize=14)
gender_ax.legend(labels=['No Heart Disease', 'Heart Disease'], title='Diagnosis')

# Write each bar's height just above the bar
for bar in gender_ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    gender_ax.annotate(f'{bar_height}',
                       (bar_center, bar_height),
                       ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.savefig('gender_heart_disease.png')
plt.show()

### Question 3: How does age relate to heart disease?

In [None]:
plt.figure(figsize=(12, 6))

# Map the binary target to readable names and let seaborn build the legend
# itself. Overriding the labels afterwards with plt.legend(labels=...) pairs
# them with artists in draw order, which for a stacked histplot need not match
# the hue order and can swap the two class names.
# Assumes 'num' has already been binarized to 0/1.
diagnosis_names = {0: 'No Heart Disease', 1: 'Heart Disease'}
plot_df = df.assign(Diagnosis=df['num'].map(diagnosis_names))

# Create a histogram of age distribution colored by heart disease status
sns.histplot(data=plot_df, x='age', hue='Diagnosis',
             hue_order=list(diagnosis_names.values()),
             bins=10, multiple='stack', palette='Set2')

# Add labels and title (the legend title comes from the hue column name)
plt.title('Age Distribution by Heart Disease Status', fontsize=16)
plt.xlabel('Age', fontsize=14)
plt.ylabel('Count', fontsize=14)

plt.tight_layout()
plt.savefig('age_distribution.png')
plt.show()

### Question 4: How does maximum heart rate compare between patients with and without heart disease?


In [None]:
hr_fig, hr_ax = plt.subplots(figsize=(10, 6))

# Box per diagnosis class, with the raw observations overlaid as faint dots
sns.boxplot(data=df, x='num', y='thalch', palette='Set3', ax=hr_ax)
sns.stripplot(data=df, x='num', y='thalch',
              color='black', alpha=0.3, size=4, ax=hr_ax)

# Title and axis labels
hr_ax.set_title('Maximum Heart Rate by Heart Disease Status', fontsize=16)
hr_ax.set_xlabel('Heart Disease Diagnosis (0 = No, 1 = Yes)', fontsize=14)
hr_ax.set_ylabel('Maximum Heart Rate', fontsize=14)

plt.tight_layout()
plt.savefig('max_heart_rate.png')
plt.show()

### Question 5: What is the relationship between cholesterol levels and heart disease?


In [None]:
chol_fig, chol_ax = plt.subplots(figsize=(10, 6))

# Violin per diagnosis class, with the raw observations overlaid as faint dots
sns.violinplot(data=df, x='num', y='chol', palette='Pastel1', ax=chol_ax)
sns.stripplot(data=df, x='num', y='chol',
              color='black', alpha=0.3, size=4, ax=chol_ax)

# Title and axis labels
chol_ax.set_title('Cholesterol Levels by Heart Disease Status', fontsize=16)
chol_ax.set_xlabel('Heart Disease Diagnosis (0 = No, 1 = Yes)', fontsize=14)
chol_ax.set_ylabel('Serum Cholesterol (mg/dl)', fontsize=14)

plt.tight_layout()
plt.savefig('cholesterol_levels.png')
plt.show()

## 2.3 Basic visualization


In [None]:
# 1. Sex Distribution
# Draw on a dedicated figure: this cell runs on its own, so a (2, 3, k)
# subplot slot would leave a single small panel in an otherwise empty grid.
fig, ax = plt.subplots(figsize=(6, 6))
sex_counts = df['sex'].value_counts()
# Label each slice with its own category value. value_counts() sorts by
# frequency, so a hard-coded ['Female', 'Male'] list can end up attached to
# the wrong slices.
ax.pie(sex_counts, labels=sex_counts.index, autopct='%1.1f%%', startangle=90)
ax.set_title('Sex Distribution')
plt.show()

In [None]:
# 2. Age Distribution
# Draw on a dedicated, panel-sized figure: this cell runs on its own, so a
# (2, 3, 1) slot of a 14x10 grid would leave one small panel in an otherwise
# empty canvas.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['age'], kde=True, bins=20, ax=ax)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# 3. Chest Pain Type Distribution
# Draw on a dedicated figure: this cell runs on its own, so a (2, 3, k)
# subplot slot would leave a single small panel in an otherwise empty grid.
fig, ax = plt.subplots(figsize=(8, 5))
cp_counts = df['cp'].value_counts().sort_index()
sns.barplot(x=cp_counts.index, y=cp_counts.values, ax=ax)
ax.set_title('Chest Pain Type Distribution')
ax.set_xlabel('Chest Pain Type')
ax.set_ylabel('Count')
# Keep the tick labels seaborn derives from the actual 'cp' values. A
# hard-coded four-name list applied by position mislabels bars whenever the
# sorted categories are not in exactly that order, or fewer than four
# categories remain after filtering.
plt.show()

In [None]:
# 4. Resting Blood Pressure Distribution
# Draw on a dedicated figure: this cell runs on its own, so a (2, 3, k)
# subplot slot would leave a single small panel in an otherwise empty grid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['trestbps'], kde=True, bins=20, ax=ax)
ax.set_title('Resting Blood Pressure Distribution')
ax.set_xlabel('Resting Blood Pressure (mm Hg)')
plt.show()

In [None]:
# 5. Cholesterol Distribution
# Draw on a dedicated figure: this cell runs on its own, so a (2, 3, k)
# subplot slot would leave a single small panel in an otherwise empty grid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['chol'], kde=True, bins=20, ax=ax)
ax.set_title('Cholesterol Distribution')
ax.set_xlabel('Serum Cholesterol (mg/dl)')
plt.show()

## 2.4 Advanced visualization


In [None]:
# Pairwise correlation heatmap of the numeric columns (needs at least two).
if len(numeric_cols) > 1:
    correlation_matrix = df[numeric_cols].corr()
    tick_positions = range(len(correlation_matrix))

    plt.figure(figsize=(12, 10))
    plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='none', aspect='auto')
    plt.colorbar()
    plt.xticks(tick_positions, correlation_matrix.columns, rotation=90)
    plt.yticks(tick_positions, correlation_matrix.columns)

    # Write every coefficient into its cell; strong correlations (|r| > 0.5)
    # get white text, the rest black.
    for (row, column), coefficient in np.ndenumerate(correlation_matrix.to_numpy()):
        text_color = 'white' if abs(coefficient) > 0.5 else 'black'
        plt.text(column, row, f'{coefficient:.2f}',
                 ha='center', va='center', color=text_color)

    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    plt.show()

In [None]:
# 2. Age vs. Max Heart Rate Colored by Disease Status
plt.figure(figsize=(12, 8))

# Map the binary target to readable names and let seaborn build the legend.
# Overriding the labels with plt.legend(labels=...) pairs them with whatever
# artists matplotlib finds on the axes, which for a hue-coloured scatterplot
# need not be one handle per class in hue order.
# Assumes 'num' has already been binarized to 0/1.
status_names = {0: 'Absent', 1: 'Present'}
plot_df = df.assign(**{'Heart Disease': df['num'].map(status_names)})

sns.scatterplot(x='age', y='thalch', hue='Heart Disease',
                hue_order=list(status_names.values()),
                data=plot_df, palette='viridis', s=100, alpha=0.7)
plt.title('Age vs. Maximum Heart Rate by Disease Status')
plt.xlabel('Age')
plt.ylabel('Maximum Heart Rate')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('age_vs_heartrate_scatter.png')
plt.show()

## 3. Modelling

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
)


# Integer-encode every object (string) column, using a fresh encoder per column
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Target vector and feature matrix (everything except the target)
y = df['num']
X = df.drop(columns='num')

print("\nFeature set shape:", X.shape)
print("Target set shape:", y.shape)

In [None]:
# Hold out 20% of the rows for testing, stratified on the target, fixed seed.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
print("\nTraining set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# Baseline pipeline: standardize the features, then fit a logistic regression.
scaler_step = ('scaler', StandardScaler())
classifier_step = ('classifier', LogisticRegression(max_iter=1000, random_state=42))
pipe = Pipeline(steps=[scaler_step, classifier_step])

In [None]:
# Confirm that no missing values remain before fitting.
df.isnull().sum()

In [None]:
# Train the baseline pipeline on the training split.
pipe.fit(X_train, y_train)

# Hard class predictions, plus the probability of the positive class
# (second column of predict_proba) for the test split.
y_pred = pipe.predict(X_test)
test_probabilities = pipe.predict_proba(X_test)
y_pred_prob = test_probabilities[:, 1]

In [None]:
# Performance metrics: compute the confusion matrix, then print it under a banner.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("\n===== BASELINE MODEL PERFORMANCE =====")
print("\nConfusion Matrix:")
print(cm)



In [None]:
# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('confusion_matrix.png')
# Render the figure inline after saving it. Closing it here instead meant the
# baseline heatmap was written to disk but never displayed, unlike the other
# confusion-matrix cells in this notebook.
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Headline scores for the baseline model on the test split.
# ROC AUC is computed from the positive-class probabilities, the rest from
# the hard predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

print("\nSummary Metrics:")
summary_rows = [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
]
for metric_name, metric_value in summary_rows:
    print(f"{metric_name}: {metric_value:.4f}")


## 3.2 Model Experimentation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters and a fixed seed, fitted in place.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard predictions and positive-class probabilities on the test split.
y_pred = dt_model.predict(X_test)
dt_probabilities = dt_model.predict_proba(X_test)
y_proba = dt_probabilities[:, 1]

In [None]:
# Test-set accuracy of the decision tree.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nDecision Tree Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:")
print(report_text)

In [None]:
# Confusion matrix for the decision tree, drawn as an annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_title('Confusion Matrix - Decision Tree')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyperparameters and a fixed seed, fitted in place.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard predictions and positive-class probabilities on the test split.
y_pred = rf_model.predict(X_test)
rf_probabilities = rf_model.predict_proba(X_test)
y_proba = rf_probabilities[:, 1]

In [None]:
# Test-set accuracy of the random forest.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nRandom Forest Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:")
print(report_text)

In [None]:
# Confusion matrix for the random forest, drawn as an annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_title('Confusion Matrix - Random Forest')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# Feature scaling (important for SVM): the scaler learns its statistics from
# the training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default SVC, with probability estimation switched on so predict_proba works.
svm_model = SVC(probability=True, random_state=42).fit(X_train_scaled, y_train)

# Hard predictions and positive-class probabilities on the scaled test split.
y_pred = svm_model.predict(X_test_scaled)
svm_probabilities = svm_model.predict_proba(X_test_scaled)
y_proba = svm_probabilities[:, 1]

In [None]:
# Test-set accuracy of the SVM.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nSVM Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:")
print(report_text)

In [None]:
# Confusion matrix for the SVM, drawn as an annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_title('Confusion Matrix - SVM')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
pip install xgboost

In [None]:
import xgboost as xgb 
# Gradient-boosted trees with a fixed seed, evaluated with log-loss.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Hard predictions and positive-class probabilities on the test split.
y_pred = xgb_model.predict(X_test)
xgb_probabilities = xgb_model.predict_proba(X_test)
y_proba = xgb_probabilities[:, 1]

In [None]:
# Test-set accuracy of the XGBoost model.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nXGBoost Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:")
print(report_text)


In [None]:
# Confusion matrix for the XGBoost model, drawn as an annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_title('Confusion Matrix - XGBoost')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV



# Candidate random-forest settings; every combination is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search scored by accuracy with 5-fold cross-validation, using
# all available cores. Runs on the training split only.
grid_search = GridSearchCV(rf_model, param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated score.
print("Best parameters (GridSearchCV):", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
