
# Week 1â€“2 Exploratory Data Analysis
This notebook documents the exploratory data analysis (EDA) for the health risk prediction project.
We inspect the cleaned dataset, quantify data quality, and generate visuals saved to the shared results folders.


## Setup

In [1]:

from pathlib import Path
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

plt.style.use('seaborn-v0_8')
sns.set_theme(style='whitegrid')

DATA_PATH = Path('../data/processed/health_clean.csv')
PLOTS_DIR = Path('../results/plots')
METRICS_DIR = Path('../results/metrics')
PLOTS_DIR.mkdir(parents=True, exist_ok=True)
METRICS_DIR.mkdir(parents=True, exist_ok=True)
TARGET_COLUMN = 'hltprhc'

print(f'Using dataset: {DATA_PATH.resolve()}')


Using dataset: /Users/peter/Desktop/AI_MLProjects_Research_Project/health_xai_project/data/processed/health_clean.csv


### Load Cleaned Dataset

In [None]:

df = pd.read_csv(DATA_PATH)
row_count, col_count = df.shape
print(f'Rows: {row_count} | Columns: {col_count}')
df.head()


### Dataset Overview

In [None]:

buffer = io.StringIO()
df.info(buf=buffer)
info_text = buffer.getvalue()
print(info_text)

numeric_desc = df.describe().T
display(numeric_desc)

categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
if categorical_cols:
    categorical_desc = df[categorical_cols].describe().T
    display(categorical_desc)
else:
    print('No categorical columns detected.')


### Checking Missing Values

In [None]:

missing_counts = df.isna().sum().sort_values(ascending=False)
missing_df = pd.DataFrame({
    'missing_count': missing_counts,
    'missing_pct': (missing_counts / row_count * 100).round(2)
})
display(missing_df)
missing_df.to_csv(METRICS_DIR / 'missing_values_summary.csv')


### Missing Value Heatmap

In [None]:

sample_size = min(1000, len(df))
subset = df.sample(n=sample_size, random_state=42) if len(df) > sample_size else df.copy()
plt.figure(figsize=(12, 6))
sns.heatmap(subset.isna(), cbar=False)
plt.title('Missing Values Heatmap (sampled)' if len(df) > sample_size else 'Missing Values Heatmap')
heatmap_path = PLOTS_DIR / 'missing_values_heatmap.png'
plt.tight_layout()
plt.savefig(heatmap_path)
plt.show()


### Target Distribution (hltprhc)

In [None]:

target_counts = df[TARGET_COLUMN].value_counts().sort_index()
target_pct = (target_counts / row_count * 100).round(2)
class_balance_df = pd.DataFrame({'count': target_counts, 'percentage': target_pct})
display(class_balance_df)
plt.figure(figsize=(6, 4))
sns.barplot(x=class_balance_df.index.astype(str), y='count', data=class_balance_df.reset_index(drop=True), palette='pastel')
plt.title('Class Balance for hltprhc')
plt.xlabel('hltprhc')
plt.ylabel('Count')
plt.tight_layout()
class_balance_path = PLOTS_DIR / 'class_balance.png'
plt.savefig(class_balance_path)
plt.show()


### Numeric Variable Distributions

In [None]:

numeric_cols = df.drop(columns=[TARGET_COLUMN], errors='ignore').select_dtypes(include=[np.number]).columns.tolist()
print(f'Numeric features ({len(numeric_cols)}): {numeric_cols}')
for column in numeric_cols:
    series = df[column].dropna()
    if series.empty:
        continue
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    sns.histplot(series, bins=30, kde=True, ax=axes[0], color='steelblue')
    axes[0].set_title(f'Histogram - {column}')
    axes[0].set_xlabel(column)
    sns.boxplot(x=series, ax=axes[1], color='salmon')
    axes[1].set_title(f'Boxplot - {column}')
    axes[1].set_xlabel(column)
    fig.suptitle(f'Distribution for {column}')
    fig.tight_layout()
    plot_path = PLOTS_DIR / f'{column}_distribution.png'
    fig.savefig(plot_path)
    plt.show()


### Categorical Variable Frequencies

In [None]:

categorical_cols = df.drop(columns=[TARGET_COLUMN], errors='ignore').select_dtypes(exclude=[np.number]).columns.tolist()
print(f'Categorical features ({len(categorical_cols)}): {categorical_cols}')
if categorical_cols:
    for column in categorical_cols:
        counts = df[column].value_counts().head(20)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=counts.values, y=counts.index, palette='viridis')
        plt.title(f'Top Categories - {column}')
        plt.xlabel('Frequency')
        plt.ylabel(column)
        plot_path = PLOTS_DIR / f'{column}_frequency.png'
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.show()
else:
    print('No categorical variables detected.')


### Correlation Matrix and Heatmap

In [None]:

import pandas as pd  # reaffirm for clarity when re-running cells individually
target_corr = pd.Series(dtype=float)
if numeric_cols:
    corr = df[numeric_cols + [TARGET_COLUMN]].corr()
    display(corr)
    plt.figure(figsize=(14, 12))
    sns.heatmap(corr, cmap='coolwarm', linewidths=0.5, square=True)
    plt.title('Correlation Heatmap')
    corr_path = PLOTS_DIR / 'correlation_heatmap.png'
    plt.tight_layout()
    plt.savefig(corr_path)
    plt.show()
    target_corr = corr[TARGET_COLUMN].drop(labels=[TARGET_COLUMN]).sort_values(key=lambda s: s.abs(), ascending=False)
    display(pd.DataFrame({'correlation_with_target': target_corr}))
else:
    print('Correlation matrix requires numeric features.')


### Multicollinearity Check (VIF)

In [None]:

def compute_vif(dataframe):
    if dataframe.shape[1] < 2:
        return pd.DataFrame()
    imputer = SimpleImputer(strategy='median')
    numeric_imputed = pd.DataFrame(
        imputer.fit_transform(dataframe), columns=dataframe.columns
    )
    vif_records = []
    for column in numeric_imputed.columns:
        y = numeric_imputed[column]
        X = numeric_imputed.drop(columns=[column])
        if X.empty:
            vif = np.nan
        else:
            model = LinearRegression()
            model.fit(X, y)
            r_squared = model.score(X, y)
            vif = np.inf if r_squared >= 1 else 1.0 / max(1 - r_squared, 1e-6)
        vif_records.append({'feature': column, 'vif': round(float(vif), 3)})
    return pd.DataFrame(vif_records)

vif_df = compute_vif(df[numeric_cols]) if numeric_cols else pd.DataFrame()
if not vif_df.empty:
    display(vif_df)
    vif_df.to_csv(METRICS_DIR / 'vif_summary.csv', index=False)
else:
    print('Not enough numeric features to compute VIF.')


### Outlier Summary (IQR Method)

In [None]:

iqr_records = []
for column in numeric_cols:
    series = df[column].dropna()
    if series.empty:
        continue
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    outlier_mask = (series < lower) | (series > upper)
    iqr_records.append({
        'feature': column,
        'iqr': round(iqr, 3),
        'lower_bound': round(lower, 3),
        'upper_bound': round(upper, 3),
        'outlier_count': int(outlier_mask.sum()),
        'outlier_pct': round(outlier_mask.mean() * 100, 2),
    })

iqr_df = pd.DataFrame(iqr_records)
if not iqr_df.empty:
    display(iqr_df)
    iqr_df.to_csv(METRICS_DIR / 'outlier_summary_notebook.csv', index=False)
else:
    print('No numeric features available for IQR-based outlier detection.')


### EDA Summary Table

In [None]:

overall_missing_pct = round((df.isna().sum().sum() / (row_count * col_count)) * 100, 2)
class_balance_summary = '; '.join([f"{int(cls)}: {pct}%" for cls, pct in target_pct.items()])
top_corr_summary = ''
if not target_corr.empty:
    top_corr_summary = '; '.join([f"{idx + 1}. {feature} ({value:.3f})" for idx, (feature, value) in enumerate(target_corr.head(3).items())])
eda_summary = pd.DataFrame([
    {'metric': 'row_count', 'value': row_count},
    {'metric': 'column_count', 'value': col_count},
    {'metric': 'overall_missing_pct', 'value': overall_missing_pct},
    {'metric': 'class_balance', 'value': class_balance_summary},
])
if top_corr_summary:
    eda_summary = pd.concat([eda_summary, pd.DataFrame([{'metric': 'top_correlations', 'value': top_corr_summary}])], ignore_index=True)
display(eda_summary)
eda_summary.to_csv(METRICS_DIR / 'eda_summary.csv', index=False)


### Summary of EDA Findings

In [None]:

summary_points = [
    f'Dataset contains {row_count} rows and {col_count} columns with an overall missing rate of {overall_missing_pct}%.',
    f'Class balance for hltprhc shows: {class_balance_summary}.',
]
if top_corr_summary:
    summary_points.append(f'Top correlated features with hltprhc: {top_corr_summary}.')
if not iqr_df.empty:
    top_outliers = iqr_df.sort_values(by='outlier_pct', ascending=False).head(3)
    formatted_outliers = '; '.join([f"{row.feature} ({row.outlier_pct}% outliers)" for row in top_outliers.itertuples()])
    summary_points.append(f'Outlier check (IQR) highlights: {formatted_outliers}.')
display(Markdown('
'.join(f'- {point}' for point in summary_points)))
