# Cardiovascular Risk Prediction - Exploratory Data Analysis

This notebook explores the cardiovascular dataset and provides insights for model development.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: dark-grid matplotlib style with seaborn's 'husl' palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# DataFrame rendering: never truncate columns, show at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print('Libraries imported successfully!')

In [None]:
# Read the cardiovascular CSV (path is relative to the notebook's directory)
data_path = '../data/cardiovascular_data.csv'
df = pd.read_csv(data_path)

# Report (rows, columns), then preview the first records as the cell output
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
df.head(5)

## 1. Dataset Overview

In [None]:
# Column dtypes / non-null counts, followed by descriptive statistics
section_rule = "=" * 50

print("Dataset Information:", section_rule, sep="\n")
df.info()

print("\nStatistical Summary:", section_rule, sep="\n")
df.describe()

In [None]:
# Missing-value audit: per-column null count and its share of all rows
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# Keep only the columns that actually have gaps, worst first
missing_df = (
    pd.concat(
        [missing_values.rename('Missing_Count'),
         missing_percentage.rename('Percentage')],
        axis=1,
    )
    .loc[lambda table: table['Missing_Count'] > 0]
    .sort_values('Percentage', ascending=False)
)

if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print("Missing Values:")
    print(missing_df)

    # Horizontal bars: one per affected feature
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_df['Percentage'].plot(kind='barh', ax=ax)
    ax.set_xlabel('Percentage (%)')
    ax.set_title('Missing Values by Feature')
    plt.show()

## 2. Target Variable Analysis

In [None]:
# Analyze target variable (TenYearCHD)
target_col = 'TenYearCHD'

if target_col in df.columns:
    # value_counts() orders its result by frequency, not by label. Sort by
    # label so position 0 is class 0 ("No Risk") and position 1 is class 1
    # ("Risk"); the fixed colour and label lists below rely on that order and
    # would otherwise be swapped whenever class 1 is the more frequent class.
    target_counts = df[target_col].value_counts().sort_index()
    target_percentages = df[target_col].value_counts(normalize=True).sort_index() * 100
    
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    
    # Count plot: bars are positioned at the class labels (0 and 1)
    axes[0].bar(target_counts.index, target_counts.values, color=['green', 'red'])
    axes[0].set_xlabel('10-Year CHD Risk')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Target Variable Distribution')
    axes[0].set_xticks([0, 1])
    axes[0].set_xticklabels(['No Risk', 'Risk'])
    
    # Pie chart: wedges follow the label-sorted order of target_counts
    axes[1].pie(target_counts.values, labels=['No Risk', 'Risk'], 
                autopct='%1.1f%%', colors=['green', 'red'])
    axes[1].set_title('Target Variable Proportion')
    
    plt.tight_layout()
    plt.show()
    
    # Text summary; the ratio is printed as positives : negatives (1 : N)
    print(f"Class Distribution:")
    print(f"No Risk: {target_counts[0]} ({target_percentages[0]:.2f}%)")
    print(f"Risk: {target_counts[1]} ({target_percentages[1]:.2f}%)")
    print(f"\nClass Imbalance Ratio: 1:{target_counts[0]/target_counts[1]:.2f}")

## 3. Feature Distribution Analysis

In [None]:
# Analyze numerical features: every numeric column except the target
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numerical_cols:
    numerical_cols.remove(target_col)

# Size the grid from the number of features instead of assuming 16:
# four panels per row, as many rows as needed (at least one so that
# plt.subplots never receives a zero-row grid). With 13-16 features this
# yields the same 4x4 / (20, 16) figure as a fixed grid would.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(numerical_cols) / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(20, 4 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    # Histogram of the observed (non-null) values
    axes[idx].hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    axes[idx].set_xlabel(col)
    axes[idx].set_ylabel('Frequency')
    axes[idx].set_title(f'Distribution of {col}')
    
    # Add mean line
    mean_val = df[col].mean()
    axes[idx].axvline(mean_val, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean_val:.2f}')
    axes[idx].legend()

# Hide the panels left over in the last row so no empty axes are drawn
for unused_ax in axes[len(numerical_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Correlation matrix over the numeric features plus the target
correlation_matrix = df[numerical_cols + [target_col]].corr()

# Plot correlation heatmap; the mask hides the strictly-upper triangle so each
# pair appears once (the diagonal stays visible because k=1)
plt.figure(figsize=(14, 12))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt='.2f', 
            cmap='coolwarm', center=0, square=True, linewidths=1,
            cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix', fontsize=16)
plt.tight_layout()
plt.show()

# Features most correlated with target.
# The target's self-correlation is removed by label rather than by position,
# and the ranking uses the absolute value so that strong negative
# relationships are not pushed to the bottom of the list. The printed values
# keep their sign.
target_correlations = (
    correlation_matrix[target_col]
    .drop(labels=target_col)
    .sort_values(key=lambda values: values.abs(), ascending=False)
)
print(f"\nTop 10 Features Correlated with {target_col}:")
print(target_correlations.head(10))

## 5. Feature Importance Analysis

In [None]:
# Quick feature importance using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler  # NOTE(review): not used in this cell

# Feature matrix: numeric columns with gaps filled by each column's median
column_medians = df[numerical_cols].median()
X = df[numerical_cols].fillna(column_medians)
y = df[target_col]

# Fit a 100-tree forest with a fixed seed so the ranking is repeatable
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Pair each feature with its importance score, highest first
feature_importance = pd.DataFrame(
    {'feature': numerical_cols, 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 strongest features, best at the top
top_features = feature_importance.head(15)
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Feature Importances (Random Forest)')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print("Top 10 Important Features:")
print(feature_importance.head(10))

## 6. Outlier Detection

In [None]:
# Detect outliers using IQR method
def _iqr_outlier_count(values):
    """Return how many entries of `values` lie outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR."""
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    return int(((values < low_fence) | (values > high_fence)).sum())


# Feature name -> number of rows flagged by the IQR rule
outlier_summary = {name: _iqr_outlier_count(df[name]) for name in numerical_cols}

# Tabulate counts and their share of all rows, most affected features first
outlier_df = pd.DataFrame({
    'Feature': list(outlier_summary.keys()),
    'Outlier_Count': list(outlier_summary.values()),
})
outlier_df['Outlier_Percentage'] = (outlier_df['Outlier_Count'] / len(df)) * 100
outlier_df = outlier_df.sort_values('Outlier_Percentage', ascending=False)

# Bar chart of the ten features with the largest outlier share
worst_ten = outlier_df.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(worst_ten['Feature'], worst_ten['Outlier_Percentage'])
ax.set_xlabel('Feature')
ax.set_ylabel('Outlier Percentage (%)')
ax.set_title('Top 10 Features with Outliers')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

print("Outlier Summary (Top 10):")
print(worst_ten)

## 7. Recommendations

In [None]:
# Closing recap: dataset size, headline findings, and suggested next steps.
print("="*60)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print("="*60)

print("\n📊 Dataset Overview:")
print(f"  • Total samples: {len(df)}")
print(f"  • Total features: {df.shape[1]}")
print(f"  • Numerical features: {len(numerical_cols)}")
print(f"  • Target variable: {target_col}")

print("\n⚠️ Key Findings:")
# Recompute the positive-class share here instead of reading
# `target_percentages`: that name is only created inside the
# `if target_col in df.columns` branch of the target-analysis cell, so relying
# on it can raise NameError. `.get(1, 0.0)` also covers data with no positives.
if target_col in df.columns:
    risk_percentage = df[target_col].value_counts(normalize=True).get(1, 0.0) * 100
    print(f"  • Class imbalance detected (Risk cases: {risk_percentage:.2f}%)")
else:
    print(f"  • Target column '{target_col}' not found; class balance not assessed")
print(f"  • Missing values in {len(missing_df)} features")
print(f"  • Outliers detected in multiple features")

print("\n💡 Recommendations:")
print("  1. Handle class imbalance using SMOTE or class weights")
print("  2. Impute missing values using appropriate strategies")
print("  3. Consider outlier treatment for robust modeling")
print("  4. Focus on top correlated features for initial models")
print("  5. Apply feature scaling for distance-based algorithms")
print("  6. Consider feature engineering for age groups and risk categories")

print("\n✅ Next Steps:")
print("  • Proceed with data preprocessing")
print("  • Implement feature engineering")
print("  • Train multiple models and compare performance")