In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# =========================================================================
# 3. DATA SPLITTING
# =========================================================================

# Flatten the target to a 1-D label vector. The same vector is handed to
# `stratify`, so the split is stratified on these labels.
y_flat = y.values.ravel()

# Hold out 20% of the rows as the test partition; the fixed seed keeps the
# partition reproducible between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X_encoded,
    y_flat,
    stratify=y_flat,
    test_size=0.2,
    random_state=42,
)

print(f"Data Split: X_train shape {X_train.shape}, X_test shape {X_test.shape}")

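# =========================================================================
# 4. VISUALIZATION AND PCA ANALYSIS
# =========================================================================

# --- A. Correlation and Distribution Plots ---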
# Build a plotting frame that holds every encoded feature plus the label, so
# feature-vs-target correlations can be read from a single table.
df_corr_viz = X_encoded.copy()
df_corr_viz['target'] = y.values.ravel()  # flatten the target to 1-D before attaching it

# Correlation of every column with 'target' (pandas default method), sorted
# from most positive to most negative. The 'target' row itself is retained.
target_corr = df_corr_viz.corr()[['target']].sort_values(by='target', ascending=False)

# Single-column heatmap of the coefficients.
plt.figure(figsize=(8, 15))  # tall figure so each feature label gets its own readable row
sns.heatmap(
    target_corr,
    annot=True,          # print the coefficient inside each cell
    cmap='coolwarm',     # diverging map: warm = positive, cool = negative
    # Pin the colour scale to the full correlation range and centre it on 0.
    # Without this the neutral colour sits at the midpoint of the observed
    # values, so weakly positive coefficients could be drawn in "cool" colours.
    vmin=-1,
    vmax=1,
    center=0,
    fmt=".2f",           # two decimal places
    linewidths=.5,       # thin separators between cells
    cbar=False           # one annotated column; a colour bar adds nothing
)
plt.title('Feature Correlation with Heart Disease Target', fontsize=16)
plt.yticks(rotation=0)  # keep feature names horizontal
plt.show()

# Also print the ten highest coefficients as plain numbers.
print("\n--- Top Correlation Values with Target ---")
print(target_corr.head(10))

# Histogram of the 'age' column (20 bins) with a KDE curve overlaid.
# NOTE(review): the title/label text assumes 'age' was standardized upstream — confirm.
age_ax = plt.figure(figsize=(8, 6)).add_subplot(111)
sns.histplot(
    data=df_corr_viz,
    x='age',
    bins=20,
    kde=True,
    color='darkgreen',
    ax=age_ax,
)
age_ax.set_title('Standardized Age Distribution', fontsize=16)
age_ax.set_xlabel('Age (Scaled)')
plt.show()


# Horizontal boxplots, one per column listed in `numerical_cols`.
box_ax = plt.figure(figsize=(14, 6)).add_subplot(111)
sns.boxplot(
    data=df_corr_viz[numerical_cols],
    orient='h',
    palette='Set2',
    ax=box_ax,
)
box_ax.set_title('Boxplot of Standardized Numerical Features', fontsize=16)
plt.show()

# --- B. PCA Analysis to Find Optimal Components ---
# Fit an untruncated PCA on the training partition only.
pca_full = PCA(n_components=None, random_state=42).fit(X_train)

# Running total of the explained-variance ratios, one entry per component.
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()

# Position of the first running total that reaches 0.95, converted from a
# 0-based index to a component count.
n_components_95 = np.flatnonzero(cumulative_variance >= 0.95)[0] + 1
print(f"Optimal number of components to retain 95% variance: {n_components_95}")

# --- C. Cumulative Explained Variance Plot ---
component_counts = np.arange(1, cumulative_variance.size + 1)
variance_ax = plt.figure(figsize=(10, 6)).add_subplot(111)
variance_ax.plot(
    component_counts,
    cumulative_variance,
    marker='o',
    linestyle='--',
    color='blue',
)
# Reference lines: the 95% threshold and the component count that reaches it.
variance_ax.axhline(y=0.95, color='r', linestyle='-', label='95% Cutoff')
variance_ax.axvline(
    x=n_components_95,
    color='g',
    linestyle='--',
    label=f'Optimal {n_components_95} Components',
)
variance_ax.set_title('Cumulative Explained Variance by Principal Components', fontsize=14)
variance_ax.set_xlabel('Number of Principal Components')
variance_ax.set_ylabel('Cumulative Explained Variance Ratio')
variance_ax.grid(True)
variance_ax.legend()
plt.show()

# --- D. PC1 vs PC2 Scatter Plot ---
# Project the training partition onto its first two principal components.
pca_2 = PCA(n_components=2, random_state=42)
X_train_pca_2 = pca_2.fit_transform(X_train)

# Column 0 holds the PC1 scores and column 1 the PC2 scores.
pc1_scores, pc2_scores = X_train_pca_2.T

# One point per training row, coloured by its label.
scatter_ax = plt.figure(figsize=(8, 6)).add_subplot(111)
sns.scatterplot(
    x=pc1_scores,
    y=pc2_scores,
    hue=y_train,
    palette='viridis',
    legend='full',
    ax=scatter_ax,
)
scatter_ax.set_title('Visualization of Training Data in PC1 vs PC2 Space', fontsize=14)
scatter_ax.set_xlabel('Principal Component 1')
scatter_ax.set_ylabel('Principal Component 2')
plt.show()


NameError: name 'y' is not defined