In [None]:
import pandas as pd

# Load raw dataset again (clean and consistent)
from sklearn.datasets import load_breast_cancer
# Pull the bundled breast-cancer dataset and assemble a single DataFrame:
# one column per feature, with the class label appended as "target".
cancer = load_breast_cancer()

df = pd.DataFrame(
    cancer.data,
    columns=cancer.feature_names,
).assign(target=cancer.target)

# Preview the first rows as the cell output.
df.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of every numeric column (features and "target");
# `corr` is reused by later cells.
corr = df.corr(numeric_only=True)

# Draw the matrix as a diverging heatmap centred on zero.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Heatmap (All Features + Target)")
fig.tight_layout()
plt.show()


In [None]:
# Correlation of each column with the label, ordered from largest to smallest.
# The "target" row itself is still included at this point.
target_corr = corr.loc[:, "target"].sort_values(ascending=False)

# Show the ten largest entries as the cell output.
target_corr.iloc[:10]


In [None]:
# Select the features that are genuinely positively correlated with the target.
# `target_corr` is sorted in descending order, so a plain `.head(10)` would
# silently include zero or negative correlations whenever fewer than ten
# features have a positive correlation, contradicting the chart title.
feature_corr = target_corr.drop("target")
top_pos = feature_corr[feature_corr > 0].head(10)

if top_pos.empty:
    # Nothing qualifies, so skip the chart rather than plot an empty series.
    print("No features are positively correlated with the target.")
else:
    fig, ax = plt.subplots(figsize=(10, 5))
    # Ascending order places the strongest correlation at the top of the chart.
    top_pos.sort_values().plot(kind="barh", ax=ax)
    # Report the real number of bars; this reads "Top 10" when ten qualify.
    ax.set_title(
        f"Top {len(top_pos)} Features Positively Correlated with Target (Benign=1)"
    )
    ax.set_xlabel("Correlation")
    fig.tight_layout()
    plt.show()


## Quick insight
- The heatmap shows strong correlations between related measurements (e.g., radius, perimeter, area).
- Several “worst” and “mean” features have noticeable correlation with the target, suggesting they may be useful for classification.


In [None]:
from pathlib import Path

# Output folder for figures, resolved relative to the working directory;
# created (with parents) if it does not exist yet.
fig_dir = Path("../reports/figures")
fig_dir.mkdir(parents=True, exist_ok=True)
heatmap_path = fig_dir / "correlation_heatmap.png"

# Redraw the correlation heatmap and write it to disk before displaying it.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Heatmap (All Features + Target)")
fig.tight_layout()
fig.savefig(heatmap_path, dpi=200)
plt.show()

print("Saved to:", heatmap_path)
