# 🧠 Breast Cancer Diagnosis - Exploratory Data Analysis
Exploring the Wisconsin Breast Cancer dataset to identify patterns and prepare it for machine learning.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to all subsequent plots.
sns.set(style="whitegrid")

## 📥 Load Dataset

In [None]:
# Read the raw CSV into the DataFrame that every later cell works on.
# The path is relative to the notebook's working directory.
df = pd.read_csv("data/breast_cancer_data.csv")
# Quick sanity check on the (rows, columns) that were loaded.
print("Dataset shape:", df.shape)
# Last expression of the cell: the first rows are rendered as cell output.
df.head()

## ℹ️ Data Info & Summary

In [None]:
# Print the column names, non-null counts and dtypes (helps spot empty or
# mistyped columns before cleaning).
df.info()
# Summary statistics; as the last expression of the cell this table is
# rendered as the cell output.
df.describe()

## 🧹 Data Cleaning

In [None]:
# Drop columns that carry no predictive signal: 'id' is a record identifier
# and 'Unnamed: 32' is presumably an empty artifact column from the CSV export
# (confirm with df.info() above).
# errors='ignore' keeps this cell safe to re-run after the columns are gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Encode the target as 1 = malignant ('M'), 0 = benign ('B').
# Only map while the column still holds the raw labels: mapping an
# already-encoded 0/1 column would turn every value into NaN on a re-run.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

## 📊 Class Distribution

In [None]:
# Bar chart of how many samples fall into each diagnosis class.
count_ax = sns.countplot(data=df, x='diagnosis')
count_ax.set_title("Diagnosis Distribution")
# Replace the encoded 0/1 tick values with readable class names.
count_ax.set_xticks([0, 1])
count_ax.set_xticklabels(['Benign', 'Malignant'])
plt.show()

## 🔗 Correlation Matrix

In [None]:
# Large canvas so the full feature-by-feature grid stays legible.
plt.figure(figsize=(15, 12))
# Pairwise correlations between all columns, drawn without cell annotations.
pairwise_corr = df.corr()
heat_ax = sns.heatmap(pairwise_corr, annot=False, cmap='coolwarm')
heat_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Correlation of every column with the diagnosis label, strongest positive
# correlation first (the 'diagnosis' row itself is 1.0 and sorts to the top).
corr_target = (
    df.corr()
    .loc[:, 'diagnosis']
    .sort_values(ascending=False)
)
# Last expression of the cell: display the ranked Series.
corr_target

## 📈 Feature Distributions

In [None]:
# Subset of "mean" measurements to inspect; one histogram is drawn per column.
features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
]
df.hist(column=features, bins=15, figsize=(15, 8))
# Figure-level title shared by all subplots.
plt.suptitle("Distribution of Selected Features")
plt.show()

## 🔍 Pairplot

In [None]:
# Scatter-matrix of a few "mean" features, coloured by diagnosis, to see which
# feature pairs separate the two classes. 'diagnosis' must be part of the
# selected columns because it is used as the hue variable.
pairplot_columns = ['diagnosis', 'radius_mean', 'texture_mean', 'area_mean', 'smoothness_mean']
sns.pairplot(df[pairplot_columns], hue='diagnosis')
# Render explicitly, like the other plot cells, so the cell does not echo the
# PairGrid object's repr as its output.
plt.show()

## 📦 Boxplot Example

In [None]:
# Compare the spread of radius_mean between the two diagnosis classes.
plt.figure(figsize=(10, 6))
sns.boxplot(x='diagnosis', y='radius_mean', data=df)
plt.title("Radius Mean by Diagnosis")
# Same readable class names as the class distribution plot instead of raw 0/1.
plt.xticks([0, 1], ['Benign', 'Malignant'])
# Render explicitly so the cell does not echo the Text object returned by the
# last plotting call.
plt.show()

## ✅ Conclusion
- Malignant tumors tend to have higher values in several features.
- Size-related features such as `radius_mean`, `perimeter_mean`, and `area_mean` are highly correlated with one another, so they carry largely redundant information.
- Class imbalance appears limited (see the class distribution plot), so logistic regression is a reasonable baseline model to start with.