In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## Load dataset

In [None]:
# Load the dataset into `df`, the DataFrame that the cells below read from.
# NOTE(review): read_csv is called here without a file path or buffer argument;
# confirm the intended data source and pass it in, otherwise nothing is loaded.
df = pd.read_csv()

## Preview

In [None]:
# Show the first rows as a quick sanity check of the loaded columns and values.
df.head()

## Structure

In [None]:
# Structural overview: column dtypes / non-null counts first, then summary statistics.
print("\nDataset info:")
df.info()

print("\nDescriptive statistics:")
describe_table = df.describe()
display(describe_table)

## Missing data

In [None]:
# Count nulls per column and keep only the columns that actually contain some.
null_counts = df.isna().sum()
missing = null_counts.loc[null_counts.gt(0)]

print("\nMissing values per column:")
if missing.empty:
    print("No missing values found.")
else:
    print(missing)

In [None]:
# Column groups reused by the cells below.
# The first two groups are fixed names; the binary ones are assumed to be yes/no coded as 0/1.
continuous_cols = ['Age', 'Pregnancies', 'Number of Children']
binary_cols = ['Family history', 'Personal history']

# The remaining groups are one-hot indicator columns, found by substring match
# on the column name (e.g. 'Pre/Peri/Post_0.0', 'Cup size_A-C').
all_columns = list(df.columns)
menopause_cols = list(filter(lambda name: 'Pre/Peri/Post' in name, all_columns))
cup_size_cols = list(filter(lambda name: 'Cup size' in name, all_columns))
diagnosis_cols = list(filter(lambda name: 'Final diagnosis' in name, all_columns))
medication_cols = list(filter(lambda name: 'Medication' in name, all_columns))

In [None]:
# Coerce every analysed column to a numeric dtype; values that cannot be parsed become NaN.
numeric_cols = (
    continuous_cols + binary_cols + menopause_cols
    + cup_size_cols + diagnosis_cols + medication_cols
)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

## 1. Basic Patient demographics

### 1.1 Age distribution

In [None]:
# Histogram of age with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='Age', bins=20, kde=True, ax=ax)
ax.set_title("Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
plt.show()

# Box plot of the same variable to make spread and outliers visible.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=df, x='Age', ax=ax)
ax.set_title("Age Boxplot")
plt.show()

### 1.2 Pregnancies 

In [None]:
# Histogram of the number of pregnancies (plain counts, no KDE).
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='Pregnancies', bins=10, kde=False, ax=ax)
ax.set_title("Pregnancies Distribution")
ax.set_xlabel("Number of Pregnancies")
ax.set_ylabel("Count")
plt.show()

### 1.3 Number of children

In [None]:
# Histogram of the number of children (plain counts, no KDE).
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='Number of Children', bins=10, kde=False, ax=ax)
ax.set_title("Number of Children Distribution")
ax.set_xlabel("Number of Children")
ax.set_ylabel("Count")
plt.show()

### 1.4 Family history

In [None]:
# Bar chart of family-history counts; tick positions 0 and 1 are relabelled No / Yes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Family history', ax=ax)
ax.set_title("Family History Distribution")
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
ax.set_ylabel("Count")
plt.show()

### 1.5 Personal history

In [None]:
# Bar chart of personal-history counts; tick positions 0 and 1 are relabelled No / Yes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Personal history', ax=ax)
ax.set_title("Personal History Distribution")
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
ax.set_ylabel("Count")
plt.show()

### 1.6 Menopause status 

In [None]:
# Column-name suffix -> readable menopause stage. Adjust the keys to the actual suffixes.
menopause_map = {"0": "Pre", "1": "Peri", "2": "Post"}

# Summing each one-hot indicator column gives the number of patients in that stage.
counts = df[menopause_cols].sum()

# Translate each column name into its stage label; unknown suffixes keep the raw column name.
labels = []
for col_name in counts.index:
    suffix = col_name.split("_")[-1].replace(".0", "")
    labels.append(menopause_map.get(suffix, col_name))

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(labels, counts.values)
ax.set_title("Menopause Status Distribution")
ax.set_ylabel("Count")
plt.show()

### 1.7 Cup size 

In [None]:
# Placeholder for an optional relabelling, e.g. {"A-C": "A-C", "D-F": "D-F", ">F": ">F"}.
cup_map = None

# Summing each one-hot indicator column gives the number of patients per cup-size group.
counts = df[cup_size_cols].sum()

# The text after the last underscore in the column name is used as the bar label.
labels = []
for col_name in counts.index:
    labels.append(col_name.split("_")[-1])

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(labels, counts.values)
ax.set_title("Cup Size Distribution")
ax.set_ylabel("Count")
plt.show()

## 2. Diagnosis Overview

### 2.1 Count of each diagnosis

In [None]:
# Number of patients per diagnosis, taken from the one-hot diagnosis indicators.
counts = df[diagnosis_cols].sum()

# Strip the shared column prefix so only the diagnosis name is shown on the axis.
labels = []
for col_name in counts.index:
    labels.append(col_name.replace("Final diagnosis_", ""))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, counts.values)
ax.set_title("Count of Each Diagnosis")
ax.set_ylabel("Count")
# Rotate and right-align the tick labels so long diagnosis names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

### 2.2 Benign vs malignant

In [None]:
# Split the one-hot diagnosis columns into malignant and benign groups by keyword
# (case-sensitive substring match on the column name).
malignant_cols = [c for c in diagnosis_cols if any(k in c for k in ['IDC', 'DCIS', 'other carcinoma'])]
benign_cols = [c for c in diagnosis_cols if any(k in c for k in ['fibroadenoma', 'cyst', 'negative'])]

# A row is flagged 1 when any indicator in its group is set.
# The row-wise max is NaN when every indicator in the row is NaN (the earlier
# numeric coercion can introduce NaN), and astype(int) raises on NaN, so
# missing is treated as "not flagged".
df['Malignant'] = df[malignant_cols].max(axis=1).fillna(0).astype(int)
# Note: the two flags are assumed to be mutually exclusive; this is not checked here.
df['Benign'] = df[benign_cols].max(axis=1).fillna(0).astype(int)

In [None]:
# Pie chart comparing the number of rows flagged Benign with the number flagged Malignant.
fig, ax = plt.subplots(figsize=(6, 6))
class_totals = [df['Benign'].sum(), df['Malignant'].sum()]
ax.pie(class_totals, labels=['Benign', 'Malignant'], autopct='%1.1f%%')
ax.set_title("Benign vs Malignant")
plt.show()

In [None]:
# Row counts per value of the Malignant flag; tick positions 0 / 1 are relabelled.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Malignant', ax=ax)
ax.set_title("Benign vs Malignant Distribution")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Benign', 'Malignant'])
ax.set_ylabel("Count")
plt.show()

## 3. Clinical Risk factors vs Diagnosis

In [None]:
# Collapse the one-hot diagnosis indicators into a single multi-class 'Diagnosis'
# column for plotting.
# Missing indicators count as "not set", so idxmax never sees an all-NaN row.
diagnosis_flags = df[diagnosis_cols].fillna(0)

# idxmax returns the FIRST column for a row where nothing is set, which would
# silently assign that diagnosis to patients without one. Such rows are left as
# NaN instead; crosstab and the seaborn plots below skip NaN categories.
has_diagnosis = diagnosis_flags.gt(0).any(axis=1)

df['Diagnosis'] = (
    diagnosis_flags.idxmax(axis=1)
    .str.replace("Final diagnosis_", "", regex=False)  # keep only the diagnosis name
    .where(has_diagnosis)
)

### 3.1 Age vs diagnosis

In [None]:
# Age distribution per diagnosis class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Diagnosis', y='Age', ax=ax)
ax.set_title("Age vs Diagnosis")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Age distribution split by the binary Malignant flag.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x='Malignant', y='Age', ax=ax)
ax.set_title("Age by Malignant / Benign")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Benign', 'Malignant'])
ax.set_ylabel("Age")
plt.show()

### 3.2 Family history vs diagnosis

In [None]:
# Share of each family-history value within every diagnosis (each row sums to 1).
ct = pd.crosstab(index=df['Diagnosis'], columns=df['Family history'], normalize='index')
ax = ct.plot.bar(stacked=False)
ax.set_title("Family History vs Diagnosis")
ax.set_ylabel("Proportion")
plt.show()

In [None]:
# Share of each family-history value within the two Malignant-flag groups (rows sum to 1).
ct = pd.crosstab(index=df['Malignant'], columns=df['Family history'], normalize='index')
# Replace the 0 / 1 row labels with readable class names.
ct = ct.set_axis(['Benign', 'Malignant'], axis=0)
ax = ct.plot.bar(stacked=False, figsize=(7, 5))
ax.set_title("Family History by Malignant / Benign")
ax.set_ylabel("Proportion")
ax.set_xlabel("")
ax.legend(title='Family history', labels=['No', 'Yes'])
plt.show()

# Alternative view: mean of the Malignant flag (i.e. proportion malignant) per family-history value.
fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(data=df, x='Family history', y='Malignant', estimator='mean', ax=ax)
ax.set_title("Probability of Malignant by Family History")
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
ax.set_ylabel("Proportion Malignant")
plt.show()

### 3.3 Menopause vs diagnosis

In [None]:
# Collapse the one-hot menopause indicators into a single labelled 'Menopause' column.
# Missing indicators count as "not set", so idxmax never sees an all-NaN row.
menopause_flags = df[menopause_cols].fillna(0)

# idxmax returns the FIRST column for a row where nothing is set, which would
# silently assign that stage to patients with no recorded status. Such rows are
# left as NaN instead; crosstab skips NaN categories.
has_status = menopause_flags.gt(0).any(axis=1)

df['Menopause'] = (
    menopause_flags.idxmax(axis=1)
    # Column-name suffix -> stage label; unknown suffixes keep the raw column name.
    .map(lambda name: menopause_map.get(name.split("_")[-1].replace(".0", ""), name))
    .where(has_status)
)

# Diagnosis mix within each menopause stage (each row sums to 1).
ct = pd.crosstab(df['Menopause'], df['Diagnosis'], normalize='index')
ax = ct.plot(kind='bar', stacked=True)
ax.set_title("Menopause vs Diagnosis")
ax.set_ylabel("Proportion")
plt.show()

In [None]:
# Share of each Malignant-flag value within every menopause stage (each row sums to 1).
ct = pd.crosstab(index=df['Menopause'], columns=df['Malignant'], normalize='index')
ax = ct.plot.bar(stacked=True, figsize=(8, 5))
ax.set_title("Menopause Status by Malignant / Benign")
ax.set_ylabel("Proportion")
ax.legend(title='Diagnosis', labels=['Benign', 'Malignant'])
# Keep the stage names horizontal.
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

### 3.4 Cup size vs diagnosis

In [None]:
# Collapse the one-hot cup-size indicators into a single labelled 'Cup_size' column.
# Missing indicators count as "not set", so idxmax never sees an all-NaN row.
cup_size_flags = df[cup_size_cols].fillna(0)

# idxmax returns the FIRST column for a row where nothing is set, which would
# silently assign that group to patients with no recorded cup size. Such rows
# are left as NaN instead; crosstab skips NaN categories.
has_cup_size = cup_size_flags.gt(0).any(axis=1)

df['Cup_size'] = (
    cup_size_flags.idxmax(axis=1)
    .map(lambda name: name.split("_")[-1])  # text after the last underscore is the label
    .where(has_cup_size)
)

# Diagnosis mix within each cup-size group (each row sums to 1).
ct = pd.crosstab(df['Cup_size'], df['Diagnosis'], normalize='index')
ax = ct.plot(kind='bar', stacked=True)
ax.set_title("Cup Size vs Diagnosis")
ax.set_ylabel("Proportion")
plt.show()

In [None]:
# Share of each Malignant-flag value within every cup-size group (each row sums to 1).
ct = pd.crosstab(index=df['Cup_size'], columns=df['Malignant'], normalize='index')
ax = ct.plot.bar(stacked=True, figsize=(8, 5))
ax.set_title("Cup Size by Malignant / Benign")
ax.set_ylabel("Proportion")
ax.legend(title='Diagnosis', labels=['Benign', 'Malignant'])
# Keep the group names horizontal.
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

### 3.5 Pregnancies vs diagnosis

In [None]:
# Distribution of the number of pregnancies per diagnosis class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Diagnosis', y='Pregnancies', ax=ax)
ax.set_title("Pregnancies vs Diagnosis")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Distribution of the number of pregnancies split by the binary Malignant flag.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x='Malignant', y='Pregnancies', ax=ax)
ax.set_title("Number of Pregnancies by Malignant / Benign")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Benign', 'Malignant'])
ax.set_ylabel("Pregnancies")
plt.show()

### 3.6 Number of children vs diagnosis

In [None]:
# Distribution of the number of children per diagnosis class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Diagnosis', y='Number of Children', ax=ax)
ax.set_title("Number of Children vs Diagnosis")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Distribution of the number of children split by the binary Malignant flag.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x='Malignant', y='Number of Children', ax=ax)
ax.set_title("Number of Children by Malignant / Benign")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Benign', 'Malignant'])
ax.set_ylabel("Number of Children")
plt.show()

## 4. Correlation Analysis

In [None]:
# Pairwise correlation (pandas default method) across the continuous variables
# and every one-hot indicator group.
corr_cols = [
    *continuous_cols,
    *menopause_cols,
    *medication_cols,
    *cup_size_cols,
    *diagnosis_cols,
]
corr = df.loc[:, corr_cols].corr()

In [None]:
# Heatmap of the full correlation matrix; the diverging palette is centred on zero.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', center=0, annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Focused Spearman correlation over the key clinical variables.
#
# The menopause indicators come from `menopause_cols` (discovered from the data
# in the column-group cell) instead of hard-coded column names, so this cell does
# not raise a KeyError when the actual one-hot suffixes differ from a guess.
focus_cols = continuous_cols + menopause_cols + ['Malignant'] + binary_cols

# Work on a copy and force numeric dtypes; unparseable values become NaN.
df_focus = df[focus_cols].copy()
df_focus = df_focus.apply(pd.to_numeric, errors='coerce')

# Rank-based (Spearman) correlation between every pair of selected columns.
corr_spearman = df_focus.corr(method='spearman')

# Hide the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle carries all the information with less clutter.
mask = np.triu(np.ones_like(corr_spearman, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_spearman,
    mask=mask,
    annot=True,                     # print the coefficient in each cell
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1, vmax=1,                # fix the colour scale to the full correlation range
    center=0,
    linewidths=0.5,
    cbar_kws={'label': 'Spearman ρ'},
    annot_kws={'size': 10},
    ax=ax,
)

ax.set_title("Focused Spearman Correlation – Key Clinical Variables\n", fontsize=14, pad=20)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
plt.setp(ax.get_yticklabels(), fontsize=10)
fig.tight_layout()

plt.show()

## 5. Medication Analysis 

### 5.1 Medication vs diagnosis

In [None]:
# Collapse the one-hot medication indicators into a single labelled 'Medication' column.
# Missing indicators count as "not set", so idxmax never sees an all-NaN row.
medication_flags = df[medication_cols].fillna(0)

# idxmax returns the FIRST column for a row where nothing is set, which would
# silently assign that medication to patients with no indicator set. Such rows
# are left as NaN instead; crosstab, pivot_table and seaborn skip NaN categories.
has_medication_flag = medication_flags.gt(0).any(axis=1)

df['Medication'] = (
    medication_flags.idxmax(axis=1)
    # Text after the last underscore, without a trailing ".0", is the label.
    .map(lambda name: name.split("_")[-1].replace(".0", ""))
    .where(has_medication_flag)
)

# Diagnosis mix within each medication group (each row sums to 1).
ct = pd.crosstab(df['Medication'], df['Diagnosis'], normalize='index')
ax = ct.plot(kind='bar', stacked=False)
ax.set_title("Medication vs Diagnosis")
ax.set_ylabel("Proportion")
plt.show()

### 5.2 Medication vs Age

In [None]:
# Age distribution per medication group.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Medication', y='Age', ax=ax)
ax.set_title("Medication vs Age")
plt.show()

In [None]:
# Mean of the Malignant flag for every (medication, menopause stage) combination.
pivot = pd.pivot_table(
    df,
    index='Medication',
    columns='Menopause',
    values='Malignant',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot, cmap='YlGnBu', annot=True, ax=ax)
ax.set_title("Mean Malignant Probability by Medication and Menopause")
plt.show()

## 6. Summary

In [None]:
# Summary table: one formatted string per characteristic in a single 'Overall' column.
# Rows are collected in a dict (insertion order = row order) and the DataFrame is
# built once, instead of enlarging an empty DataFrame cell by cell with .loc.
summary_rows = {}

# Age (mean ± SD)
summary_rows['Age'] = f"{df['Age'].mean():.1f} ± {df['Age'].std():.1f}"

# Pregnancies and number of children (median, IQR)
for col in ['Pregnancies', 'Number of Children']:
    q1, q3 = df[col].quantile([0.25, 0.75])
    summary_rows[col] = f"{df[col].median():.0f} ({q1:.0f}-{q3:.0f})"

# Family / personal history (% yes): column mean times 100, assuming 0/1 coding
for col in ['Family history', 'Personal history']:
    summary_rows[f'{col} (% yes)'] = f"{df[col].mean() * 100:.1f}%"

# Menopause distribution (%): suffix of the indicator column mapped to a stage label
for col in menopause_cols:
    label = menopause_map.get(col.split("_")[-1].replace(".0", ""), col)
    summary_rows[f'Menopause {label} (%)'] = f"{df[col].mean() * 100:.1f}%"

# Cup size distribution (%)
for col in cup_size_cols:
    label = col.split("_")[-1]
    summary_rows[f'Cup size {label} (%)'] = f"{df[col].mean() * 100:.1f}%"

# Diagnosis distribution (%)
for col in diagnosis_cols:
    label = col.replace("Final diagnosis_", "")
    summary_rows[f'Diagnosis {label} (%)'] = f"{df[col].mean() * 100:.1f}%"

summary = pd.Series(summary_rows, name='Overall').to_frame()

print("\nSummary Table:")
display(summary)