# Breast Cancer Detection: Data Preparation & Exploratory Data Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults applied to the figures created below.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("✓ Libraries imported successfully")

1. Business Understanding

2. Data Understanding

In [None]:
# Load the Wisconsin Diagnostic Breast Cancer dataset from the Kaggle input
# directory into a DataFrame.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

print(f"Dataset shape: {df.shape}")
print(f"Total samples: {df.shape[0]}")
print(f"Total features: {df.shape[1]}")

# Preview the first rows. The row count is defined once so the printed
# heading and the number of rows actually shown cannot drift apart.
n_preview_rows = 5
print(f"\nFirst {n_preview_rows} rows of the dataset:")
print(df.head(n_preview_rows))

In [None]:
# Report the columns that hold no data at all (every entry is NaN).
print("\nColumns with all NaN values:")
is_all_nan = df.isna().all(axis=0)
nan_cols = is_all_nan[is_all_nan].index.tolist()
print(nan_cols or "None")

In [None]:
# MISSING VALUE AUDIT

# Per-column count of missing entries, and that count as a percentage of rows.
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Tabulate the result, keep only columns that actually have gaps, and list
# the worst offenders first.
missing_df = (
    pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.to_numpy(),
        'Percentage': missing_percentage.to_numpy(),
    })
    .loc[lambda table: table['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

if missing_df.empty:
    print("\n✓ No missing values found in the dataset")
else:
    print("\nColumns with missing values:")
    print(missing_df)

In [None]:
# DATASET STRUCTURE

# Number of columns per dtype.
dtype_counts = df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

# df.info() writes its own report to stdout, so it is not wrapped in print().
print("\nDetailed information:")
df.info()

In [None]:
# TARGET VARIABLE ANALYSIS

# Encoding used for the target: M (Malignant) = 1, B (Benign) = 0.
label_to_code = {'M': 1, 'B': 0}
code_to_label = {code: label for label, code in label_to_code.items()}

# Always work from the 'B'/'M' labels. If this cell has already been run, the
# column holds 0/1 codes; translate them back first, because pushing the codes
# through label_to_code a second time would turn every value into NaN and the
# 'B'/'M' lookups below would raise KeyError.
if pd.api.types.is_numeric_dtype(df['diagnosis']):
    diagnosis_raw = df['diagnosis'].map(code_to_label)
else:
    diagnosis_raw = df['diagnosis']

# Check diagnosis distribution (absolute counts and percentages per label)
diagnosis_counts = diagnosis_raw.value_counts()
diagnosis_percentages = diagnosis_raw.value_counts(normalize=True) * 100

print("\nDiagnosis Distribution:")
print(f"Benign (B): {diagnosis_counts['B']} samples ({diagnosis_percentages['B']:.2f}%)")
print(f"Malignant (M): {diagnosis_counts['M']} samples ({diagnosis_percentages['M']:.2f}%)")

# Encode target variable: M (Malignant) = 1, B (Benign) = 0
df['diagnosis'] = diagnosis_raw.map(label_to_code)
print("\n✓ Target variable encoded: M=1 (Malignant), B=0 (Benign)")

# Visualize class distribution: bar chart of counts next to a pie chart of shares
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

diagnosis_labels = ['Benign', 'Malignant']
diagnosis_values = [diagnosis_counts['B'], diagnosis_counts['M']]
colors = ['#2ecc71', '#e74c3c']  # green = benign, red = malignant (shared by both charts)

# Count plot
axes[0].bar(diagnosis_labels, diagnosis_values, color=colors)
axes[0].set_ylabel('Count')
axes[0].set_title('Class Distribution (Count)', fontsize=14, fontweight='bold')
# Write each count just above its bar.
for i, v in enumerate(diagnosis_values):
    axes[0].text(i, v + 5, str(v), ha='center', va='bottom', fontweight='bold')

# Pie chart
axes[1].pie(diagnosis_values, labels=diagnosis_labels, autopct='%1.1f%%',
            startangle=90, colors=colors, textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved as 'class_distribution.png'")

In [None]:
# FEATURE CATEGORIZATION


def _columns_ending_with(suffix):
    """Return the names of the df columns that end with *suffix*, in column order."""
    return [name for name in df.columns if name.endswith(suffix)]


# Group the measurement columns by their suffix, as in the paper.
mean_features = _columns_ending_with('_mean')
se_features = _columns_ending_with('_se')
worst_features = _columns_ending_with('_worst')

print(f"\nMean features (10): {len(mean_features)}")
print(mean_features)
print(f"\nStandard Error features (10): {len(se_features)}")
print(se_features)
print(f"\nWorst features (10): {len(worst_features)}")
print(worst_features)

In [None]:
# STATISTICAL SUMMARY

# Split the rows by encoded diagnosis (0 = benign, 1 = malignant).
benign = df.loc[df['diagnosis'].eq(0)]
malignant = df.loc[df['diagnosis'].eq(1)]

print("\nOverall Statistics:")
print(df.describe())

# Same descriptive statistics, once per class.
for heading, subset in (
    ("Statistics for Benign Cases:", benign),
    ("Statistics for Malignant Cases:", malignant),
):
    print(heading)
    print(subset.describe())

In [None]:
# ============================================================================
#  EXPLORATORY DATA ANALYSIS
# ============================================================================
print("\n" + "="*80)
print("EXPLORATORY DATA ANALYSIS")
print("="*80)


def _feature_grid(n_plots, n_cols=2, row_height=3.6, width=15):
    """Create a subplot grid with one axis per feature.

    Returns the figure and a flat array holding exactly ``n_plots`` axes.
    The number of rows is derived from ``n_plots`` so the grid never runs out
    of axes; unused cells in the last row are hidden. Ten features give the
    5x2 grid at 15x18 inches.
    """
    n_rows = max(1, -(-n_plots // n_cols))  # ceiling division without math.ceil
    figure, grid = plt.subplots(n_rows, n_cols, figsize=(width, row_height * n_rows),
                                squeeze=False)
    flat_axes = grid.ravel()
    for spare in flat_axes[n_plots:]:
        spare.set_visible(False)
    return figure, flat_axes[:n_plots]


# 1 Distribution of Mean Features: overlaid histograms per class
print("\n[1/5] Creating distribution plots for mean features...")
fig, axes = _feature_grid(len(mean_features))

for ax, col in zip(axes, mean_features):
    ax.hist(benign[col], bins=30, alpha=0.6, label='Benign', color='#2ecc71', edgecolor='black')
    ax.hist(malignant[col], bins=30, alpha=0.6, label='Malignant', color='#e74c3c', edgecolor='black')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.set_title(f'Distribution of {col}', fontsize=11, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('mean_features_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Saved as 'mean_features_distribution.png'")

# 2 Box plots for mean features, grouped by diagnosis
print("\n[2/5] Creating box plots for mean features...")
fig, axes = _feature_grid(len(mean_features))

for ax, col in zip(axes, mean_features):
    df.boxplot(column=col, by='diagnosis', ax=ax)
    ax.set_xlabel('Diagnosis (0=Benign, 1=Malignant)', fontsize=10)
    ax.set_ylabel(col, fontsize=10)
    ax.set_title(f'Box Plot: {col}', fontsize=11, fontweight='bold')

# Each grouped boxplot call sets a figure-level title; clear it once at the end.
fig.suptitle('')

plt.tight_layout()
plt.savefig('mean_features_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Saved as 'mean_features_boxplot.png'")

# 3 Correlation Matrix
print("\n[3/5] Creating correlation matrix...")
# Correlations are only meaningful for real measurements: leave out the 'id'
# identifier and any column that is entirely NaN, both of which may still be
# present in df at this point. The target is excluded from the matrix itself.
analysis_df = df.drop(columns=['id'], errors='ignore').dropna(axis=1, how='all')
numeric_features = analysis_df.drop('diagnosis', axis=1)

plt.figure(figsize=(20, 16))
correlation_matrix = numeric_features.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
            linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix - All Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_matrix_full.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Saved as 'correlation_matrix_full.png'")

# 4 Correlation with diagnosis
print("\n[4/5] Analyzing correlation with diagnosis...")
# Sorted from most positive to most negative; undefined (NaN) correlations
# are dropped so they cannot appear in the ranking.
correlations_with_diagnosis = (
    analysis_df.corr()['diagnosis']
    .drop('diagnosis')
    .dropna()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 10))
correlations_with_diagnosis.plot(kind='barh', color='steelblue')
plt.xlabel('Correlation with Diagnosis', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.title('Feature Correlation with Diagnosis\n(Positive = Malignant, Negative = Benign)',
          fontsize=14, fontweight='bold')
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('correlation_with_diagnosis.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Saved as 'correlation_with_diagnosis.png'")

print("\nTop 10 features most correlated with Malignant diagnosis:")
print(correlations_with_diagnosis.head(10))

# 5 Scatter plots of each mean feature against the sample index
print("\n[5/5] Creating scatter plots for feature categories...")

# Mean features scatter plot
fig, axes = _feature_grid(len(mean_features))

for ax, col in zip(axes, mean_features):
    ax.scatter(benign[col], benign.index, alpha=0.5, s=10, c='#2ecc71', label='Benign')
    ax.scatter(malignant[col], malignant.index, alpha=0.5, s=10, c='#e74c3c', label='Malignant')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Sample Index', fontsize=10)
    ax.set_title(f'Scatter: {col}', fontsize=11, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('Scatter Plot of Mean Features (Replicating Paper Figure 3)',
             fontsize=14, fontweight='bold', y=1.001)
plt.tight_layout()
plt.savefig('scatter_mean_features.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Saved as 'scatter_mean_features.png'")



**3. Data Preparation**

In [None]:

# Drop columns that carry no signal for classification: the 'id' identifier
# and the 'Unnamed: 32' column (common in this dataset). Each is removed only
# if it is present, and its confirmation message is printed only in that case.
for column_name, notice in (
    ('id', "\n✓ 'id' column removed"),
    ('Unnamed: 32', "✓ 'Unnamed: 32' column removed"),
):
    if column_name in df.columns:
        df = df.drop(columns=column_name)
        print(notice)

print(f"\nDataset shape after cleaning: {df.shape}")

In [None]:
# 10. DATA NORMALIZATION
# ============================================================================
banner = "=" * 80
print("\n" + banner)
print("DATA NORMALIZATION")
print(banner)

# Split the frame into the target column and the feature matrix.
y = df['diagnosis']
X = df.drop(columns='diagnosis')

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Standardize features using StandardScaler (as mentioned in the paper).
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/test separation has taken place.
scaler = StandardScaler().fit(X)
X_normalized = scaler.transform(X)

# Wrap the scaled array in a DataFrame so the column names are kept.
X_normalized_df = pd.DataFrame(data=X_normalized, columns=X.columns)

print("\n✓ Features normalized using StandardScaler (mean=0, std=1)")
print("\nNormalized data statistics:")
print(X_normalized_df.describe())

In [None]:
# 11. TRAIN-TEST SPLIT
# ============================================================================
print("\n" + "="*80)
print("TRAIN-TEST SPLIT")
print("="*80)

# Split the raw (unscaled) features: 80% train, 20% test (the paper used
# 70/30). Stratifying on y keeps the class ratio the same in both splits, and
# the fixed random_state makes the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Scaling all rows before splitting would let test-set means and
# variances leak into the training features and into the saved scaler.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"\nTraining set size: {X_train.shape[0]} samples ({(X_train.shape[0]/len(df))*100:.1f}%)")
print(f"Testing set size: {X_test.shape[0]} samples ({(X_test.shape[0]/len(df))*100:.1f}%)")

print(f"\nTraining set class distribution:")
print(f"  Benign: {(y_train == 0).sum()} samples")
print(f"  Malignant: {(y_train == 1).sum()} samples")

print(f"\nTesting set class distribution:")
print(f"  Benign: {(y_test == 0).sum()} samples")
print(f"  Malignant: {(y_test == 1).sum()} samples")

In [None]:
# 12. SAVE PROCESSED DATA
# ============================================================================
print("\n" + "="*80)
print("SAVING PROCESSED DATA")
print("="*80)

# Persist each split as a NumPy .npy file in the working directory.
split_arrays = {
    'X_train.npy': X_train,
    'X_test.npy': X_test,
    'y_train.npy': y_train,
    'y_test.npy': y_test,
}
for filename, array in split_arrays.items():
    np.save(filename, array)

print("\n✓ Data saved successfully:")
for filename in split_arrays:
    print(f"  - {filename}")

# Save feature names, one per line, in feature-matrix column order.
feature_names = list(X.columns)
with open('feature_names.txt', 'w') as f:
    f.writelines(f"{feature}\n" for feature in feature_names)
print("  - feature_names.txt")

# Save scaler for future use (pickled so the fitted statistics can be reloaded).
import pickle
scaler_bytes = pickle.dumps(scaler)
with open('scaler.pkl', 'wb') as f:
    f.write(scaler_bytes)
print("  - scaler.pkl")


In [None]:
# 13. SUMMARY REPORT
# ============================================================================
print("\n" + "="*80)
print("SUMMARY REPORT")
print("="*80)


def _format_correlations(series):
    """Render a correlation Series as indented bullet lines, one per feature."""
    return "\n".join(f'     • {feat}: {value:.3f}' for feat, value in series.items())


# Rank the features by correlation with the diagnosis. The identifier column,
# the empty 'Unnamed: 32' column and undefined (NaN) correlations are excluded
# so they cannot show up in either list.
ranked_correlations = (
    correlations_with_diagnosis
    .drop(labels=['id', 'Unnamed: 32'], errors='ignore')
    .dropna()
    .sort_values(ascending=False)
)
most_correlated = ranked_correlations.head(5)
# "Least correlated" means closest to zero regardless of sign, so rank by
# absolute value but report the signed coefficients.
least_correlated = ranked_correlations.loc[ranked_correlations.abs().sort_values().index[:5]]

most_correlated_lines = _format_correlations(most_correlated)
least_correlated_lines = _format_correlations(least_correlated)

# Count missing values on the current (cleaned) frame rather than reusing a
# table that was computed before the empty column was removed.
columns_with_missing = int(df.isna().any().sum())

summary_report = f"""
WISCONSIN DIAGNOSTIC BREAST CANCER DATASET - EDA REPORT
{'='*80}

1. DATASET OVERVIEW
   - Total samples: {df.shape[0]}
   - Total features: {df.shape[1] - 1} (after removing ID)
   - Classes: Binary (Benign=0, Malignant=1)

2. CLASS DISTRIBUTION
   - Benign (B): {diagnosis_counts['B']} samples ({diagnosis_percentages['B']:.2f}%)
   - Malignant (M): {diagnosis_counts['M']} samples ({diagnosis_percentages['M']:.2f}%)
   - Balance: {'Relatively balanced' if abs(diagnosis_percentages['B'] - diagnosis_percentages['M']) < 20 else 'Imbalanced'}

3. FEATURE CATEGORIES
   - Mean features: {len(mean_features)}
   - Standard Error features: {len(se_features)}
   - Worst features: {len(worst_features)}
   - Total: {len(mean_features) + len(se_features) + len(worst_features)} features

4. DATA QUALITY
   - Missing values: {'None' if columns_with_missing == 0 else f'{columns_with_missing} columns'}
   - Duplicates: {df.duplicated().sum()}
   - Data types: All numeric (after encoding)

5. DATA PREPROCESSING
   - Normalization: StandardScaler (mean=0, std=1)
   - Train-Test Split: {(X_train.shape[0]/len(df))*100:.0f}% / {(X_test.shape[0]/len(df))*100:.0f}%
   - Random State: 42 (for reproducibility)

6. KEY FINDINGS
   - Most correlated features with Malignant diagnosis:
{most_correlated_lines}
   
   - Least correlated features with Malignant diagnosis:
{least_correlated_lines}

7. LINEAR SEPARABILITY
   - The dataset appears to be linearly separable based on visualization
   - This aligns with the paper's findings that linear classifiers performed well
   - Mean features show clear separation between benign and malignant cases

8. FILES GENERATED
   - Training data: X_train.npy, y_train.npy
   - Testing data: X_test.npy, y_test.npy
   - Scaler: scaler.pkl
   - Feature names: feature_names.txt
   - Visualizations: 
     • class_distribution.png
     • mean_features_distribution.png
     • mean_features_boxplot.png
     • correlation_matrix_full.png
     • correlation_with_diagnosis.png
     • scatter_mean_features.png

9. NEXT STEPS (FOR OTHER TEAM MEMBERS)
   - Person 2-6: Load the processed data using:
     ```python
     X_train = np.load('X_train.npy')
     X_test = np.load('X_test.npy')
     y_train = np.load('y_train.npy')
     y_test = np.load('y_test.npy')
     ```
   - Implement ML algorithms: Linear Regression, MLP, Nearest Neighbor, 
     Softmax Regression, SVM, GRU-SVM
   - Target: >90% test accuracy (paper achieved ~99% with MLP)

{'='*80}
Report generated successfully!
"""

print(summary_report)

# Save report to file (UTF-8 because the report contains bullet characters)
with open('EDA_REPORT.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print("\n✓ Complete EDA report saved as 'EDA_REPORT.txt'")

print("\n" + "="*80)
print("DATA PREPARATION & EDA COMPLETED SUCCESSFULLY!")
print("="*80)
print("\nYou can now share the processed data files with your team members.")
print("All visualizations and the detailed report are ready for your project submission.")

# Modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# X_train / X_test arrive from the preprocessing step already standardized;
# here they are re-centred using statistics of the training rows only. A
# separate name is used so that the `scaler` object fitted (and pickled)
# during preprocessing is not overwritten by one fitted on scaled data.
ann_scaler = StandardScaler()
X_train = ann_scaler.fit_transform(X_train)
X_test = ann_scaler.transform(X_test)

# Feed-forward binary classifier: two ReLU hidden layers (32 and 16 units)
# and one sigmoid output unit. The input width is declared with an explicit
# Input layer rather than an `input_shape` argument on the first Dense layer.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
], name='ANN_Model')


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the tracked metric.
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])


In [None]:
# Train for 50 epochs in mini-batches of 16, holding out 20% of the training
# data for validation; per-epoch metrics are kept in `history`.
fit_options = dict(validation_split=0.2, epochs=50, batch_size=16, verbose=1)
history = model.fit(X_train, y_train, **fit_options)

# Evaluation

In [None]:
# Print Keras' summary of the model's layers.
model.summary()


In [None]:
# Score the trained model on the held-out test split; with the single
# 'accuracy' metric compiled in, evaluate() yields the pair (loss, accuracy).
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
print(f"Test Accuracy: {accuracy*100:.2f}%")


In [None]:

# Plot the training and validation curves, one figure per tracked quantity.
for metric_key, metric_label in (('loss', 'Loss'), ('accuracy', 'Accuracy')):
    plt.plot(history.history[metric_key], label=f'Training {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'{metric_label} over epochs')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels, then
# flatten them to one dimension so they line up element-wise with y_test.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Pin the label order explicitly so the matrix is always 2x2 with row/column
# 0 = benign and 1 = malignant, matching the tick labels below even if one
# class happens to be absent from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

class_names = ['Benign (non-cancerous)', 'Malignant (cancerous)']
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix for ANN Model')
plt.show()
