# Breast Cancer Prediction System - Model Development
## Student: Onipede Toluwani
## Matric No: 22CG031936

This notebook implements a machine learning model to predict whether a breast tumor is benign or malignant.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

## 2. Load the Breast Cancer Wisconsin Dataset

In [None]:
# Load the Breast Cancer Wisconsin dataset bundled with scikit-learn
data = load_breast_cancer()

# One column per feature (named from data.feature_names), plus the class
# label stored under 'diagnosis'
df = pd.DataFrame(data.data, columns=data.feature_names)
df['diagnosis'] = data.target

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

## 3. Data Preprocessing

In [None]:
# Count of missing entries in each column
missing_per_column = df.isnull().sum()
print("Missing values:")
print(missing_per_column)

# How many rows fall into each target class
class_counts = df['diagnosis'].value_counts()
print("\nTarget Distribution:")
print(class_counts)
print("\n0 = Malignant, 1 = Benign")

## 4. Feature Selection
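Selecting 5 features from the recommended list (the matching scikit-learn column name used in the code is shown in parentheses):
- radius_mean (`mean radius`)
- texture_mean (`mean texture`)
- perimeter_mean (`mean perimeter`)
- area_mean (`mean area`)
- concavity_mean (`mean concavity`)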

In [None]:
# The five predictor columns, spelled as they appear in the DataFrame
selected_features = [
    'mean radius',
    'mean texture',
    'mean perimeter',
    'mean area',
    'mean concavity',
]

# Feature matrix (selected columns only) and target vector
X, y = df[selected_features], df['diagnosis']

print("Selected Features:")
print(X.head())
print("\nFeature Statistics:")
print(X.describe())

## 5. Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

## 6. Feature Scaling
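Feature scaling is mandatory for distance-based models and also improves performance for Logistic Regression. The scaler is fitted on the training set only and then applied to the test set.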

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled Training Data (first 5 rows):")
print(X_train_scaled[:5])

## 7. Model Training - Logistic Regression
Using Logistic Regression as the machine learning algorithm

In [None]:
# Build the classifier and fit it on the scaled training features in one step
# (fit() returns the estimator itself). max_iter caps the solver iterations.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

print("Model trained successfully!")
print(f"Model coefficients: {model.coef_}")
print(f"Model intercept: {model.intercept_}")

## 8. Model Evaluation

In [None]:
# Predicted labels for the held-out test split
y_pred = model.predict(X_test_scaled)

# Score the predictions. pos_label=1 is spelled out so it is clear that
# precision/recall/F1 treat label 1 (Benign) as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

banner = "=" * 50
print(banner)
print("MODEL EVALUATION METRICS")
print(banner)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall:    {recall:.4f} ({recall*100:.2f}%)")
print(f"F1-Score:  {f1:.4f} ({f1*100:.2f}%)")
print(banner)

In [None]:
# Per-class breakdown; display names are given in label order (0, then 1)
class_names = ['Malignant', 'Benign']
report = classification_report(y_test, y_pred, target_names=class_names)
print("\nDetailed Classification Report:")
print(report)

In [None]:
# Confusion matrix of actual vs. predicted labels on the test split
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Unpack the 2x2 matrix row by row: first row is cm[0], second row is cm[1]
(tn, fp), (fn, tp) = cm

print("\nInterpretation:")
print(f"True Negatives (Malignant correctly predicted): {tn}")
print(f"False Positives (Malignant predicted as Benign): {fp}")
print(f"False Negatives (Benign predicted as Malignant): {fn}")
print(f"True Positives (Benign correctly predicted): {tp}")

## 9. Save the Trained Model
Using Joblib for model persistence

In [None]:
# Everything needed to predict later without retraining: the classifier, the
# fitted scaler, and the list of feature names. Each entry is
# (object to persist, target file, label used in the confirmation message).
artifacts = (
    (model, 'breast_cancer_model.pkl', "Model"),
    (scaler, 'scaler.pkl', "Scaler"),
    (selected_features, 'selected_features.pkl', "Selected features"),
)

# Write all files first, then confirm, so no message is printed if a dump fails
for artifact, filename, _ in artifacts:
    joblib.dump(artifact, filename)

for _, filename, label in artifacts:
    print(f"{label} saved successfully as '{filename}'")

## 10. Demonstrate Model Reloading and Prediction
Reload the saved model and make predictions without retraining

In [None]:
# Read the three persisted artifacts back from disk, in the order they are
# unpacked: classifier, scaler, feature list. Nothing is retrained here.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model, loaded_scaler, loaded_features = (
    joblib.load(path)
    for path in ('breast_cancer_model.pkl', 'scaler.pkl', 'selected_features.pkl')
)

print("Model loaded successfully!")
print(f"Loaded features: {loaded_features}")

In [None]:
# Use the first five rows of the test split as example inputs
sample_data = X_test.iloc[0:5]
print("Sample Input Data:")
print(sample_data)

# Apply the reloaded scaler, then get hard labels and class probabilities
# from the reloaded model
sample_scaled = loaded_scaler.transform(sample_data)
predictions = loaded_model.predict(sample_scaled)
prediction_proba = loaded_model.predict_proba(sample_scaled)

print("\nPredictions:")
for number, (pred, proba) in enumerate(zip(predictions, prediction_proba), start=1):
    diagnosis = "Benign" if pred == 1 else "Malignant"
    # Confidence is the larger of the two class probabilities, as a percentage
    confidence = proba.max() * 100
    print(f"Sample {number}: {diagnosis} (Confidence: {confidence:.2f}%)")

# Ground-truth labels for the same five rows, for side-by-side comparison
print("\nActual values:")
for number, actual in enumerate(y_test.iloc[0:5], start=1):
    diagnosis = "Benign" if actual == 1 else "Malignant"
    print(f"Sample {number}: {diagnosis}")

## 11. Summary

**Model Details:**
- Algorithm: Logistic Regression
- Features Used: mean radius, mean texture, mean perimeter, mean area, mean concavity
- Model Persistence: Joblib
- Dataset: Breast Cancer Wisconsin (Diagnostic)

**Note:** This system is strictly for educational purposes and must not be presented as a medical diagnostic tool.