# Breast Cancer Prediction Model Development

This notebook covers Part A of the project:
1. Loading the Breast Cancer Wisconsin dataset
2. Feature Selection
3. Data Preprocessing
4. Model Training (Logistic Regression)
5. Evaluation
6. Model Saving
7. Reload Verification

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

%matplotlib inline

## 1. Load Dataset

In [None]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

# Map sklearn's "mean <x>" column names to the snake_case "<x>_mean" names
# that the feature-selection step refers to.
feature_mapping = {
    f"mean {stem}": f"{stem}_mean"
    for stem in ("radius", "texture", "perimeter", "area", "concavity")
}

# Build the frame in a single chain: features, then the label column, then
# the renames. In sklearn's encoding the label is 0 = malignant, 1 = benign.
df = (
    pd.DataFrame(data.data, columns=data.feature_names)
    .assign(diagnosis=data.target)
    .rename(columns=feature_mapping)
)

print(f"Target names: {data.target_names}")
df.head()

## 2. Feature Selection

In [None]:
# The five mean-value measurements used as model inputs.
selected_features = [
    f"{stem}_mean"
    for stem in ("radius", "texture", "perimeter", "area", "concavity")
]

# Feature matrix (selected columns only) and label vector.
X = df.loc[:, selected_features]
y = df["diagnosis"]

print("Selected features:", X.head(), sep="\n")

## 3. Preprocessing

In [None]:
# Count missing entries per selected feature.
print(X.isna().sum())

In [None]:
# Split the data: 80% train / 20% test, reproducible via the fixed seed.
# Stratify on the label so both partitions keep the same malignant/benign
# ratio as the full dataset; with a test set this small, an unstratified
# split can shift the class balance and distort the evaluation metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features. The scaler is fitted on the training split only, and the
# test split is transformed with those same training statistics, so nothing
# about the test data influences preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Model Training

In [None]:
# Fit a logistic-regression classifier on the standardized training features.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the bare `model` on the last line keeps the estimator as the cell's output.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
model

## 5. Evaluation

In [None]:
# Predict on the held-out, scaled test split.
y_pred = model.predict(X_test_scaled)

# NOTE: precision/recall/F1 below use scikit-learn's default pos_label=1.
# In this dataset label 1 is *benign* (0 is malignant), so these three numbers
# describe how well the benign class is detected, not the malignant class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Per-class breakdown so the malignant-class precision/recall (the clinically
# important figures) are visible alongside the benign-class ones.
print("\nPer-class report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

In [None]:
# Confusion matrix of test-set predictions, drawn as an annotated heatmap
# with the y-axis showing actual classes and the x-axis predicted classes.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=data.target_names,
    yticklabels=data.target_names,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

## 6. Save Model

In [None]:
# Bundle everything needed at inference time into one artifact: the fitted
# classifier, the fitted scaler, the expected feature order, and the class names.
model_data = dict(
    model=model,
    scaler=scaler,
    features=selected_features,
    target_names=data.target_names,
)

# Persist the bundle to disk in the current working directory.
joblib.dump(model_data, "breast_cancer_model.pkl")
print("Model saved successfully.")

## 7. Reload Verification

In [None]:
# Reload the artifact written by the save step. joblib files are pickle-based,
# so only load files from a trusted source (here: the file this notebook wrote).
loaded_data = joblib.load('breast_cancer_model.pkl')
loaded_model = loaded_data['model']
loaded_scaler = loaded_data['scaler']
loaded_features = loaded_data['features']
loaded_target_names = loaded_data['target_names']

# Exercise the whole persisted bundle: start from the raw (unscaled) test rows,
# select columns using the saved feature list, and scale with the saved scaler.
X_test_reloaded = loaded_scaler.transform(X_test[loaded_features])

# The reloaded scaler + model must reproduce the in-memory model's predictions
# exactly; otherwise the saved artifact is not a faithful copy.
assert np.array_equal(
    loaded_model.predict(X_test_reloaded), model.predict(X_test_scaled)
), "Reloaded artifact does not reproduce the in-memory model's predictions"

# Single-row prediction; reshape to (1, n_features) because predict expects 2-D input.
sample = X_test_reloaded[0].reshape(1, -1)
print(f"Prediction for sample: {loaded_target_names[loaded_model.predict(sample)[0]]}")