In [1]:
# Importing necessary libraries for data manipulation, model building, and evaluation
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Step 1: Load the Dataset

In [2]:
# Load the breast cancer dataset provided by sklearn and create a DataFrame
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target  # Append the target variable to the DataFrame

In [3]:
# Display the first few entries and basic information about the dataset
print("DataFrame Head:\n", df.head())
print("\nDataFrame Info:\n", df.info())

DataFrame Head:
    mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area 

# Step 2: Data Preparation

In [4]:
# Separate features (X) and target (y) for model training
X = df.drop('target', axis=1)
y = df['target']

In [5]:
# Split the data into training and test sets with a test size of 20% and a fixed random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Standardize features to have zero mean and unit variance for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Output the shapes of the training and testing data to ensure correct splitting
print("X_train Shape:", X_train.shape)
print("X_test Shape:", X_test.shape)

X_train Shape: (455, 30)
X_test Shape: (114, 30)


# Step 3: Feature Selection

In [8]:
# Initialize and apply SelectKBest to reduce dimensionality to the top 10 features based on ANOVA F-value
selector = SelectKBest(score_func=f_classif, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

In [9]:
# Display the shapes of the datasets after feature selection
print("X_train_selected Shape:", X_train_selected.shape)
print("X_test_selected Shape:", X_test_selected.shape)

X_train_selected Shape: (455, 10)
X_test_selected Shape: (114, 10)


# Step 4: Grid Search CV for Model Tuning

In [10]:
# Define an MLPClassifier with some predetermined parameters and early stopping

mlp = MLPClassifier(activation='relu', max_iter=500, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10)


In [11]:
# Set up a grid of hyperparameters to fine-tune the model using cross-validation
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 100)],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01]
}

In [12]:
# Execute Grid Search CV across the specified grid and 5-fold cross-validation
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

In [13]:
# Output the best parameters and the highest score achieved during Grid Search
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'learning_rate_init': 0.01, 'solver': 'adam'}
Best Score: 0.9538461538461538


# Step 5: Train the Best Model

In [14]:
# Retrieve the best model from Grid Search and evaluate it on the test set
best_mlp = grid_search.best_estimator_
predictions = best_mlp.predict(X_test_selected)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

In [15]:
# Print the model's accuracy and a detailed classification report
print("Model Accuracy:", accuracy)
print("Classification Report:\n", report)

Model Accuracy: 0.956140350877193
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



# Step 6: Save the model, scaler, and selector using pickle

In [16]:
# Serialize the trained model, scaler, and feature selector to files for later use
with open('breast_cancer_model.pkl', 'wb') as model_file:
    pickle.dump(best_mlp, model_file)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
with open('selector.pkl', 'wb') as selector_file:
    pickle.dump(selector, selector_file)

In [17]:
# Confirm successful saving of model components
print("Model, scaler, and selector saved successfully.")

Model, scaler, and selector saved successfully.
