# Oluwatimileyin Oyedele
# C0902571
# AML 3104 Assignment 4

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, chi2
import joblib

In [2]:
# Load the breast cancer dataset
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [3]:
# Check for missing values
print("Checking for missing values...")
print(df.isnull().sum())

Checking for missing values...
mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [4]:
# Explore descriptive statistics
print("\nDescriptive Statistics:")
print(df.describe())


Descriptive Statistics:
       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000         

In [5]:
# Normalize the features
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df.drop('target', axis=1))
scaled_df = pd.DataFrame(scaled_features, columns=data.feature_names)
scaled_df['target'] = df['target'].values

In [6]:
# Feature Selection
X = scaled_df.drop('target', axis=1)
y = scaled_df['target']
selector = SelectKBest(score_func=chi2, k=10)
X_selected = selector.fit_transform(X, y)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, selector.get_support(indices=True)], y, test_size=0.2, random_state=42)

In [8]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'max_iter': [200, 300]
}

grid_search = GridSearchCV(MLPClassifier(), param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [9]:
# Best parameters and score
print("\nBest Parameters:")
print(grid_search.best_params_)
print("\nBest Cross-Validation Score:")
print(grid_search.best_score_)


Best Parameters:
{'activation': 'tanh', 'hidden_layer_sizes': (150,), 'max_iter': 300, 'solver': 'adam'}

Best Cross-Validation Score:
0.9494505494505494


In [10]:
# Implementing the ANN Model
ann_model = MLPClassifier(hidden_layer_sizes=(150,), activation='tanh', solver='adam', max_iter=300)
ann_model.fit(X_train, y_train)



In [11]:
# Save the trained model and scaler
joblib.dump(ann_model, 'breast_cancer_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [12]:
# Evaluate the Model
y_pred = ann_model.predict(X_test)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[42  1]
 [ 2 69]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        43
           1       0.99      0.97      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [15]:
# Save the feature selector
joblib.dump(selector, "feature_selector.pkl")

['feature_selector.pkl']