# Machine Learning Assignment


In [1]:
# Data, Datasets & Utils
import pandas as pd
from pandas.plotting import scatter_matrix
import pprint
import numpy as np
from time import time
from numpy import log2 as log

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plot 
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz

## Loading the dataset

In [2]:
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/breast-cancer.csv')

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          568 non-null    float64
 1   Radius (Mean)               550 non-null    float64
 2   Texture (Mean)              568 non-null    float64
 3   Perimeter (Mean)            567 non-null    float64
 4   Area (Mean)                 566 non-null    float64
 5   Smoothness (Mean)           567 non-null    float64
 6   Compactness (Mean)          566 non-null    float64
 7   Concavity (Mean)            569 non-null    float64
 8   Concave Points (Mean)       569 non-null    float64
 9   Symmetry (Mean)             566 non-null    float64
 10  Fractal Dimension (Mean)    567 non-null    float64
 11  Radius (Error)              551 non-null    float64
 12   Texture (Error)            567 non-null    float64
 13   Perimeter (Error)          568 non

Unnamed: 0,ID,Radius (Mean),Texture (Mean),Perimeter (Mean),Area (Mean),Smoothness (Mean),Compactness (Mean),Concavity (Mean),Concave Points (Mean),Symmetry (Mean),...,Texture (Worst),Perimeter (Worst),Area (Worst),Smoothness (Worst),Compactness (Worst),Concavity (Worst),Concave Points (Worst),Symmetry (Worst),Fractal Dimension (Worst),Diagnosis
0,842302.0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,,0.4601,0.1189,M
1,842517.0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,M
2,84300903.0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,M
3,84348301.0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,M
4,84358402.0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,M


## Processing the dataset

First, you need to clean the data.

In [3]:
# Data cleaning: impute missing values here; the 'Diagnosis' column is
# converted to numeric in the next cell.

# Replace each missing numeric value with the mean of its column.
# numeric_only=True restricts the mean to numeric columns: 'Diagnosis' holds
# strings ('M'/'B'), and on pandas 2.0+ a plain df.mean() raises TypeError
# when it meets such a column.
df = df.fillna(df.mean(numeric_only=True))

# Report how many missing values are left (non-numeric columns are not imputed).
print("Missing values remaining:", int(df.isnull().sum().sum()))

col = df.columns
print(col)

Index(['ID', 'Radius (Mean)', 'Texture (Mean)', 'Perimeter (Mean)',
       'Area (Mean)', 'Smoothness (Mean)', 'Compactness (Mean)',
       'Concavity (Mean)', 'Concave Points (Mean)', 'Symmetry (Mean)',
       'Fractal Dimension (Mean)', 'Radius (Error)', ' Texture (Error)',
       ' Perimeter (Error)', ' Area (Error)', ' Smoothness (Error)',
       ' Compactness (Error)', ' Concavity (Error)', ' Concave Points (Error)',
       ' Symmetry (Error)', ' Fractal Dimension (Error)', ' Radius (Worst)',
       ' Texture (Worst)', ' Perimeter (Worst)', ' Area (Worst)',
       ' Smoothness (Worst)', ' Compactness (Worst)', ' Concavity (Worst)',
       'Concave Points (Worst)', 'Symmetry (Worst)',
       'Fractal Dimension (Worst)', 'Diagnosis'],
      dtype='object')


### Preparing the X and y arrays for the machine learning classifier below

In [4]:
from sklearn import preprocessing

# Encode the diagnosis as numbers: 'M' -> 1, 'B' -> 0; any other value is left
# unchanged. The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Diagnosis']: an in-place call on a column
# selection acts on a temporary object under pandas Copy-on-Write and would
# leave df unmodified.
diagnosis_codes = {'M': 1, 'B': 0}
df['Diagnosis'] = df['Diagnosis'].map(lambda label: diagnosis_codes.get(label, label))

# Rescale every column to the [0, 1] range and rebuild the DataFrame with the
# original column names (fit_transform returns a plain NumPy array).
# NOTE: the scaler is fit on all rows before any train/test split, so the
# min/max of rows that later end up in a test set influence the scaling.
# Fit it on the training portion only for a leakage-free estimate.
names = df.columns
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=names)

# Splits the Pandas DataFrame into a feature matrix (X) and class/label vector (y).
# Columns 1..30 are the features; column 0 is 'ID' and column 31 is 'Diagnosis'.
X = df.iloc[:, 1:31]
y = df['Diagnosis']

# Transform class labels to integer labels (the scaled column holds floats)
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)


## Running Machine Learning classifier

Replace the code indicated below to import, load and run your own classifier.

Only two lines need to be updated: the classifier import and the line marked `REPLACE WITH YOUR CLASSIFIER`. Otherwise, the code should run your classifier as-is, demonstrating that it works and runs properly against the dataset provided.

In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.metrics import classification_report
from classifier import knn
# Distance metric noted by the author: Euclidean (implemented in classifier.knn, not shown here)
# Repeated stratified hold-out evaluation: one accuracy/precision/recall value per repetition.
# NOTE(review): the recorded output shows every test sample predicted as a single
# class (macro recall 0.50 +/- 0.00) - check the classifier and the input types
# it expects before trusting these numbers.
n = 20
accuracy_scores = []
precision_scores = []
recall_scores = []

for seed in range(n):
    # Hold-out validation - 60% training and 40% testing (test_size=0.4).
    # The split is stratified on y, the label vector actually being split, and
    # seeded with the repetition index so that a re-run reproduces the same splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=seed, stratify=y)

    # Creating the model, training and testing it
    model = knn.KNearestNeighbor(k=5) # REPLACE WITH YOUR CLASSIFIER
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Collect this repetition's metrics (macro-averaged across classes;
    # zero_division=0 scores a never-predicted class as 0 instead of warning)
    accuracy_scores.append(metrics.accuracy_score(y_test, y_pred))
    precision_scores.append(metrics.precision_score(y_test, y_pred, average='macro', zero_division=0))
    recall_scores.append(metrics.recall_score(y_test, y_pred, average='macro', zero_division=0))

# Convert once after the loop; np.append inside the loop copies the whole array on every call
accuracy_list = np.array(accuracy_scores)
precision_list = np.array(precision_scores)
recall_list = np.array(recall_scores)

# printing out the performance metrics (mean +/- two standard deviations)
print("Accuracy:  %0.2f (+/- %0.2f)" % (accuracy_list.mean(), accuracy_list.std() * 2))
print("Precision: %0.2f (+/- %0.2f)" % (precision_list.mean(), precision_list.std() * 2))
print("Recall:    %0.2f (+/- %0.2f)" % (recall_list.mean(), recall_list.std() * 2))
# Confusion matrix of the final repetition only (y_test / y_pred come from the last split)
print(confusion_matrix(y_test, y_pred))


Accuracy:  0.54 (+/- 0.24)
Precision: 0.27 (+/- 0.12)
Recall:    0.50 (+/- 0.00)
[[  0 143]
 [  0  85]]


In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.metrics import classification_report
from classifier import wokrM
# Distance metric noted by the author: Manhattan (implemented in classifier.wokrM, not shown here)
# Repeated stratified hold-out evaluation: one accuracy/precision/recall value per repetition.
# NOTE(review): the recorded output shows every test sample predicted as a single
# class (macro recall 0.50 +/- 0.00) - check the classifier and the input types
# it expects before trusting these numbers.
n = 20
accuracy_scores = []
precision_scores = []
recall_scores = []

for seed in range(n):
    # Hold-out validation - 60% training and 40% testing (test_size=0.4).
    # The split is stratified on y, the label vector actually being split, and
    # seeded with the repetition index so that a re-run reproduces the same splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=seed, stratify=y)

    # Creating the model, training and testing it
    model = wokrM.KNearestNeighbor(k=5) # REPLACE WITH YOUR CLASSIFIER
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Collect this repetition's metrics (macro-averaged across classes;
    # zero_division=0 scores a never-predicted class as 0 instead of warning)
    accuracy_scores.append(metrics.accuracy_score(y_test, y_pred))
    precision_scores.append(metrics.precision_score(y_test, y_pred, average='macro', zero_division=0))
    recall_scores.append(metrics.recall_score(y_test, y_pred, average='macro', zero_division=0))

# Convert once after the loop; np.append inside the loop copies the whole array on every call
accuracy_list = np.array(accuracy_scores)
precision_list = np.array(precision_scores)
recall_list = np.array(recall_scores)

# printing out the performance metrics (mean +/- two standard deviations)
print("Accuracy:  %0.2f (+/- %0.2f)" % (accuracy_list.mean(), accuracy_list.std() * 2))
print("Precision: %0.2f (+/- %0.2f)" % (precision_list.mean(), precision_list.std() * 2))
print("Recall:    %0.2f (+/- %0.2f)" % (recall_list.mean(), recall_list.std() * 2))
# Confusion matrix and per-class report of the final repetition only
# (y_test / y_pred come from the last split). The recorded output of this cell
# contains the per-class report, so it is printed here; zero_division=0 avoids
# the undefined-metric warning when a class is never predicted.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy:  0.58 (+/- 0.20)
Precision: 0.29 (+/- 0.10)
Recall:    0.50 (+/- 0.00)
[[143   0]
 [ 85   0]]
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       143
           1       0.00      0.00      0.00        85

    accuracy                           0.63       228
   macro avg       0.31      0.50      0.39       228
weighted avg       0.39      0.63      0.48       228



  _warn_prf(average, modifier, msg_start, len(result))
