In [19]:
import sklearn
import pandas as pd


The dataset we will be working with is the Breast Cancer
Wisconsin Diagnostic Database. The dataset includes various information
about breast cancer tumors, as well as classification labels of malignant or
benign. The dataset has 569 instances (one per tumor) and
includes 30 attributes, or features, such as the radius of
the tumor, texture, smoothness, and area.

Using this dataset, we will build a machine learning model that uses
tumor information to predict whether a tumor is malignant or benign.

In [2]:
from sklearn.datasets import load_breast_cancer

In [12]:
# Load the breast cancer dataset; its keys are unpacked into separate variables below.
data = load_breast_cancer()

The data variable is a dictionary-like object (a scikit-learn `Bunch`).
The important keys to consider are the
classification label names (target_names), the actual labels (target),
the attribute/feature names (feature_names), and the attributes
(data).

In [13]:
# Extract the required entries from the dataset object:
# class names, per-sample labels, feature (column) names, and the feature matrix.
label_names, labels, feature_names, features = (
    data[key] for key in ('target_names', 'target', 'feature_names', 'data')
)

In [16]:
# We split our data into training and testing sets randomly

from sklearn.model_selection import train_test_split

In [50]:
# Wrap the full feature matrix in a DataFrame so columns carry their feature names.
Br_Cancer_df_full = pd.DataFrame(data=features, columns=feature_names)

In [68]:
# Spot-check a single value: first sample (row 0), second feature (column 1).
features[0, 1]

10.38

In [52]:
# Preview the first rows of the full (unsplit) feature table.
Br_Cancer_df_full.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [53]:
# Hold out 20% of the samples for testing.
# random_state is fixed so the same split is produced on every run.
train, test, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=20,
)

In [54]:
# Display the training data as a DataFrame.
# `train` is a plain array, so the row index here is positional (0..n-1 within the training split).
Br_Cancer_df = pd.DataFrame(data=train, columns=feature_names)
Br_Cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,9.397,21.68,59.75,268.8,0.07969,0.06053,0.03735,0.005128,0.1274,0.06724,...,9.965,27.99,66.61,301.0,0.1086,0.1887,0.1868,0.02564,0.2376,0.09206
1,27.42,26.27,186.9,2501.0,0.1084,0.1988,0.3635,0.1689,0.2061,0.05623,...,36.04,31.37,251.2,4254.0,0.1357,0.4256,0.6833,0.2625,0.2641,0.07427
2,13.68,16.33,87.76,575.5,0.09277,0.07255,0.01752,0.0188,0.1631,0.06155,...,15.85,20.2,101.6,773.4,0.1264,0.1564,0.1206,0.08704,0.2806,0.07782
3,14.87,20.21,96.12,680.9,0.09587,0.08345,0.06824,0.04951,0.1487,0.05748,...,16.01,28.48,103.9,783.6,0.1216,0.1388,0.17,0.1017,0.2369,0.06599
4,15.85,23.95,103.7,782.7,0.08401,0.1002,0.09938,0.05364,0.1847,0.05338,...,16.84,27.66,112.0,876.5,0.1131,0.1924,0.2322,0.1119,0.2809,0.06287


In [39]:
# Check the sizes of the training and testing sets (rows, columns).
print(
    "Shape of the training set:{}".format(train.shape),
    "Shape of the testing set:{}".format(test.shape),
    sep="\n",
)

Shape of the training set:(455, 30)
Shape of the testing set:(114, 30)


In [24]:
# Building the model
# importing the knn and NaiveBayes Classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [25]:
# Initialize both classifiers with their default hyperparameters (no arguments are passed).
knn, gnb = KNeighborsClassifier(), GaussianNB()

In [40]:
# Train the classifiers on the training split (kNN first, then Naive Bayes).
# The value returned by each fit() call is kept under the model_* names.
model_knn, model_gnb = (
    clf.fit(train, train_labels) for clf in (knn, gnb)
)

In [41]:
# Predict class labels for the held-out test samples with each fitted classifier.
pred_gnb, pred_knn = gnb.predict(test), knn.predict(test)

In [42]:
# Evaluate both models on the test split; score() is scaled by 100 and
# printed as a whole-number percentage.
print(
    "The knn model is {:.0f}% accurate".format(100*knn.score(test, test_labels)),
    "The Naive Bayes model is {:.0f}% accurate".format(100*gnb.score(test, test_labels)),
    sep="\n",
)

The knn model is 94% accurate
The Naive Bayes model is 96% accurate


In [47]:
# Check the pairwise correlation between features, computed on the training
# split only (Br_Cancer_df), to spot groups of near-duplicate columns.

Br_Cancer_df.corr()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean radius,1.0,0.335562,0.997825,0.986915,0.186999,0.538553,0.68219,0.828758,0.170871,-0.285813,...,0.968073,0.294429,0.963354,0.940624,0.120439,0.41329,0.515769,0.740911,0.165959,0.022445
mean texture,0.335562,1.0,0.342777,0.334494,0.011513,0.277757,0.326041,0.321453,0.075981,-0.027356,...,0.373951,0.914819,0.379543,0.370442,0.111979,0.307283,0.320569,0.316833,0.116711,0.166057
mean perimeter,0.997825,0.342777,1.0,0.986124,0.222977,0.588484,0.721228,0.856845,0.206332,-0.234645,...,0.96831,0.303321,0.969214,0.941746,0.151121,0.454593,0.552609,0.767914,0.191504,0.065204
mean area,0.986915,0.334494,0.986124,1.0,0.195324,0.529235,0.692146,0.829458,0.173631,-0.256038,...,0.959367,0.285993,0.955176,0.957597,0.124065,0.388004,0.501401,0.716324,0.145,0.01925
mean smoothness,0.186999,0.011513,0.222977,0.195324,1.0,0.641251,0.514858,0.55431,0.564916,0.563446,...,0.228875,0.078924,0.255813,0.225781,0.802334,0.451136,0.420694,0.502455,0.394315,0.448947
mean compactness,0.538553,0.277757,0.588484,0.529235,0.641251,1.0,0.893336,0.844777,0.605563,0.550447,...,0.570473,0.301602,0.628298,0.546232,0.552542,0.860262,0.825079,0.835149,0.507706,0.663282
mean concavity,0.68219,0.326041,0.721228,0.692146,0.514858,0.893336,1.0,0.918336,0.511946,0.365618,...,0.695639,0.323206,0.738212,0.687577,0.439851,0.747691,0.879734,0.859523,0.410607,0.51921
mean concave points,0.828758,0.321453,0.856845,0.829458,0.55431,0.844777,0.918336,1.0,0.48414,0.182388,...,0.837297,0.318146,0.86425,0.819283,0.444909,0.658253,0.738878,0.908402,0.382004,0.363022
mean symmetry,0.170871,0.075981,0.206332,0.173631,0.564916,0.605563,0.511946,0.48414,1.0,0.481269,...,0.212465,0.11252,0.247636,0.204938,0.43875,0.470226,0.441813,0.459118,0.713224,0.422129
mean fractal dimension,-0.285813,-0.027356,-0.234645,-0.256038,0.563446,0.550447,0.365618,0.182388,0.481269,1.0,...,-0.224854,0.024722,-0.170814,-0.199405,0.496443,0.469138,0.388584,0.20674,0.344899,0.756023


In [None]:
# Feature engineering: remove strongly correlated (redundant) features

In [69]:
# Names of the feature columns to drop from the feature matrix.
col_del = [
    'worst radius',
    'worst perimeter',
    'worst area',
    'mean perimeter',
]

In [71]:
# Translate each column name into its integer position, so the matching
# columns can be removed from the raw feature array.
col_del_index = list(map(Br_Cancer_df.columns.get_loc, col_del))

In [74]:
import numpy as np

In [81]:
# Build a reduced feature matrix without the selected columns (axis=1 -> columns).
# np.delete returns a new array; `features` itself is left untouched.
features_new = np.delete(arr=features, obj=col_del_index, axis=1)

In [82]:
# Sanity check: 30 original features minus the 4 dropped ones should leave 26 columns.
features_new.shape

(569, 26)

In [88]:
# Hold out 20% of the reduced-feature data for testing, with the same
# test_size and random_state as the full-feature split.
# Note: train_labels and test_labels are reassigned by this call.
train_1, test_1, train_labels, test_labels = train_test_split(
    features_new,
    labels,
    test_size=0.2,
    random_state=20,
)

In [89]:
# Check the sizes of the reduced-feature training and testing sets (rows, columns).
print(
    "Shape of the training set:{}".format(train_1.shape),
    "Shape of the testing set:{}".format(test_1.shape),
    sep="\n",
)

Shape of the training set:(455, 26)
Shape of the testing set:(114, 26)


In [90]:
# Fit the existing knn / gnb estimator objects again, this time on the
# reduced-feature training set (kNN first, then Naive Bayes).
model_knn, model_gnb = (
    clf.fit(train_1, train_labels) for clf in (knn, gnb)
)

In [91]:
# Predict class labels for the reduced-feature test samples.
pred_gnb_1, pred_knn_1 = gnb.predict(test_1), knn.predict(test_1)

In [92]:
# Evaluate both models on the reduced-feature test split; score() is scaled
# by 100 and printed as a whole-number percentage.
print(
    "The knn model is {:.0f}% accurate".format(100*knn.score(test_1, test_labels)),
    "The Naive Bayes model is {:.0f}% accurate".format(100*gnb.score(test_1, test_labels)),
    sep="\n",
)

The knn model is 95% accurate
The Naive Bayes model is 90% accurate


In [None]:
# Removing several highly correlated features reduced the accuracy of Naive Bayes (96% -> 90%) but slightly increased that of kNN (94% -> 95%).