Problem Statement:
     
  Build a model which classifies patients who have breast cancer. 

 Dataset is downloaded from kaggle: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [29]:
#Load required libararies
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [6]:
#Unzip the Breast_cancer zip file
import zipfile

with zipfile.ZipFile('breast_cancer.zip','r') as f:
    f.printdir();
    f.extractall();  #dataset is extracted into current directory

File Name                                             Modified             Size
data.csv                                       2019-09-19 21:54:02       125204


In [48]:
#load dataset
data=pd.read_csv('data.csv')
data.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [49]:
#print shape
data.shape

(569, 33)

In [50]:
#drop unsummary feature i.e Unnamed: 32
data.drop('Unnamed: 32',axis=1,inplace=True)

In [51]:
#Independent features
#drop id and diagnosis features 
inp=data.drop(['id','diagnosis'],1)
inp.head(2)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [52]:
#Dependent feature
out=data['diagnosis']
out.head(2)

0    M
1    M
Name: diagnosis, dtype: object

In [53]:
#replace M and B with 1 and 0 respectively
out=out.replace({'M':1,'B':0})

In [54]:
#Normally scaling is used for distance based algorithms(Logistic regression etc)
#But here we doing scaling bcoz for gradient descent(convergence will take place very quickly) 

from sklearn.preprocessing import StandardScaler

sc=StandardScaler()
inp_sc=sc.fit_transform(inp)
inp_sc=pd.DataFrame(inp_sc,columns=inp.columns)

inp_sc.head(2)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119


In [55]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(inp_sc,out,test_size=0.3,random_state=48)

In [56]:
#Print Shape
print("shape of xtrain :",xtrain.shape)
print("shape of xtest :",xtest.shape)
print("shape of ytrain :",ytrain.shape)
print("shape of ytest :",ytest.shape)

shape of xtrain : (398, 30)
shape of xtest : (171, 30)
shape of ytrain : (398,)
shape of ytest : (171,)


### Model Building

Typically model building in Neural network follows 2 ways 

1. Sequential model (Most preferred )
2. Functional model

In [57]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [58]:
nn_classifier=Sequential()
#first hideen layer
#units=16,number of neurons in the first hidden layer
#input_dim=number of neurons in input layers
nn_classifier.add(Dense(units=16,activation='relu',input_dim=30))
#second hidden layer
#units=8 number of neurons in the second layer
#no need to mention input_dim bcoz its a sequential layer(it knows the previous layer)
nn_classifier.add(Dense(units=8,activation='relu'))
nn_classifier.add(Dense(units=1,activation='sigmoid'))

In [59]:
#Print summary
nn_classifier.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 16)                496       
_________________________________________________________________
dense_15 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 9         
Total params: 641
Trainable params: 641
Non-trainable params: 0
_________________________________________________________________


In [60]:
#Number of weights and bias between input and 1st hidden layers is: (30*16)+16=496
#Number of weights and bias between 1st hidden and second hidden layers is: (16*8)+8=136
##Number of weights and bias between second hidden layer and output layers is: (8*1)+1=9

In [61]:
#optimizer: which gradient descent to use(mini,stochastic,batch)    #adam is mini-batch gradient descent(it choose the learning rate smartly)
#binary_crossentropy=binary classification
#Categorical_crossentropy=multiclass classification
nn_classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [62]:
#batch_size: number of samples to be consider before updating weights
#epochs: number of times a full operation to run

nn_classifier.fit(xtrain,ytrain,batch_size=32,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x24ec7c1b280>

In [63]:
#testing
nn_classifier.evaluate(xtest,ytest)



[0.0888427197933197, 0.9766082167625427]

In [66]:
#predictions
ypred=nn_classifier.predict(xtest).round()

*Interpretation:*

  The Model is not suffering any overfitting issue because both train and test accuracy are same. 

In [67]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,ypred)

array([[94,  1],
       [ 3, 73]], dtype=int64)

In [68]:
#Classification report
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        95
           1       0.99      0.96      0.97        76

    accuracy                           0.98       171
   macro avg       0.98      0.97      0.98       171
weighted avg       0.98      0.98      0.98       171



 *Interpretation:*
        
  The model correctly classifies patients who have malignant and benign with 98% of accuaracy.

                                                    ---thank you---