In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
# Loading the dataset; the relative path is resolved against the current
# working directory, so the CSV must sit next to where the kernel runs.
data = pd.read_csv('Breast_Cancer.csv')

In [13]:
# Previewing the first rows of the raw data set to check the loaded columns
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [14]:
# Dropping unused fields: 'id' is a record identifier and 'Unnamed: 32'
# shows no values in the preview above, so neither is used as a feature.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
fields_to_drop = ['id', 'Unnamed: 32']
data = data.drop(columns=fields_to_drop, errors='ignore')


In [17]:
# Encoding diagnosis as an integer: 1 for malignant, 0 for benign
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
# Learn the label classes first, then map the column onto their integer codes.
label_encoder.fit(data['diagnosis'])
data['diagnosis'] = label_encoder.transform(data['diagnosis'])
# Show the distinct encoded values as a sanity check.
data['diagnosis'].unique()


array([1, 0], dtype=int64)

In [19]:
# Previewing the data set again to confirm the dropped columns are gone and
# diagnosis is now numeric
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [21]:
# Sequential 80/10/10 split taken in file order (no shuffling):
# the first 80% of rows are kept for training, the next 10% for validation
# and the final 10% for testing.
# NOTE: `data` is rebound to the training rows only, so re-running this cell
# without reloading the data would shrink the training set again.
n_rows = len(data)
val_split_idx = int(n_rows*0.8)
test_split_idx = int(n_rows*0.9)
test_data = data.iloc[test_split_idx:]
val_data = data.iloc[val_split_idx:test_split_idx]
data = data.iloc[:val_split_idx]

In [22]:
# Separating each split into features (X: every column except 'diagnosis')
# and targets (Y: the 'diagnosis' column)
train_X = data.drop(columns='diagnosis')
train_Y = data['diagnosis']

val_X = val_data.drop(columns='diagnosis')
val_Y = val_data['diagnosis']

test_X = test_data.drop(columns='diagnosis')
test_Y = test_data['diagnosis']


In [23]:
# Determining the number of rows and columns of the training split
# (`data` holds only the training rows after the split above)

data.shape

(455, 31)

In [24]:
#sigmoid function
def sigmoid(x):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-x)).

    Evaluated as exp(-log(1 + exp(-x))) via ``np.logaddexp`` so that large
    negative inputs do not overflow inside ``exp`` (the naive form computes
    exp(-x), which becomes inf and emits a RuntimeWarning).

    Parameters
    ----------
    x : scalar, ndarray or pandas object
        Input values; must support unary negation.

    Returns
    -------
    Values in [0, 1] with the same shape as ``x``.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed stably.
    return np.exp(-np.logaddexp(0, -x))

In [26]:
# model testing function

def test_model(X, y, beta):
    """Return the classification accuracy, in percent, of ``beta`` on ``(X, y)``.

    Parameters
    ----------
    X : DataFrame or ndarray
        Feature matrix with one row per sample; must expose ``.dot``.
    y : DataFrame, Series or array-like
        True 0/1 labels, one per row of ``X``. A one-column frame, a Series
        and a flat array are all accepted.
    beta : DataFrame or ndarray
        Coefficient vector with one entry per feature column of ``X``.

    Returns
    -------
    float
        Percentage (0-100) of samples whose thresholded prediction equals
        the label.

    Raises
    ------
    ValueError
        If there are no samples, or the number of predictions and labels
        differ.
    """
    # Flatten both sides so the comparison is strictly element-wise; comparing
    # an (n, 1) array with an (n,) array would broadcast to (n, n) and yield a
    # meaningless count.
    prob = np.asarray(sigmoid(X.dot(beta))).ravel()
    labels = np.asarray(y).ravel()

    if prob.shape[0] == 0:
        raise ValueError('cannot compute accuracy on an empty data set')
    if prob.shape[0] != labels.shape[0]:
        raise ValueError(
            'got {} predictions for {} labels'.format(prob.shape[0], labels.shape[0]))

    ## Converting prob to prediction, >.5 = True, <=.5 = False
    predictions = prob > 0.5
    accuracy = np.count_nonzero(predictions == labels)/prob.shape[0] * 100
    return accuracy

In [27]:
#Gradient step function

def gd_step(curr, y, X, lr=0.0000001):
    """Apply one gradient update to the logistic-regression coefficients.

    Parameters
    ----------
    curr : current coefficient vector.
    y : true labels for the rows of ``X``.
    X : feature matrix for this batch; must expose ``.dot`` and ``.T``.
    lr : step size applied to the update.

    Returns
    -------
    The updated coefficients, ``curr + lr * X.T.dot(y - sigmoid(X.dot(curr)))``.
    """
    # Predicted probabilities for the batch under the current coefficients.
    predicted = np.array(sigmoid(X.dot(curr)))
    # Difference between labels and predictions drives the update direction.
    residual = y - predicted
    return curr + lr * (X.T.dot(residual))


In [28]:
# Hyperparameters
batch_size = 50 #(can also use different number for batch size and result varies accordingly)
lr = 0.0001
max_iters = 51

# One coefficient per feature column, derived from the training data instead
# of hard-coding the column count.
n_features = train_X.shape[1]
beta_old, beta = np.ones((n_features,1)), np.zeros((n_features,1))
iter_count = 0

# Each pass of the while loop is one epoch over the training rows, processed
# in consecutive mini-batches (the last batch may be smaller than batch_size).
while iter_count < max_iters:
    if iter_count % 10 == 0:
        # Optional progress report, kept disabled:
        # print('Epoch: {}'.format(iter_count))
        # print('Validation Accuracy: {}%'.format(test_model(val_X, val_Y.to_frame(), beta)))
        # Snapshot of the coefficients taken every 10 epochs.
        beta_old = beta
    for i in range(0, train_X.shape[0], batch_size):
        # .iloc makes the positional row slicing explicit.
        beta = gd_step(beta, train_Y.iloc[i:i+batch_size].to_frame(),
                       train_X.iloc[i:i+batch_size], lr)
    iter_count += 1



In [29]:
# Reporting the number of completed epochs and the accuracy on the held-out test split
print('After {} Iterations'.format(iter_count))
test_accuracy = test_model(test_X, test_Y.to_frame(), beta)
print('Test Accuracy: {}%'.format(test_accuracy))


After 51 Iterations
Test Accuracy: 91.22807017543859%
