In [24]:
# importing the needed libraries/modules/packages

import math
import numpy as np
import pandas as pd
from sklearn import datasets, model_selection, preprocessing as PP

In [25]:
# loading the breast cancer dataset and getting the input, output and the feature names

data=datasets.load_breast_cancer()

X=data.data
Y=data.target

columns=data.feature_names
N=len(columns)

In [26]:
# describing the input data through a dataframe

df=pd.DataFrame(X, columns=columns)

df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [27]:
# scaling the data, and we're standardising it due to the presence of outliers as it will make the calculations fast and will remove the dominance of the features with large scale over those with small scale

scaler=PP.StandardScaler()

X=scaler.fit_transform(X)

X=np.concatenate((X, np.ones(X.shape[0]).reshape(-1, 1)), axis=1)

In [28]:
# splitting the data into training and testing, getting numpy arrays for the training part and concatinating a column with all 1s  to the last

X_train, X_test, Y_train, Y_test=model_selection.train_test_split(X, Y, random_state=1)

M=X_train.shape[0]

np_x=np.array(X_train)
np_y=np.array(Y_train)

In [29]:
# score function which predicts the mean average score or the accuracy for the predictions made

def score(Y_pred, Y_true):
    count_right_predictions=0
    total_predictions=len(Y_pred)

    for i in range(len(Y_pred)):        
        if Y_pred[i]==Y_true[i]:
            count_right_predictions+=1

    score=(count_right_predictions/total_predictions)

    return score

In [30]:
# predicting the type of breast cancer as malignant or benign depending on the feature values, using the optimal coefficients calculated through gradient descent 

def predict(x, coeffs):
    y_pred=np.zeros(x.shape[0])

    for i in range(x.shape[0]):
        mtxi=(coeffs*x[i]).sum()
        hxi=1/(1+math.exp(-mtxi))

        y_pred[i]=hxi

    return y_pred

In [31]:
# step gradient function which makes a change in the value of the coefficients, so that we move closer towards the optimal cost 

def step_gradient(coeffs, learning_rate):
    slope=np.zeros(N+1)    

    for j in range(N+1):
        for i in range(M):
            yi=np_y[i]
            mtxi=(coeffs*np_x[i]).sum()
            hxi=1/(1+math.exp(-mtxi))
            xij=np_x[i][j]

            slope[j]+=(xij*(yi-hxi))

    slope/=(-M)
    coeffs-=(learning_rate*slope)

    return coeffs

In [32]:
# cost function which calculates the cost/error for a particular value of the coefficients 

def cost(coeffs):  
    cost=0
     
    for i in range(M):
        yi=np_y[i]        
        mtxi=(coeffs*np_x[i]).sum()

        ei=(yi*mtxi)-math.log(1+math.exp(mtxi))

        cost+=ei

    cost/=(-M)

    return cost

In [33]:
# gd function which performs the gradient descent and returns the optimal coefficients for which the cost is minimised

def gd(learning_rate):    
    coeffs=np.zeros(N+1)

    prev_cost=cost(coeffs)

    # choosing the value of learning rate or aplha for which we just don't overshoot and the cost starts decreasing

    while True:                
        new_coeffs=step_gradient(coeffs, learning_rate)
        new_cost=cost(new_coeffs)    

        if new_cost>=prev_cost:
            learning_rate/=10                 
        else:
            break

    i=0

    while True:   
        prev_cost=cost(coeffs)

        coeffs=step_gradient(coeffs, learning_rate)
        new_cost=cost(coeffs)

        print("Cost", i, new_cost)

        # when the difference between the new and prev cost is <= 0.0001, then we simply break and return the optimal coefficients 

        if abs(new_cost-prev_cost)<=0.0001:
            break

        i+=1

    return coeffs 

In [34]:
# run function which runs the gradient descent code

def run():
    # initial value of learning rate or alpha
    
    learning_rate=0.1

    coeffs=gd(learning_rate)

    return coeffs

In [35]:
# getting the optimal coefficients through the gradient descent algorithm 

coeffs=run()

Cost 0 0.42624090990666574
Cost 1 0.3719832908404567
Cost 2 0.3348206054307278
Cost 3 0.30741806679756645
Cost 4 0.28617363691709097
Cost 5 0.26909556230910586
Cost 6 0.25498572215655624
Cost 7 0.24307646906945968
Cost 8 0.23285118467683255
Cost 9 0.2239481977200138
Cost 10 0.21610596863283316
Cost 11 0.20913016428533712
Cost 12 0.2028730182678479
Cost 13 0.1972199179346669
Cost 14 0.19208041505449802
Cost 15 0.18738203833948452
Cost 16 0.1830659341015939
Cost 17 0.17908373111393008
Cost 18 0.17539524430182865
Cost 19 0.17196676506075712
Cost 20 0.16876976939150398
Cost 21 0.1657799285567015
Cost 22 0.16297634206644387
Cost 23 0.16034093629102103
Cost 24 0.15785798800424253
Cost 25 0.15551374324714137
Cost 26 0.15329610969690893
Cost 27 0.15119440628247777
Cost 28 0.14919915779924076
Cost 29 0.14730192520451685
Cost 30 0.14549516443750313
Cost 31 0.14377210821935796
Cost 32 0.14212666650205352
Cost 33 0.14055334215545798
Cost 34 0.13904715918696456
Cost 35 0.13760360133179012
Cost 36 0

In [36]:
# getting the predictions for the test data and setting them to either 0 or 1

Y_pred=predict(X_test, coeffs)

for i in range(len(Y_pred)):
    if Y_pred[i]<=0.5:
        Y_pred[i]=0
    else:
        Y_pred[i]=1

In [37]:
# storing the predictions in a csv file

df=pd.DataFrame(np.array(Y_pred, dtype=int))

df.to_csv("predictions.csv", header=False, index=False)

In [38]:
# printing the score for the gradient descent algorithm 

score=score(Y_pred, Y_test)

print(score)

0.958041958041958
