In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,Normalizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


#DATA PREPROCESSING
# Loads the WDBC csv (read without a header row), converts the diagnosis
# column to 0/1 labels and builds the train / validation / test arrays.

#read data from file
data=pd.read_csv('wdbc.csv',header = None)

#Map values of B/M to 0/1
data[1] = data[1].map({'B':0, 'M':1})
# Any value other than 'B'/'M' maps to NaN, which would silently propagate
# into the loss and the gradients, so fail early instead.
if data[1].isna().any():
    raise ValueError("column 1 contains labels other than 'B' and 'M'")

#drop the first column : ID
to_drop1=[0]
data.drop(to_drop1,inplace=True,axis=1)

# NOTE: this is an alias of `data`, not a copy; the in-place drop of the
# label column below is visible through `up_data` as well.
up_data=data

Y=np.array([data[1]])   #Output data space, built as a single row: (1, n_samples)
print(Y.shape)
Y=Y.transpose()         #column vector: (n_samples, 1)

to_drop2=[1]
data.drop(to_drop2,inplace=True,axis=1)
X=data.to_numpy()          #Input data space

#splitting the data into training, validation and testing sets
#training data set = 85% and test data set = 15%
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.15,random_state=0)

#normalizing the data
# Normalizer rescales every sample (row) to unit norm independently; it
# learns nothing in fit(), so fitting on the training split only is harmless.
# NOTE(review): StandardScaler is imported but never used. If per-feature
# scaling was the intent, confirm and swap it in here.
scaler=Normalizer().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

#splitting training set into validation and training
# The rows are already unit-norm at this point, so no second normalization
# pass is needed after the split.
X_train,X_val,Y_train,Y_val=train_test_split(X_train,Y_train,test_size=0.05,random_state=1)


#Implementing Logistic Regression

#sigmoid function
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise to ``z``."""
    decay = np.exp(-z)
    denominator = 1 + decay
    return 1 / denominator

#prediction function
def predict_data(X_data,Y_data,W,bias):
    """Classify ``X_data`` with the logistic model and count the outcomes.

    Parameters
    ----------
    X_data : array-like, one row per sample
    Y_data : array-like of true labels; either one label per sample (1-D)
        or a 2-D array whose first column holds the label
    W : weight vector / column matrix multiplied with ``X_data``
    bias : scalar added to every linear score

    Returns
    -------
    tuple of int
        ``(TP, TN, FP, FN)`` where a sample is predicted positive (1) when
        its probability is at least 0.5.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of predictions.
    """
    logits = np.ravel(np.dot(X_data, W) + bias)
    # sigmoid(z) >= 0.5 exactly when z >= 0, so the threshold is applied to
    # the linear score directly; no exponential is needed for the decision.
    predicted_positive = logits >= 0
    pred_Y = predicted_positive.astype(int)

    labels = np.asarray(Y_data)
    if labels.ndim > 1:
        labels = labels[:, 0]       # labels live in the first column
    if labels.shape != pred_Y.shape:
        raise ValueError(
            "Y_data has %d labels but X_data produced %d predictions"
            % (labels.shape[0], pred_Y.shape[0])
        )

    correct = pred_Y == labels
    TP = int((correct & predicted_positive).sum())
    TN = int((correct & ~predicted_positive).sum())
    FP = int((~correct & predicted_positive).sum())
    FN = int((~correct & ~predicted_positive).sum())
    return TP,TN,FP,FN


# Histories. The per-epoch lists are appended to for every learning rate in
# turn, so they end up with epochs * len(learningrate) entries.
losstrack1 = []      # training cross-entropy after each epoch
losstrack2 = []      # validation cross-entropy after each epoch
accuracy_list = []   # validation accuracy after each epoch
acc_list = []        # final validation accuracy, one entry per learning rate
train_list = []      # training accuracy after each epoch

#Initializing weight,bias,epochs and learning rates
W = np.random.randn(X_train.shape[1], 1)*0.99
bias=2.99
epochs = 1000
learningrate = [0.5,0.7,0.9]

# Remember the starting point so that every learning rate is trained from
# the same initial parameters; otherwise each rate would simply continue
# from where the previous one stopped and the comparison in acc_list would
# be meaningless.
W_init = W.copy()
bias_init = bias

#splitting data into batches
batch = 50
X_train_batch = [X_train[i:i+batch] for i in range(0, len(X_train), batch)]
Y_train_batch = [Y_train[i:i+batch] for i in range(0, len(Y_train), batch)]


def _cross_entropy(p, y):
    """Mean binary cross-entropy of probabilities ``p`` against 0/1 labels ``y``."""
    # Clip away exact 0 and 1 so that log() never produces -inf / NaN when
    # the sigmoid saturates.
    eps = np.finfo(float).eps
    p = np.clip(p, eps, 1 - eps)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p)) / len(y)


for lr in learningrate:
    W = W_init.copy()
    bias = bias_init
    for epoch in range(epochs):
        #for each batch we train the dataset and manipulate values for W and bias
        for X_b, Y_b in zip(X_train_batch, Y_train_batch):
            m = X_b.shape[0]
            dz = sigmoid(np.dot(X_b, W) + bias) - Y_b

            #Gradient descent for W and bias
            dw = np.dot(X_b.T, dz) / m
            db = np.sum(dz) / m
            W = W - lr * dw
            bias = bias - lr * db

        #Calculating Cross Entropy for training dataset
        p_train = sigmoid(np.dot(X_train, W) + bias)
        train_cost = _cross_entropy(p_train, Y_train)
        losstrack1.append(float(train_cost))

        #Calculating Cross Entropy for validation dataset
        p_val = sigmoid(np.dot(X_val, W) + bias)
        validation_cost = _cross_entropy(p_val, Y_val)
        losstrack2.append(float(validation_cost))

        #Calculating confusion matrix metrics for training dataset
        TPtr,TNtr,FPtr,FNtr = predict_data(X_train,Y_train,W,bias)
        accuracy_tr = (TPtr+TNtr)/(TPtr+TNtr+FPtr+FNtr)          #accuracy for training dataset

        #Calculating confusion matrix metrics for validation dataset
        TPv,TNv,FPv,FNv = predict_data(X_val,Y_val,W,bias)
        accuracy_v = (TPv+TNv)/(TPv+TNv+FPv+FNv)                 #accuracy for validation dataset

        accuracy_list.append(float(accuracy_v))
        train_list.append(float(accuracy_tr))

    # validation accuracy reached at the end of training with this rate
    acc_list.append(float(accuracy_v))

# After the loop W and bias hold the model trained with the last learning rate.
TP,TN,FP,FN = predict_data(X_test,Y_test,W,bias)      #predicting on the test dataset


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    # A model that never predicts the positive class has TP + FP == 0; report
    # the metric as 0.0 instead of crashing with ZeroDivisionError.
    return numerator / denominator if denominator else 0.0


print('CALCULATED VALUE OF ACCURACY, PRECISION AND RECALL ON TEST DATA FOR WDBC DATASET')
print('--------------------------------------------------------------------------------')
accuracy_test = _ratio(TP+TN, TP+TN+FP+FN)      #share of correctly classified samples
print('ACCURACY VALUE: ' +str(accuracy_test))

precision_test = _ratio(TP, TP+FP)              #share of predicted positives that are correct
print('PRECISION VALUE: ' +str(precision_test))

recall_test = _ratio(TP, TP+FN)                 #share of actual positives that were found
print('RECALL VALUE: '+str(recall_test))
print('\n')
print('\n')


#Graph plotted for evaluation of the Logistic Regression model
# Loss curves: one point per entry in losstrack1 / losstrack2.
plt.title('Training Data and Validation Data vs Epoch')   
plt.plot(losstrack1,color = 'blue', label ='training error',lw=1)
plt.plot(losstrack2,color = 'red', label = 'validation error',lw=1)
plt.xlabel('Epoch')
plt.ylabel('Cost')
plt.legend(loc = 'best')
plt.show()
print('\n')
print('\n')

# Accuracy curves: one point per entry in train_list / accuracy_list.
plt.title('Accuracy of Training Data and Validation Data vs Epoch')
plt.plot(train_list,color = 'blue', label ='training accuracy',lw=1)
plt.plot(accuracy_list,color = 'red', label ='validation accuracy',lw=1)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc = 'best')
plt.show()
print('\n')
print('\n')

# acc_list holds one value per entry of learningrate, so plot it against the
# actual rates; without explicit x values the axis would show 0, 1, 2 while
# being labelled "Learning Rate".
plt.title('Accuracy of Validation Data vs Learning Rate')
plt.plot(learningrate,acc_list,color = 'red', label ='accuracy',lw=1,marker='o')
plt.xlabel('Learning Rate')
plt.ylabel('Accuracy')
plt.legend(loc = 'best')
plt.show()

(1, 569)


KeyboardInterrupt: 