In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import math

In [2]:
df = pd.read_csv("dataset.csv")

In [3]:
df.drop('id', axis='columns', inplace=True)
df=df.dropna()

In [4]:
#preparing the data, the .values converts to numpy array
df_train = df.sample(frac=0.67,random_state=42)
df_test = df.drop(df_train.index)

X_train = df_train.drop('diagnosis', axis = 'columns').values
X_train = np.c_[np.ones(len(X_train)), X_train]
X_test = df_test.drop('diagnosis', axis = 'columns').values
X_test = np.c_[np.ones(len(X_test)), X_test]

y_train = df_train['diagnosis'].values
y_train[y_train == 'M'] = 1
y_train[y_train == 'B'] = 0
y_test = df_test['diagnosis'].values
y_test[y_test == 'M'] = 1
y_test[y_test == 'B'] = 0

In [5]:
#STARTING OFF WITH THE LOGISTIC REGRESSION CLASS

class Logreg:
    def sigmoid(self, x):
        sig = 1/(1 + np.exp(-x))
        return sig
  
    def __init__(self):    #constructor
        self.weights = None #bias lite for now
        
        
    def graddesc(self, X, y, learnrate):
        self.weights = np.zeros(df.shape[1]) #bias lite for now
        costs = []
        iters = []
        for j in range(0, len(X)): #will run 569 times -- basically 1 epoch, it will run through all of the inputs present at least once
                self.weights = self.weights - learnrate*(X[j].T*(self.sigmoid(np.dot(self.weights,X[j])) - y[j]))#the gradient function is (tn - yn) DOT x
                costs.append(self.compute_cost(self.sigmoid(np.dot(self.weights,X[j])),y[j]))
                iters.append(j)
        #plt.plot(iters,costs)
        return
    
    def compute_cost(self, predictions, actual):
        #m = len(actual)
        log_of_predictions = np.log(predictions+0.000000000000001)
        log_of_oneMinusPredictions = np.log(1-predictions+0.000000000000001)
        cost = -1*(np.sum(((actual*log_of_predictions) + ((1-actual)*(log_of_oneMinusPredictions)))))
        return cost
    
    def classify(self, X, threshold):
        predicts = np.zeros(len(X))
        for j in range (0, len(X)):
            if(self.sigmoid(np.dot(self.weights, X[j])) >= threshold):
                predicts[j] = 1
            else:
                predicts[j] = 0
        return predicts
    
    def sigmoidpredictors(self, X):
        predicts2 = np.zeros(len(X))
        for j in range (0, len(X)):
            predicts2[j] = self.sigmoid(np.dot(self.weights, X[j]))
        return predicts2
    
    def metricscore(self, y_actual, y_out):
        truepos = 0;
        falsepos = 0;
        trueneg = 0;
        falseneg = 0;
        for i in range(len(y_actual)):
            if(y_actual[i] == y_out[i]):
                if(y_actual[i] == 0):
                    trueneg = trueneg + 1;
                else:
                    truepos = truepos + 1;
            else:
                if(y_out[i] == 0):
                    falseneg = falseneg + 1;
                else:
                    falsepos = falsepos + 1;
        return truepos, falsepos, trueneg, falseneg
    
    def accuracy(self , y_actual , y_out): #taking the accuracy as the number of correct predictions / total number of predictions
        acc = np.sum(y_actual == y_out)/len(y_out)
        return acc
    
    def precision(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        prec = tp/(tp + fp)
        return prec
    
    def recall(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        #rec = np.sum(y_actual == y_out and y_out == 1)/np.sum((y_actual == y_out and y_out == 1) or (y_actual != y_out and y_out == 0))
        rec = tp/(tp + fn)
        return rec
        

In [6]:
class MiniBatchLogreg:
    def sigmoid(self, x):
        sig = 1/(1 + np.exp(-x))
        return sig
  
    def __init__(self):
        self.weights = None #bias lite for now
        
        
    def graddesc(self, X, y, learnrate, batchSize, epochs):
        self.weights = np.zeros(df.shape[1]) #bias lite for nwo
        errors = np.zeros(df.shape[1])
        costs = []
        iters = []
        a = 0
        for e in range(0, epochs):
            for i in range(0, len(X)-batchSize+1):
                cost1 = 0
                for j in range(i, batchSize+i): #will run 569 times -- basically 1 epoch, it will run through all of the inputs present at least once
                    errors = errors + X[j].T*(self.sigmoid(np.dot(self.weights,X[j])) - y[j])#the gradient function is (tn - yn) DOT x
                    cost1 = cost1 + self.compute_cost(self.sigmoid(np.dot(self.weights,X[j])),y[j])
                    i=i+1
                    a=a+1
                cost1 = cost1/batchSize
                costs.append(cost1)
                iters.append(a)
                self.weights = self.weights - learnrate * errors
        #plt.plot(iters, costs)
        return
    
    def compute_cost(self, predictions, actual):
        
        log_of_predictions = np.log(predictions+0.000000000000001)
        log_of_oneMinusPredictions = np.log(1-predictions+0.000000000000001)
        cost = -1*(np.sum(((actual*log_of_predictions) + ((1-actual)*(log_of_oneMinusPredictions)))))
        return cost
    
    def classify(self, X, threshold):
        predicts = np.zeros(len(X))
        for j in range (0, len(X)):
            if(self.sigmoid(np.dot(self.weights, X[j])) >= threshold):
                predicts[j] = 1
            else:
                predicts[j] = 0
        return predicts
    
    def metricscore(self, y_actual, y_out):
        truepos = 0;
        falsepos = 0;
        trueneg = 0;
        falseneg = 0;
        for i in range(len(y_actual)):
            if(y_actual[i] == y_out[i]):
                if(y_actual[i] == 0):
                    trueneg = trueneg + 1;
                else:
                    truepos = truepos + 1;
            else:
                if(y_out[i] == 0):
                    falseneg = falseneg + 1;
                else:
                    falsepos = falsepos + 1;
        return truepos, falsepos, trueneg, falseneg
    
    def accuracy(self , y_actual , y_out): #taking the accuracy as the number of correct predictions / total number of predictions
        acc = np.sum(y_actual == y_out)/len(y_out)
        return acc
    
    def precision(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        prec = tp/(tp + fp)
        return prec
    
    def recall(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        #rec = np.sum(y_actual == y_out and y_out == 1)/np.sum((y_actual == y_out and y_out == 1) or (y_actual != y_out and y_out == 0))
        rec = tp/(tp + fn)
        return rec

In [7]:
class BatchLogreg:
    def sigmoid(self, x):
        sig = 1/(1 + np.exp(-x))
        return sig
  
    def __init__(self):
        self.weights = None 
        
    def compute_cost(self, predictions, actual):
        
        log_of_predictions = np.log(predictions+0.000000000000001)
        log_of_oneMinusPredictions = np.log(1-predictions+0.000000000000001)
        cost = -1*(np.sum(((actual*log_of_predictions) + ((1-actual)*(log_of_oneMinusPredictions)))))
        return cost
        
    def graddesc(self, X, y, learnrate, epochs):
        self.weights = np.zeros(df.shape[1]) 
        errors = np.zeros(df.shape[1])
        costs = []
        iters = []
        for i in range(0, epochs):
            cost1 = 0
            for j in range(0, len(X)): 
                errors = errors + X[j].T*(self.sigmoid(np.dot(self.weights,X[j])) - y[j])
                cost1 = cost1 + self.compute_cost(self.sigmoid(np.dot(self.weights,X[j])),y[j])
            cost1 = cost1/len(X)
            self.weights = self.weights - learnrate * errors
            costs.append(cost1)
            iters.append(i)
        #plt.plot(iters, costs)
        return costs
    
    def sigmoidpredictors(self, X):
        predicts2 = np.zeros(len(X))
        for j in range (0, len(X)):
            predicts2[j] = self.sigmoid(np.dot(self.weights, X[j]))
        return predicts2
    
    def classify(self, X, threshold):
        predicts = np.zeros(len(X))
        for j in range (0, len(X)):
            if(self.sigmoid(np.dot(self.weights, X[j])) >= threshold):
                predicts[j] = 1
            else:
                predicts[j] = 0
        return predicts
    
    def metricscore(self, y_actual, y_out):
        truepos = 0;
        falsepos = 0;
        trueneg = 0;
        falseneg = 0;
        for i in range(len(y_actual)):
            if(y_actual[i] == y_out[i]):
                if(y_actual[i] == 0):
                    trueneg = trueneg + 1;
                else:
                    truepos = truepos + 1;
            else:
                if(y_out[i] == 0):
                    falseneg = falseneg + 1;
                else:
                    falsepos = falsepos + 1;
        return truepos, falsepos, trueneg, falseneg
    
    def accuracy(self , y_actual , y_out): 
        acc = np.sum(y_actual == y_out)/len(y_out)
        return acc
    
    def precision(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        prec = tp/(tp + fp)
        return prec
    
    def recall(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        rec = tp/(tp + fn)
        return rec

In [8]:
learnrate=0.01
batchSize=5
epochs=100
threshold = 0.4

# #Stochastic
LR1 = Logreg()
LR1.graddesc(X_train, y_train, learnrate)
test1_acc = LR1.accuracy(y_test, LR1.classify(X_test, threshold))
test1_prec = LR1.precision(y_test, LR1.classify(X_test, threshold))
test1_rec = LR1.recall(y_test, LR1.classify(X_test, threshold))
print('Stochastic')
print(test1_acc)
print(test1_prec)
print(test1_rec)

# #MiniBatch
LR2 = MiniBatchLogreg()
LR2.graddesc(X_train, y_train, learnrate, batchSize, epochs)
test2_acc = LR2.accuracy(y_test, LR2.classify(X_test, threshold))
test2_prec = LR2.precision(y_test, LR2.classify(X_test, threshold))
test2_rec = LR2.recall(y_test, LR2.classify(X_test, threshold))
print('\nMiniBatch')
print(test2_acc)
print(test2_prec)
print(test2_rec)

#Batch
LR3 = BatchLogreg()
LR3.graddesc(X_train, y_train, learnrate, epochs)
test3_acc = LR3.accuracy(y_test, LR3.classify(X_test, threshold))
test3_prec = LR3.precision(y_test, LR3.classify(X_test, threshold))
test3_rec = LR3.recall(y_test, LR3.classify(X_test, threshold))
print('\nBatch')
print(test3_acc)
print(test3_prec)
print(test3_rec)

  sig = 1/(1 + np.exp(-x))
  sig = 1/(1 + np.exp(-x))


Stochastic
0.8387096774193549
0.9795918367346939
0.6233766233766234

MiniBatch
0.8817204301075269
0.8313253012048193
0.8961038961038961

Batch
0.8817204301075269
0.9230769230769231
0.7792207792207793
