## 1-Logistic regression classifier

In [0]:
import pandas as pd
import numpy as np
from random import randrange

In [0]:
df_breast_cancer = pd.read_csv("breast-cancer-wisconsin.data.txt", header=None)
df_breast_cancer.replace('?',np.nan, inplace=True)

df_breast_cancer.isnull().any(axis=1)
df_breast_cancer.dropna(axis=0, inplace=True)
df_breast_cancer.reset_index(inplace=True)

In [0]:
y = df_breast_cancer[10]
df_breast_cancer = df_breast_cancer[[1,2,3,4,5,6,7,8,9,10]]
# print(df_breast_cancer)

In [0]:
#positive:1 or 4, negative: 0 or 2
def get_values(actual, predicted):
  tp = tn = fp = fn = p = 0
#   print(len(actual))
#   print(len(predicted))
  for a in actual:
    if a == 0.0 and predicted[p] == 0.0:
      tn += 1
    elif a == 1.0 and predicted[p] == 1.0:  
      tp += 1
    elif a == 1.0 and predicted[p] == 0.0:
      fn += 1
    elif a == 0.0 and predicted[p] == 1.0:
      fp += 1   
      
    p += 1
  
  return tp, tn, fp, fn

def precision(tp, tn, fp):
  return (tp / (tp+fp) if (tp+fp) else 1) * 100
  
  
def recall(tp, tn, fn):
  return (tp / (tp+fn) if (tp+fn) else 1) * 100
  
  
def accuracy(tp, tn, fp, fn):
  return ((tp+tn) /(tp+tn+fp+fn)) * 100

In [30]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = []
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for i in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

def accuracy_metric(actual, predicted):
  correct = 0
#   print(predicted)
  for i in range(len(actual)):
    if actual[i] == predicted[i]:
      correct += 1
  return correct / float(len(actual)) * 100.0
 
# Evaluate an algorithm using a cross validation 
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset, n_folds)
  scores = {}
  accuracy_ls = []
  precision_ls = []
  recall_ls = []

  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in fold]
    
    tp, tn, fp, fn = get_values(actual, predicted)
    accuracy_ = accuracy(tp, tn, fp, fn)
    precision_ = precision(tp, tn, fp)
    recall_ = recall(tp, tn, fn)
    accuracy_ls.append(accuracy_)
    precision_ls.append(precision_)
    recall_ls.append(recall_)
  scores["Accuracy"] = accuracy_ls
  scores["Precision"] = precision_ls
  scores["Recall"] = recall_ls
  return scores

def predict(row, coefficients):
  f = coefficients[0]
  for i in range(len(row)-1):
    f += coefficients[i + 1] * int(row[i])  
  return 1.0 / (1.0 + np.exp(-f))
 
# Estimate logistic regression coefficients using stochastic gradient descent
def SGD_coeffs(train, learning_rate, n_epoch):
	coef = [0.0 for i in range(len(train[0]))]
	for epoch in range(n_epoch):
		for row in train:
			f = predict(row, coef)
			loss =  f - row[-1]
			coef[0] = coef[0] - learning_rate * loss * f * (1.0 - f)
			for i in range(len(row)-1):
				coef[i + 1] = coef[i + 1] - learning_rate * loss * f * (1.0 - f) * int(row[i])
	return coef

# MiniBatch Gradient Descent - finding coefficients
def MBGD_coeffs(train, learning_rate, n_epoch):
  coef = [0.0 for i in range(len(train[0]))]
  for epoch in range(n_epoch):    
    np.random.shuffle(train)    
    
    #taking batch size as 32
    batch = train[:32]  
      
    for data in batch:
      f = predict(data, coef)
      loss = f - data[-1]
      coef[0] = coef[0] - learning_rate * loss * f * (1.0 - f)
      for i in range(len(data)-1):
        coef[i + 1] = coef[i + 1] - learning_rate * loss * f * (1.0 - f) * int(data[i])    
   
  return coef
 
# Logistic Regression Classifier using MiniBatch Gradient Descent
def logistic_regression_mbgd(train, test, learning_rate, n_epoch):
  predictions = []
  coef = MBGD_coeffs(train, learning_rate, n_epoch)
  for data in test:
    f = predict(data, coef)
    f = round(f)
    predictions.append(f)
  return(predictions)

# Logistic Regression Algorithm With Stochastic Gradient Descent
def logistic_regression_sgd(train, test, learning_rate, n_epoch):
  predictions = list()
  coef = SGD_coeffs(train, learning_rate, n_epoch)
  for row in test:
    f = predict(row, coef)
    f = round(f)
    predictions.append(f)
  return(predictions)

# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column])
    
def normalize_dataset(dataset):
  minmax = list()
  for i in range(len(dataset[0])):
    col_values = [row[i] for row in dataset]
    value_min = min(col_values)
    value_max = max(col_values)
    minmax.append([value_min, value_max])
  for row in dataset:
    for i in range(len(row)):
      row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
    
#evaluation: 

dataset = df_breast_cancer.values.tolist()

for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
  
# normalize
normalize_dataset(dataset)

n_folds = 5
learning_rate = 0.1
n_epoch = 100

scores = evaluate_algorithm(dataset, logistic_regression_mbgd, n_folds, learning_rate, n_epoch)
print("MiniBatch:")
print("scores:%s"%scores)

scores = evaluate_algorithm(dataset, logistic_regression_sgd, n_folds, learning_rate, n_epoch)
print("SGD:")
print('Scores: %s' % scores)

MiniBatch:
scores:{'Accuracy': [90.44117647058823, 93.38235294117648, 93.38235294117648, 91.17647058823529, 89.70588235294117], 'Precision': [100.0, 100.0, 100.0, 97.22222222222221, 94.44444444444444], 'Recall': [74.0, 83.01886792452831, 79.06976744186046, 76.08695652173914, 73.91304347826086]}
SGD:
Scores: {'Accuracy': [95.58823529411765, 93.38235294117648, 94.11764705882352, 93.38235294117648, 90.44117647058823], 'Precision': [97.72727272727273, 97.77777777777777, 94.5945945945946, 97.36842105263158, 100.0], 'Recall': [89.58333333333334, 84.61538461538461, 85.36585365853658, 82.22222222222221, 74.50980392156863]}
