# Logistic Regression From Scratch using SGD 

In [None]:
from math import exp 

In [None]:
def predict(row, coefficient):
    """Return the logistic (sigmoid) prediction for a single row.

    The last element of ``row`` is never read: only the first
    ``len(row) - 1`` values are combined with the coefficients.

    Args:
        row: Sequence of numeric feature values followed by one trailing
            element that is ignored here.
        coefficient: Sequence ``[intercept, w_1, ..., w_k]`` where
            ``k == len(row) - 1``.

    Returns:
        ``1 / (1 + exp(-yhat))`` as a float, where ``yhat`` is the intercept
        plus the weighted sum of the features.
    """
    yhat = coefficient[0]
    for i in range(len(row) - 1):
        yhat += coefficient[i + 1] * row[i]
    try:
        return 1 / (1 + exp(-yhat))
    except OverflowError:
        # exp(-yhat) leaves the float range when yhat is very negative; the
        # sigmoid is indistinguishable from 0 there, so return that instead
        # of crashing.
        return 0.0

In [None]:
# Small two-feature sample; the last value of each row is the expected class.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]
# Fixed coefficients: intercept first, then one weight per feature.
coef = [-0.406605464, 0.852573316, -1.104746259]
# Show the expected class, the raw prediction and the rounded prediction.
for row in dataset:
    yhat = predict(row, coef)
    print('Expected,Predicted', row[-1], yhat, round(yhat))

Expected,Predicted 0 0.2987569855650975 0
Expected,Predicted 0 0.14595105593031163 0
Expected,Predicted 0 0.08533326519733725 0
Expected,Predicted 0 0.21973731424800344 0
Expected,Predicted 0 0.24705900008926596 0
Expected,Predicted 1 0.9547021347460022 1
Expected,Predicted 1 0.8620341905282771 1
Expected,Predicted 1 0.9717729050420985 1
Expected,Predicted 1 0.9992954520878627 1
Expected,Predicted 1 0.9054893228110497 1


In [None]:
def coeff_sgd(train, l_rate, n_epoch):
    """Estimate logistic-regression coefficients by stochastic gradient descent.

    Coefficients start at 0.0 and are updated after every training row.
    One line per epoch is printed with the epoch number, the learning rate
    and the sum of squared errors over all rows of that epoch.

    Args:
        train: Non-empty list of rows; the last element of each row is the
            target value and the preceding elements are the features.
        l_rate: Learning rate applied to every update.
        n_epoch: Number of full passes over ``train``.

    Returns:
        List ``[intercept, w_1, ..., w_k]`` with one entry per element of the
        first training row.
    """
    coef = [0.0 for _ in range(len(train[0]))]
    for epoch in range(n_epoch):
        sum_error = 0.0
        for row in train:
            yhat = predict(row, coef)
            error = row[-1] - yhat
            # Accumulate across the epoch; plain assignment would report
            # only the last row's squared error.
            sum_error += error ** 2
            # Factor shared by the intercept and every weight update.
            step = l_rate * error * yhat * (1 - yhat)
            coef[0] = coef[0] + step
            for i in range(len(row) - 1):
                coef[i + 1] = coef[i + 1] + step * row[i]
        print('Epoch,l_rate,Error', epoch, l_rate, sum_error)
    return coef

In [None]:
# Small two-feature sample; the last value of each row is the target class.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]
# Training settings for the demonstration run.
l_rate = 0.03
n_epoch = 100
# Fit the coefficients on the sample and display them.
coef = coeff_sgd(dataset, l_rate, n_epoch)
print(coef)

In [None]:
from random import seed
from random import randrange
from csv import reader

In [None]:
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of strings per non-empty row, in file order.
        No type conversion is performed.
    """
    # newline='' hands line endings to the csv module untouched, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

In [None]:
def str_column_to_float(dataset, column):
    """Convert one column of every row from text to float, in place.

    Surrounding whitespace is stripped from each value before conversion.

    Args:
        dataset: List of mutable rows whose ``column`` entry is a string.
        column: Index of the column to convert.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

In [None]:
def dataset_minmax(dataset):
    """Return the minimum and maximum of every column.

    The number of columns is taken from the first row.

    Args:
        dataset: Non-empty list of rows holding comparable values.

    Returns:
        A list of ``[min, max]`` pairs, one per column, in column order.
    """
    n_columns = len(dataset[0])
    # Lazily build each column as a list, then reduce it to its extremes.
    columns = ([record[j] for record in dataset] for j in range(n_columns))
    return [[min(values), max(values)] for values in columns]

In [None]:
def normalize_dataset(dataset, minmax):
    """Rescale every value to the range 0-1 using per-column min and max.

    Each value becomes ``(value - min) / (max - min)`` for its column. The
    rows are modified in place.

    Args:
        dataset: List of mutable rows of numbers.
        minmax: Per-column ``[min, max]`` pairs, indexed like the rows.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # The subtraction must happen before the division. A constant
            # column (zero span) carries no information and maps to 0.0
            # instead of dividing by zero.
            row[i] = (value - low) / span if span else 0.0

In [None]:
def cross_validation_split(dataset, n_folds):
    """Split the rows into ``n_folds`` random folds of equal size.

    Rows are drawn without replacement using ``randrange``. Each fold holds
    ``int(len(dataset) / n_folds)`` rows, so rows that do not fit evenly are
    left out. ``dataset`` itself is not modified.

    Args:
        dataset: List of rows to split.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Draw fold_size rows at random positions from what is left.
        folds.append(
            [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
        )
    return folds

In [None]:
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where prediction equals truth.

    Args:
        actual: Sequence of true values.
        predicted: Sequence of predicted values, at least as long as
            ``actual``.

    Returns:
        Share of matching positions as a float between 0 and 100.
    """
    hits = sum(1 for k in range(len(actual)) if actual[k] == predicted[k])
    return hits / float(len(actual)) * 100

In [None]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score an algorithm with k-fold cross-validation.

    Each fold is held out once. The algorithm is trained on the rows of the
    other folds and asked to predict the held-out rows, which are passed as
    copies with their last element replaced by ``None``.

    Args:
        dataset: List of rows; the last element of each row is the target.
        algorithm: Callable ``algorithm(train, test, *args)`` returning one
            prediction per test row.
        n_folds: Number of folds to use.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        List with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        others = list(folds)
        others.remove(held_out)
        # Flatten the remaining folds into one training list.
        train_set = [row for part in others for row in part]
        test_set = []
        for row in held_out:
            masked = list(row)
            # Hide the target so the algorithm cannot read it.
            masked[-1] = None
            test_set.append(masked)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [None]:
def logistic_regression(train, test, l_rate, n_epoch):
    """Fit coefficients on ``train`` and predict a class for each test row.

    Args:
        train: Training rows passed to ``coeff_sgd``.
        test: Rows to predict.
        l_rate: Learning rate passed to ``coeff_sgd``.
        n_epoch: Number of epochs passed to ``coeff_sgd``.

    Returns:
        List with the rounded prediction for every row of ``test``.
    """
    coef = coeff_sgd(train, l_rate, n_epoch)
    return [round(predict(row, coef)) for row in test]

In [None]:
# Fix the random seed so the fold assignment is repeatable.
seed(1)

# Load the data and convert every column from text to float.
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# Rescale the columns using their observed minimum and maximum.
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

# Cross-validate logistic regression and report the per-fold and mean accuracy.
n_folds = 5
l_rate = 0.01
n_epoch = 200
scores = evaluate_algorithm(
    dataset, logistic_regression, n_folds, l_rate, n_epoch
)
print('Scores', scores)
print('Mean Accuracy', (sum(scores) / float(len(scores))))