In [None]:
import os
import sys

import numpy as np
import pandas as pd

In [None]:
class LogisticRegressionClassifier():
  """Binary logistic-regression classifier trained by batch gradient descent.

  A column of ones is prepended to the inputs, so ``theta[0]`` is the
  intercept. The ``lmbda`` penalty is applied to every weight except the
  intercept, and the step size decays as ``alpha / (iteration/200 + 1)``.

  Args:
    alpha: Base learning rate (converted to float).
    lmbda: Regularization constant (converted to float).
    epsilon: Convergence threshold; training stops once the squared
      gradient norm drops below it.
    maxiter: Maximum number of gradient steps (converted to int).
    seed: Optional seed for the initial weights. When ``None`` (default) the
      weights are drawn from numpy's global random state, as before.

  Attributes set by ``fit``:
    theta: Learned weights, intercept first.
    probability: Sigmoid outputs on the training data, computed at the start
      of the last iteration performed.
    log: Gradient norm recorded per iteration, zero-padded up to ``maxiter``.
    n_iter: Number of iterations actually performed.
  """

  # Logits are clipped to +/- this bound so np.exp never overflows float64.
  _MAX_LOGIT = 500.0
  # Number of iterations after which the learning rate has halved.
  _DECAY_STEPS = 200

  def __init__(self,alpha,lmbda,epsilon,maxiter,seed=None):
    self.alpha = float(alpha) # Learning rate
    self.lmbda = float(lmbda) # Regularization constant
    self.epsilon = float(epsilon) # Convergence measure
    self.maxiter = int(maxiter) # The maximum number of iterations
    self.seed = seed # Optional seed for the weight initialization
    self.threshold = 0.5 # The class prediction threshold
    self.log = np.zeros(self.maxiter)
    self.n_iter = 0

  def __str__(self):
    return "<logistic Regression Classifier Instance: alpha=" + str(self.alpha) + ">\n"

  def get_proba(self,X):
    """Return sigmoid(X . theta) for inputs that already contain the bias term."""
    logits = np.clip(np.dot(X, (self.theta).T), -self._MAX_LOGIT, self._MAX_LOGIT)
    return np.asarray(1.0/(1.0 + np.exp(-logits)))

  def predict_proba(self,X):
    """Return the positive-class probability for raw inputs (bias term added here)."""
    return self.get_proba(self.add_ones(X))

  def predict(self,X):
    """Return a 1-D boolean array: True where the probability exceeds ``threshold``."""
    # atleast_1d keeps the single-sample case returning a length-1 array.
    return np.atleast_1d(self.predict_proba(X) > self.threshold)

  def add_ones(self,X):
    """Prepend the constant bias feature to one sample (1-D) or a batch (2-D)."""
    X = np.asarray(X)
    if X.ndim == 1:
      return np.insert(X, 0, 1)
    return np.insert(X, 0, 1, axis=1)

  def fit(self, X_, y):
    """Optimize ``theta`` on the training data by batch gradient descent.

    Args:
      X_: 2-D array of training samples (one row per instance), without the
        bias column.
      y: Targets, one per row of ``X_``; a column vector is flattened.

    Raises:
      ValueError: If ``y`` does not contain exactly one value per row of ``X_``.
    """
    X = self.add_ones(X_) # Bias terms
    # Flatten so an (n, 1) target cannot broadcast into an (n, n) residual.
    y = np.asarray(y, dtype=float).ravel()

    self.n = X.shape[0] # Instances
    self.m = X.shape[1] # Features (including the bias column)
    if y.shape[0] != self.n:
      raise ValueError("y has %d values but X has %d rows" % (y.shape[0], self.n))

    self.probability = np.zeros(self.n) # Output probabilities
    # Reset the history so a refit does not keep entries from an earlier run.
    self.log = np.zeros(self.maxiter)
    self.n_iter = 0
    if self.seed is None:
      self.theta = np.random.rand(self.m)
    else:
      self.theta = np.random.RandomState(self.seed).rand(self.m)

    # Iterate at most maxiter times; stop early once the gradient is small enough.
    for iteration in range(self.maxiter):
      alpha = self.alpha/(iteration/self._DECAY_STEPS+1)
      self.probability = self.get_proba(X)

      # Gradient of the loss; the penalty term skips the intercept (index 0).
      gw = X.T @ (self.probability - y)
      gw[1:] += self.lmbda * self.theta[1:]
      self.theta -= alpha * gw

      # Convergence is judged on the gradient that produced this step.
      grad_norm = np.linalg.norm(gw)
      self.log[iteration] = grad_norm
      self.n_iter = iteration + 1
      if self.epsilon > grad_norm**2:
        break



In [None]:
def Accueval(Y_test, Y_proba):
  """Return ``1 - mean(|Y_test - Y_proba|)``.

  With hard 0/1 predictions this equals classification accuracy; with
  probabilities it is one minus the mean absolute error.

  Args:
    Y_test: Reference labels (array-like of numbers or booleans).
    Y_proba: Predicted labels or probabilities, one per reference label.

  Returns:
    The score as a numpy float.
  """
  # Cast to float so boolean inputs can be subtracted (numpy rejects bool - bool),
  # and flatten so (n,) vs (n, 1) inputs are compared element by element
  # instead of broadcasting into an (n, n) matrix.
  truth = np.asarray(Y_test, dtype=float).ravel()
  scores = np.asarray(Y_proba, dtype=float).ravel()
  return 1 - np.mean(np.abs(truth - scores))


In [None]:
def data_preprocess(data_file, test_data, colomn_selected):
  """Load the train/test CSVs and build standardized feature matrices.

  For each file the last column is taken as the target and the remaining
  columns as raw features. The raw features are extended with their squares,
  the columns listed in ``colomn_selected`` are kept, and both sets are
  standardized with the mean and standard deviation of the training set.

  Args:
    data_file: Training CSV (anything ``pd.read_csv`` accepts).
    test_data: Test CSV (anything ``pd.read_csv`` accepts).
    colomn_selected: Column indices into ``[features, features**2]``.

  Returns:
    ``(X, Y, X_test, Y_test)``.
  """

  def _load(csv_source):
    # Split one CSV into the selected (raw + squared) features and the target.
    table = np.array(pd.read_csv(csv_source))
    raw = table[:,:-1]
    expanded = np.hstack([raw, raw**2])
    return expanded[:,colomn_selected], table[:,-1]

  X_selected, Y = _load(data_file)
  X_test_selected, Y_test = _load(test_data)

  # Scaling statistics come from the training set only.
  X_mean = np.mean(X_selected,axis=0)
  X_std = np.std(X_selected,axis=0)
  # A constant column has zero spread; scale it by 1 so it is only centred
  # instead of turning into NaN/inf through a division by zero.
  X_std = np.where(X_std == 0, 1.0, X_std)

  X = (X_selected - X_mean)/X_std
  X_test = (X_test_selected - X_mean)/X_std

  return X, Y, X_test, Y_test

In [None]:
def main(data_file, test_data, alpha=0.01, lmbda=0, epsilon=0.0001, maxiter=10000):
  """Cross-validate, train and evaluate the classifier on a known dataset.

  The feature columns are chosen from the training file's name, so the file
  may live in any directory. The score printed for the validation folds and
  the test set is what ``Accueval`` returns when given probabilities, i.e.
  one minus the mean absolute error, not a thresholded accuracy.

  Args:
    data_file: Path of the training CSV; its base name selects the features.
    test_data: Path of the test CSV.
    alpha: Learning rate passed to the classifier.
    lmbda: Regularization constant passed to the classifier.
    epsilon: Convergence threshold passed to the classifier.
    maxiter: Maximum number of iterations passed to the classifier.

  Returns:
    The classifier fitted on the full training set, or ``None`` when the
    dataset is not one of the supported ones.
  """
  # Number of consecutive rows held out in each validation fold.
  fold_size = 10

  # Pre-selected feature columns, keyed by training file name.
  columns_by_dataset = {
    'hepatitis_train.csv': [2,11,16,17,18],
    'bankrupcy_train.csv': [23,33,34,47,49,55,62,68,76,83,86,93,107,115,127],
  }

  # Match on the base name so 'some/dir/hepatitis_train.csv' is recognised too.
  try:
    dataset_name = os.path.basename(data_file)
  except TypeError:
    dataset_name = None
  column = columns_by_dataset.get(dataset_name)
  if column is None:
    print('Sorry, we have not optimize our model for this dataset.')
    return

  X, Y, X_test, Y_test = data_preprocess(data_file, test_data, column)

  # K-fold cross validation over consecutive blocks of rows
  fold_scores = []
  for k in range(0, X.shape[0], fold_size):
    # split the dataset into training subset and validation subset
    X_vali = X[k:k+fold_size,:]
    Y_vali = Y[k:k+fold_size]
    X_train = np.vstack([X[0:k, :], X[k+fold_size:, :]])
    Y_train = np.hstack([Y[:k], Y[k+fold_size:]])

    # fit a fresh classifier on the training part of this fold
    LRC = LogisticRegressionClassifier(alpha, lmbda, epsilon, maxiter)
    LRC.fit(X_train, Y_train)

    # score the held-out rows
    Y_vali_proba = LRC.predict_proba(X_vali)
    fold_scores.append(Accueval(Y_vali, Y_vali_proba))

  # validation results for this model
  Acc_train = np.mean(fold_scores)
  print('Average accuracy in cross validation: ' + str(Acc_train))

  # get final model
  LRC = LogisticRegressionClassifier(alpha, lmbda, epsilon, maxiter)
  print("\nCreated a logistic regression classifier =", LRC)

  print("Fitting the training data...\n")
  print("The weight matrix is:\n")
  LRC.fit(X, Y)
  print(LRC.theta)

  # predict the results for the test data
  print("\nGenerating probability prediction for the test data...\n")
  Y_proba = LRC.predict_proba(X_test)
  Y_pred = LRC.predict(X_test)
  print(Y_pred)

  # score on the test set
  accu = Accueval(Y_test, Y_proba)
  print('\nAccuracy in testing: ' + str(accu))
  return LRC


In [None]:
main('hepatitis_train.csv', 'hepatitis_test.csv', alpha=0.1, lmbda=0, epsilon=0.01, maxiter=10000)

Average accuracy in cross validation: 0.7950685230811794

Created a logistic regression classifier = <logistic Regression Classifier Instance: alpha=0.1>

Fitting the training data...

The weight matrix is:

[ 2.1991571   0.25227407  0.5129146   0.62388065  0.45431409 -0.3887027 ]

Generating probability prediction for the test data...

[ True  True  True  True  True  True  True  True  True  True  True  True
 False False  True  True  True  True  True  True]

Accuracy in testing: 0.7495695188855812
Running time: 0.10585199999999961 Seconds
