In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline 
from sklearn.metrics import accuracy_score

df_imputed = pd.read_csv('responses.csv', sep=",")

In [186]:
df_imputed = df_imputed.dropna()

In [187]:
for col in ['Smoking', 'Alcohol', 'Punctuality', 'Lying', 'Internet usage',
        'Gender', 'Left - right handed', 'Education', 'Only child',
        'Village - town', 'House - block of flats']:
    print(col)
    df_imputed = df_imputed.drop(col,1)

Smoking
Alcohol
Punctuality
Lying
Internet usage
Gender
Left - right handed
Education
Only child
Village - town
House - block of flats


In [188]:
from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if 'Fear of public speaking' in df_imputed:
    y = df_imputed['Fear of public speaking'].values # get the labels we want
    del df_imputed['Fear of public speaking'] # get rid of the class label
    X = df_imputed.values # use everything else to predict!

num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(
                         n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)


In [None]:
g = df_imputed.columns.to_series().groupby(df_imputed.dtypes).groups
{k.name: v for k, v in g.items()}
cols_not_intfloat = ['Smoking', 'Alcohol', 'Punctuality', 'Lying', 'Internet usage',
        'Gender', 'Left - right handed', 'Education', 'Only child',
        'Village - town', 'House - block of flats']

In [None]:
import numpy as np
class BinaryLogisticRegressionBase:
    # private:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        return 'Base Binary Logistic Regression Object, Not Trainable'
    
    # convenience, private:
    @staticmethod
    def _sigmoid(theta):
        return 1/(1+np.exp(-theta)) 
    
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term
    
    # public:
    def predict_proba(self,X,add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_) # return the probability y=1
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5) #return the actual prediction
    
    
        
blr = BinaryLogisticRegressionBase(0.1)
print(blr)


In [None]:
# inherit from base class
class BinaryLogisticRegression(BinaryLogisticRegressionBase):
    #private:
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
        
    def _get_gradient(self,X,y):
        # programming \sum_i (yi-g(xi))xi
        gradient = np.zeros(self.w_.shape) # set gradient to zero
        for (xi,yi) in zip(X,y):
            gradi = (yi - self.predict_proba(xi,add_bias=False))*xi # the actual update inside of sum
            gradient += gradi.reshape(self.w_.shape) # reshape to be column vector and add to gradient
        
        return gradient/float(len(y))
       
    # public:
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate 

            
blr = BinaryLogisticRegression(0.1)
print(blr)

In [None]:
class LogisticRegression:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.unique(y) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = [] # will fill this array with binary classifiers
        
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = y==yval # create a binary problem
            # train the binary classifier for this class
            blr = VectorBinaryLogisticRegression(self.eta,self.iters)
            blr.fit(X,y_binary)
            # add the trained classifier to the list
            self.classifiers_.append(blr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T
        
    def predict_proba(self,X):
        probs = []
        for blr in self.classifiers_:
            probs.append(blr.predict_proba(X)) # get probability for each classifier
        
        return np.hstack(probs) # make into single matrix
    
    def predict(self,X):
        return np.argmax(self.predict_proba(X),axis=1) # take argmax along row
    
lr = LogisticRegression(0.1,1500)
print(lr)

In [None]:
X

In [None]:
%%time
# can we do better? Maybe more iterations?
params = dict(eta=0.1,
              iterations=500)

blr = LogisticRegression(**params)
blr.fit(X,y)
# print(blr)
yhat = blr.predict(X)
print('Accuracy of: ',accuracy_score(y,yhat+1))

In [None]:
%%time
# now lets do some vectorized coding
import numpy as np
from scipy.special import expit

class VectorBinaryLogisticRegression(BinaryLogisticRegression):
    # inherit from our previous class to get same functionality
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    # but overwrite the gradient calculation
    def _get_gradient(self,X,y):
        ydiff = y-self.predict_proba(X,add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        return gradient.reshape(self.w_.shape)

# use same params as defined above
blr = VectorBinaryLogisticRegression(**params)
blr.fit(X,y)
print(blr.w_)
yhat = blr.predict(X)
print('Accuracy of: ',accuracy_score(y,yhat))

In [None]:
class RegularizedBinaryLogisticRegression(VectorBinaryLogisticRegression):
    # extend init functions
    def __init__(self, C=0.0, **kwds):        
        # need to add to the original initializer 
        self.C = C
        # but keep other keywords
        super().__init__(**kwds) # call parent initializer
        
        
    # extend previous class to change functionality
    def _get_gradient(self,X,y):
        # call get gradient from previous class
        gradient = super()._get_gradient(X,y)
        
        # add in regularization (to all except bias term)
        gradient[1:] += 2 * self.w_[1:] * self.C
        return gradient
        

In [None]:
# now redefine the Logistic Regression Function where needed
class RegularizedLogisticRegression(LogisticRegression):
    def __init__(self, C=0.0, **kwds):        
        # need to add to the original initializer 
        self.C = C
        # but keep other keywords
        super().__init__(**kwds) # call parent initializer
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.unique(y) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = [] # will fill this array with binary classifiers
        
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = y==yval # create a binary problem
            # train the binary classifier for this class
            blr = RegularizedBinaryLogisticRegression(eta=self.eta,
                                                      iterations=self.iters,
                                                      C=self.C)
            blr.fit(X,y_binary)
            # add the trained classifier to the list
            self.classifiers_.append(blr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T

In [None]:
# run logistic regression and vary some parameters
from sklearn import metrics as mt

# first we create a reusable logisitic regression object
#   here we can setup the object with different learning parameters and constants
lr_clf = RegularizedLogisticRegression(eta=0.1,iterations=2000) # get object

# now we can use the cv_object that we setup before to iterate through the 
#    different training and testing sets. Each time we will reuse the logisitic regression 
#    object, but it gets trained on different data each time we use it.

iter_num=0
# the indices are the rows used for training and testing in each iteration
for train_indices, test_indices in cv_object.split(X,y): 
    # I will create new variables here so that it is more obvious what 
    # the code is doing (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = X[train_indices]
    y_train = y[train_indices]

    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    # train the reusable logisitc regression model on the training data
    lr_clf.fit(X_train,y_train)  # train object
    y_hat = lr_clf.predict(X_test) # get test set precitions

    # now let's get the accuracy and confusion matrix for this iterations of training/testing
    acc = mt.accuracy_score(y_test,y_hat)
    conf = mt.confusion_matrix(y_test,y_hat)
    print("accuracy", acc )
    print("confusion matrix\n",conf)
    iter_num+=1
    
# Also note that every time you run the above code
#   it randomly creates a new training and testing set, 
#   so accuracy will be different each time

In [190]:
%%time
# from last time, our logistic regression algorithm is given by (including everything we previously had):
class BinaryLogisticRegression:
    def __init__(self, eta, iterations=20, C=0.001):
        self.eta = eta
        self.iters = iterations
        self.C = C
        # internally we will store the weights as self.w_ to keep with sklearn conventions
        
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
        
    # convenience, private:
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term
    
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    # vectorized gradient calculation with regularization using L2 Norm
    def _get_gradient(self,X,y):
        ydiff = y-self.predict_proba(X,add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += 2 * self.w_[1:] * self.C
        
        return gradient
    
    # public:
    def predict_proba(self,X,add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_) # return the probability y=1
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5) #return the actual prediction
    
    
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate 

blr = BinaryLogisticRegression(eta=0.1,iterations=500,C=0.001)

blr.fit(X,y)
print(blr)

yhat = blr.predict(X)
print('Accuracy of: ',accuracy_score(y,yhat+1))

Binary Logistic Regression Object with coefficients:
[[    90.77700297]
 [   455.69584136]
 [   310.12282053]
 [   296.38078132]
 [   214.42342046]
 [   201.30932795]
 [   284.12760523]
 [   263.20815517]
 [   333.14489982]
 [   364.99230948]
 [   228.86059827]
 [   241.9717399 ]
 [   274.68285292]
 [   264.37882356]
 [   260.24370786]
 [   300.44206565]
 [   277.56896059]
 [   265.78549451]
 [   220.90069261]
 [   206.92855259]
 [   442.66107212]
 [   260.16446656]
 [   325.49531991]
 [   430.09429155]
 [   335.33045895]
 [   298.48972215]
 [   297.55702358]
 [   364.28971172]
 [   368.65984636]
 [   343.53062955]
 [   193.1951201 ]
 [   335.3350492 ]
 [   301.30380176]
 [   295.91607141]
 [   236.5990918 ]
 [   226.2106571 ]
 [   194.13126135]
 [   401.21182102]
 [   294.74523909]
 [   246.5078163 ]
 [   257.8183756 ]
 [   205.99101786]
 [   310.03972669]
 [   295.91459597]
 [   363.12174831]
 [   239.86763523]
 [   198.50729774]
 [   246.35113701]
 [   250.71823881]
 [   211.5350996

In [None]:
%%time
from sklearn.linear_model import LogisticRegression as SKLogisticRegression

lr_sk = SKLogisticRegression() # all params default
lr_sk.fit(X,y)
print(np.hstack((lr_sk.intercept_[:,np.newaxis],lr_sk.coef_)))
yhat = lr_sk.predict(X)
print('Accuracy of: ',accuracy_score(y,yhat+1))

In [192]:
%%time
# and we can update this to use a line search along the gradient like this:
from scipy.optimize import minimize_scalar
import copy
class LineSearchLogisticRegression(BinaryLogisticRegression):
    
    # define custom line search for problem
    @staticmethod
    def line_search_function(eta,X,y,w,grad):
        wnew = w + grad*eta
        yhat = (1/(1+np.exp(-X @ wnew)))>0.5
        return np.sum((y-yhat)**2)+np.sum(wnew**2)
        
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            
            # do line search in gradient direction, using scipy function
            opts = {'maxiter':self.iters/20} # unclear exactly what this should be
            res = minimize_scalar(self.line_search_function, # objective function to optimize
                                  bounds=(self.eta/1000,self.eta*10), #bounds to optimize
                                  args=(Xb,y,self.w_,gradient), # additional argument for objective function
                                  method='bounded', # bounded optimization for speed
                                  options=opts) # set max iterations
            
            eta = res.x # get optimal learning rate
            self.w_ += gradient*eta # set new function values
                
            

lslr = LineSearchLogisticRegression(eta=0.1,iterations=110, C=0.001)

lslr.fit(X,y)

yhat = lslr.predict(X)
print(lslr)
print('Accuracy of: ',accuracy_score(y,yhat+1))    

Binary Logistic Regression Object with coefficients:
[[   11.17005256]
 [   53.64658572]
 [   36.51024463]
 [   34.89088082]
 [   25.24340221]
 [   23.69933782]
 [   33.44906939]
 [   30.98625141]
 [   39.21866253]
 [   42.96797028]
 [   26.94169838]
 [   28.4842537 ]
 [   32.33723963]
 [   31.12399529]
 [   30.63793742]
 [   35.36991036]
 [   32.67621979]
 [   31.29024367]
 [   26.005367  ]
 [   24.3605168 ]
 [   52.11235818]
 [   30.62814249]
 [   38.3181155 ]
 [   50.63308568]
 [   39.47594372]
 [   35.13970846]
 [   35.03147514]
 [   42.88522335]
 [   43.39928271]
 [   40.44295936]
 [   22.74555442]
 [   39.47829115]
 [   35.4725825 ]
 [   34.83760253]
 [   27.85698671]
 [   26.63130657]
 [   22.85554831]
 [   47.23234342]
 [   34.69977482]
 [   29.02171749]
 [   30.35022348]
 [   24.2498103 ]
 [   36.49847953]
 [   34.836848  ]
 [   42.74886279]
 [   28.23789924]
 [   23.37262578]
 [   29.00304983]
 [   29.5155582 ]
 [   24.90329026]
 [   41.02883892]
 [   26.13633406]
 [   25.070

In [196]:
%%time
class StochasticLogisticRegression(BinaryLogisticRegression):
    # stochastic gradient calculation 
    def _get_gradient(self,X,y):
        idx = int(np.random.rand()*len(y)) # grab random instance
        ydiff = y[idx]-self.predict_proba(X[idx],add_bias=False) # get y difference (now scalar)
        gradient = X[idx] * ydiff[:,np.newaxis] # make ydiff a column vector and multiply through
        
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += 2 * self.w_[1:] * self.C
        
        return gradient
    
    
slr = StochasticLogisticRegression(0.1,1000, C=0.001) # take a lot more steps!!

slr.fit(X,y)

yhat = slr.predict(X)
print(slr)
print('Accuracy of: ',accuracy_score(y,yhat+1))      

Binary Logistic Regression Object with coefficients:
[[   180.55      ]
 [   950.47628128]
 [   648.69707359]
 [   632.6019047 ]
 [   441.88531699]
 [   419.24571471]
 [   585.8666664 ]
 [   551.19829444]
 [   701.28440798]
 [   748.84377455]
 [   459.76109959]
 [   489.89457253]
 [   574.07760474]
 [   537.80302882]
 [   539.07795124]
 [   617.99387924]
 [   568.76047728]
 [   558.83407555]
 [   462.20116105]
 [   425.6008177 ]
 [   928.11766504]
 [   533.8148814 ]
 [   664.73165313]
 [   904.36447159]
 [   711.73914385]
 [   609.70005386]
 [   607.18910663]
 [   755.82029603]
 [   758.86380405]
 [   697.02290684]
 [   391.47757638]
 [   680.63779936]
 [   608.64734068]
 [   599.82924166]
 [   490.12747732]
 [   454.6999631 ]
 [   389.74341082]
 [   834.63900559]
 [   606.40247857]
 [   495.89074477]
 [   555.62370245]
 [   434.82821031]
 [   637.48914765]
 [   615.26622689]
 [   755.44574268]
 [   512.36148721]
 [   417.42540405]
 [   512.56003521]
 [   525.05493786]
 [   422.2097339

In [195]:
%%time
from numpy.linalg import pinv
class HessianBinaryLogisticRegression(BinaryLogisticRegression):
    # just overwrite gradient function
    def _get_gradient(self,X,y):
        g = self.predict_proba(X,add_bias=False).ravel() # get sigmoid value for all classes
        hessian = X.T @ np.diag(g*(1-g)) @ X + 2 * self.C # calculate the hessian

        ydiff = y-g # get y difference
        gradient = np.sum(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += 2 * self.w_[1:] * self.C
        
        return pinv(hessian) @ gradient
       
hlr = HessianBinaryLogisticRegression(eta=0.1,iterations=20,C=0.1) # note that we need only a few iterations here

hlr.fit(X,y)
yhat = hlr.predict(X)
print(hlr)
print('Accuracy of: ',accuracy_score(y,yhat+1))

Binary Logistic Regression Object with coefficients:
[[  2.45027462e+170]
 [ -1.31939092e+172]
 [  2.62226614e+171]
 [  2.69990104e+171]
 [ -8.10370053e+170]
 [ -1.25622611e+171]
 [  2.52865447e+176]
 [ -2.48917192e+175]
 [ -1.88031351e+175]
 [ -3.20587632e+175]
 [ -1.06333001e+176]
 [ -6.19847138e+174]
 [ -3.08920150e+174]
 [ -1.23578703e+174]
 [  8.72544465e+173]
 [ -2.69228095e+173]
 [ -5.04004842e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [ -4.94630900e+173]
 [

In [198]:
%%time
# for this, we won't perform our own BFGS implementation 
# (it takes a good deal of code and understanding of the algorithm)
# luckily for us, scipy has its own BFGS implementation:
from scipy.optimize import fmin_bfgs
class BFGSBinaryLogisticRegression(BinaryLogisticRegression):
    
    @staticmethod
    def objective_function(w,X,y,C):
        g = expit(X @ w)
        return -np.sum(np.log(g[y==1]))-np.sum(np.log(1-g[y==0])) + C*sum(w**2) #-np.sum(y*np.log(g)+(1-y)*np.log(1-g))

    @staticmethod
    def objective_gradient(w,X,y,C):
        g = expit(X @ w)
        ydiff = y-g # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(w.shape)
        gradient[1:] += 2 * w[1:] * C
        return -gradient
    
    # just overwrite fit function
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = fmin_bfgs(self.objective_function, # what to optimize
                            np.zeros((num_features,1)), # starting point
                            fprime=self.objective_gradient, # gradient function
                            args=(Xb,y,self.C), # extra args for gradient and objective function
                            gtol=1e-03, # stopping criteria for gradient, |v_k|
                            maxiter=self.iters, # stopping criteria iterations
                            disp=False)
        
        self.w_ = self.w_.reshape((num_features,1))
            
bfgslr = BFGSBinaryLogisticRegression(_,2) # note that we need only a few iterations here

bfgslr.fit(X,y)
yhat = bfgslr.predict(X)
print(bfgslr)
print('Accuracy of: ',accuracy_score(y,yhat+1))

Binary Logistic Regression Object with coefficients:
[[ 0.0053308 ]
 [ 0.02543407]
 [ 0.01737123]
 [ 0.01652036]
 [ 0.0119943 ]
 [ 0.01125106]
 [ 0.0158728 ]
 [ 0.01470071]
 [ 0.0185587 ]
 [ 0.02034076]
 [ 0.0127307 ]
 [ 0.01341243]
 [ 0.01534826]
 [ 0.01476222]
 [ 0.01457085]
 [ 0.01679544]
 [ 0.01546786]
 [ 0.01487498]
 [ 0.01232577]
 [ 0.01154665]
 [ 0.02472159]
 [ 0.01454181]
 [ 0.01813156]
 [ 0.0240279 ]
 [ 0.01868001]
 [ 0.01666729]
 [ 0.01669805]
 [ 0.02029976]
 [ 0.02052187]
 [ 0.01922505]
 [ 0.01087517]
 [ 0.01877569]
 [ 0.01690821]
 [ 0.01656991]
 [ 0.01338851]
 [ 0.01266578]
 [ 0.01091618]
 [ 0.02238595]
 [ 0.01650498]
 [ 0.01384812]
 [ 0.01431969]
 [ 0.0114766 ]
 [ 0.01726188]
 [ 0.01653915]
 [ 0.02029463]
 [ 0.0133663 ]
 [ 0.01126132]
 [ 0.01382762]
 [ 0.01398652]
 [ 0.01182857]
 [ 0.019413  ]
 [ 0.01248125]
 [ 0.01196867]
 [ 0.00996621]
 [ 0.01791798]
 [ 0.01666388]
 [ 0.01015415]
 [ 0.01263844]
 [ 0.0174242 ]
 [ 0.01723113]
 [ 0.01592406]
 [ 0.02409112]
 [ 0.01498263]
 [

In [200]:
class MultiClassLogisticRegression:
    def __init__(self, eta, iterations=20, C=0.0001):
        self.eta = eta
        self.iters = iterations
        self.C = C
        self.classifiers_ = []
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.sort(np.unique(y)) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = []
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = y==yval # create a binary problem
            # train the binary classifier for this class
            hblr = BFGSBinaryLogisticRegression(self.eta,self.iters,self.C)
            hblr.fit(X,y_binary)
            #print(accuracy(y_binary,hblr.predict(X)))
            # add the trained classifier to the list
            self.classifiers_.append(hblr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T
        
    def predict_proba(self,X):
        probs = []
        for hblr in self.classifiers_:
            probs.append(hblr.predict_proba(X).reshape((len(X),1))) # get probability for each classifier
        
        return np.hstack(probs) # make into single matrix
    
    def predict(self,X):
        return np.argmax(self.predict_proba(X),axis=1) # take argmax along row
    



In [203]:
%%time
lr = MultiClassLogisticRegression(eta=0.1,iterations=10,C=0.0001)
lr.fit(X,y)
print(lr)

yhat = lr.predict(X)
print('Accuracy of: ',accuracy_score(y,yhat+1))

MultiClass Logistic Regression Object with coefficients:
[[ -1.68977348e-02  -6.54692943e-02   1.85648173e-01  -6.94735652e-03
   -4.13950336e-02   1.15007357e-02   4.66314642e-02  -1.01079006e-02
    1.68426711e-02   2.27031224e-01  -1.07720407e-01  -2.56570631e-01
   -2.41356088e-01   1.68654003e-01  -1.94460663e-01   2.24846726e-02
   -7.94882119e-02  -4.28154610e-02  -1.93635604e-01  -8.12881598e-02
   -2.29203578e-01   2.34170647e-02  -4.19517079e-01   1.93012442e-01
    7.78681739e-02  -1.99817865e-01   4.15530303e-02  -9.91375127e-02
   -1.18865044e-01  -1.20559615e-01   8.22710360e-02   2.36236127e-02
    6.07405829e-02   1.15526457e-01  -1.19166210e-01   2.35682099e-02
    1.17088044e-02  -1.76163282e-02   3.78084858e-01  -5.12029460e-02
   -1.49398345e-01  -1.01647155e-01   7.33428471e-04  -6.87444845e-02
   -1.83340989e-01  -1.40875967e-01   2.82762920e-01  -3.39776023e-02
    1.18769818e-01   1.44380840e-01  -2.64865201e-01  -1.78614749e-01
    1.80375985e-01  -3.53049625e-

