In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline 
from sklearn.metrics import accuracy_score

df_imputed = pd.read_csv('responses.csv', sep=",")

In [146]:
df_imputed = df_imputed.dropna()

In [147]:
for col in ['Smoking', 'Alcohol', 'Punctuality', 'Lying', 'Internet usage',
        'Gender', 'Left - right handed', 'Education', 'Only child',
        'Village - town', 'House - block of flats']:
    print(col)
    df_imputed = df_imputed.drop(col,1)

Smoking
Alcohol
Punctuality
Lying
Internet usage
Gender
Left - right handed
Education
Only child
Village - town
House - block of flats


In [148]:
from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if 'Fear of public speaking' in df_imputed:
    y = df_imputed['Fear of public speaking'].values # get the labels we want
    del df_imputed['Fear of public speaking'] # get rid of the class label
#     norm_features = ['Music' ]
#     df_imputed[norm_features] = (df_imputed[norm_features]-df_imputed[norm_features].mean()) / df_imputed[norm_features].std()
    X = df_imputed.values # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into 
#    training and testing splits
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(
                         n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)


In [149]:
g = df_imputed.columns.to_series().groupby(df_imputed.dtypes).groups
{k.name: v for k, v in g.items()}
cols_not_intfloat = ['Smoking', 'Alcohol', 'Punctuality', 'Lying', 'Internet usage',
        'Gender', 'Left - right handed', 'Education', 'Only child',
        'Village - town', 'House - block of flats']

In [150]:
import numpy as np
class BinaryLogisticRegressionBase:
    # private:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        return 'Base Binary Logistic Regression Object, Not Trainable'
    
    # convenience, private:
    @staticmethod
    def _sigmoid(theta):
        return 1/(1+np.exp(-theta)) 
    
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term
    
    # public:
    def predict_proba(self,X,add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_) # return the probability y=1
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5) #return the actual prediction
    
    
        
blr = BinaryLogisticRegressionBase(0.1)
print(blr)


Base Binary Logistic Regression Object, Not Trainable


In [151]:
# inherit from base class
class BinaryLogisticRegression(BinaryLogisticRegressionBase):
    #private:
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
        
    def _get_gradient(self,X,y):
        # programming \sum_i (yi-g(xi))xi
        gradient = np.zeros(self.w_.shape) # set gradient to zero
        for (xi,yi) in zip(X,y):
            gradi = (yi - self.predict_proba(xi,add_bias=False))*xi # the actual update inside of sum
            gradient += gradi.reshape(self.w_.shape) # reshape to be column vector and add to gradient
        
        return gradient/float(len(y))
       
    # public:
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate 

            
blr = BinaryLogisticRegression(0.1)
print(blr)

Untrained Binary Logistic Regression Object


In [152]:
class LogisticRegression:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.unique(y) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = [] # will fill this array with binary classifiers
        
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = y==yval # create a binary problem
            # train the binary classifier for this class
            blr = VectorBinaryLogisticRegression(self.eta,self.iters)
            blr.fit(X,y_binary)
            # add the trained classifier to the list
            self.classifiers_.append(blr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T
        
    def predict_proba(self,X):
        probs = []
        for blr in self.classifiers_:
            probs.append(blr.predict_proba(X)) # get probability for each classifier
        
        return np.hstack(probs) # make into single matrix
    
    def predict(self,X):
        return np.argmax(self.predict_proba(X),axis=1) # take argmax along row
    
lr = LogisticRegression(0.1,1500)
print(lr)

Untrained MultiClass Logistic Regression Object


In [153]:
X

array([[   5.,    3.,    2., ...,  163.,   48.,    1.],
       [   4.,    4.,    2., ...,  163.,   58.,    2.],
       [   5.,    5.,    2., ...,  176.,   67.,    2.],
       ..., 
       [   4.,    3.,    1., ...,  173.,   75.,    0.],
       [   5.,    3.,    3., ...,  173.,   58.,    1.],
       [   5.,    5.,    4., ...,  185.,   72.,    1.]])

In [165]:
%%time
# can we do better? Maybe more iterations?
params = dict(eta=0.1,
              iterations=500)

blr = LogisticRegression(**params)
blr.fit(X,y)
# print(blr)
yhat = blr.predict(X)
print('Accuracy of: ',accuracy_score(y,yhat+3))

Accuracy of:  0.283382789318
CPU times: user 1.76 s, sys: 92 ms, total: 1.85 s
Wall time: 487 ms


In [166]:
%%time
# now lets do some vectorized coding
import numpy as np
from scipy.special import expit

class VectorBinaryLogisticRegression(BinaryLogisticRegression):
    # inherit from our previous class to get same functionality
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    # but overwrite the gradient calculation
    def _get_gradient(self,X,y):
        ydiff = y-self.predict_proba(X,add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        return gradient.reshape(self.w_.shape)

# use same params as defined above
blr = VectorBinaryLogisticRegression(**params)
blr.fit(X,y)
print(blr.w_)
yhat = blr.predict(X)
print('Accuracy of: ',accuracy_score(y,yhat))

[[    90.77700297]
 [   433.32418398]
 [   294.8977003 ]
 [   281.83048961]
 [   203.89658754]
 [   191.42633531]
 [   270.17878338]
 [   250.28635015]
 [   316.78976261]
 [   347.07366469]
 [   217.62514837]
 [   230.09272997]
 [   261.1977003 ]
 [   251.3995549 ]
 [   247.46735905]
 [   285.69228487]
 [   263.94221068]
 [   252.73709199]
 [   210.05593472]
 [   196.76973294]
 [   420.92930267]
 [   247.39206231]
 [   309.515727  ]
 [   408.97945104]
 [   318.86802671]
 [   283.83583086]
 [   282.94873887]
 [   346.4055638 ]
 [   350.56120178]
 [   326.66543027]
 [   183.71031157]
 [   318.87218101]
 [   286.5115727 ]
 [   281.3884273 ]
 [   224.98323442]
 [   215.10511869]
 [   184.60051929]
 [   381.51498516]
 [   280.27507418]
 [   234.40571217]
 [   245.16135015]
 [   195.87826409]
 [   294.81891691]
 [   281.38709199]
 [   345.29480712]
 [   228.09176558]
 [   188.76149852]
 [   234.25675074]
 [   238.40964392]
 [   201.15007418]
 [   331.41186944]
 [   211.09918398]
 [   202.488

In [156]:
class RegularizedBinaryLogisticRegression(VectorBinaryLogisticRegression):
    # extend init functions
    def __init__(self, C=0.0, **kwds):        
        # need to add to the original initializer 
        self.C = C
        # but keep other keywords
        super().__init__(**kwds) # call parent initializer
        
        
    # extend previous class to change functionality
    def _get_gradient(self,X,y):
        # call get gradient from previous class
        gradient = super()._get_gradient(X,y)
        
        # add in regularization (to all except bias term)
        gradient[1:] += 2 * self.w_[1:] * self.C
        return gradient
        

In [157]:
# now redefine the Logistic Regression Function where needed
class RegularizedLogisticRegression(LogisticRegression):
    def __init__(self, C=0.0, **kwds):        
        # need to add to the original initializer 
        self.C = C
        # but keep other keywords
        super().__init__(**kwds) # call parent initializer
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.unique(y) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = [] # will fill this array with binary classifiers
        
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = y==yval # create a binary problem
            # train the binary classifier for this class
            blr = RegularizedBinaryLogisticRegression(eta=self.eta,
                                                      iterations=self.iters,
                                                      C=self.C)
            blr.fit(X,y_binary)
            # add the trained classifier to the list
            self.classifiers_.append(blr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T

In [167]:
# run logistic regression and vary some parameters
from sklearn import metrics as mt

# first we create a reusable logisitic regression object
#   here we can setup the object with different learning parameters and constants
lr_clf = RegularizedLogisticRegression(eta=0.1,iterations=2000) # get object

# now we can use the cv_object that we setup before to iterate through the 
#    different training and testing sets. Each time we will reuse the logisitic regression 
#    object, but it gets trained on different data each time we use it.

iter_num=0
# the indices are the rows used for training and testing in each iteration
for train_indices, test_indices in cv_object.split(X,y): 
    # I will create new variables here so that it is more obvious what 
    # the code is doing (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = X[train_indices]
    y_train = y[train_indices]

    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    # train the reusable logisitc regression model on the training data
    lr_clf.fit(X_train,y_train)  # train object
    y_hat = lr_clf.predict(X_test) # get test set precitions

    # now let's get the accuracy and confusion matrix for this iterations of training/testing
    acc = mt.accuracy_score(y_test,y_hat)
    conf = mt.confusion_matrix(y_test,y_hat)
    print("accuracy", acc )
    print("confusion matrix\n",conf)
    iter_num+=1
    
# Also note that every time you run the above code
#   it randomly creates a new training and testing set, 
#   so accuracy will be different each time

accuracy 0.192592592593
confusion matrix
 [[26  0  0  0  0]
 [36  0  0  0  0]
 [33  0  0  0  0]
 [25  0  0  0  0]
 [14  1  0  0  0]]
accuracy 0.162962962963
confusion matrix
 [[ 0  0  0  0  0  0]
 [13  0 12  0  0  0]
 [ 9  0 22  0  0  0]
 [ 9  0 34  0  0  0]
 [ 6  0 17  0  0  0]
 [ 2  0 11  0  0  0]]
accuracy 0.148148148148
confusion matrix
 [[ 0  0  0  0  0  0]
 [21  0  0  0  2  0]
 [18  0  0  0 10  0]
 [11  0  0  0 29  0]
 [ 8  0  0  0 20  0]
 [ 4  0  0  0 12  0]]
