## Logistic Regression

This notebook demonstrates classification using a logistic regression algorithm.


In [90]:
%matplotlib widget
import sklearn
from sklearn import datasets 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append('/Users/ondrea/MLandstats/OStats/')
from ostats import ML

In [91]:
iris = datasets.load_iris()
df = pd.DataFrame(iris.data)
#print(iris.DESCR)

In [92]:
iris['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [93]:
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df['class'] = (iris['target'] == 2).astype(int)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1
146,6.3,2.5,5.0,1.9,1
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1


In [94]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [95]:
X_feature = df['petal_width'].to_numpy() #petal width
y_label = df['class'].to_numpy()

In [96]:
ifig=1;plt.close(ifig);plt.figure(ifig)
plt.scatter(X_feature,y_label)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [77]:
#fake data for testing
XF =  np.linspace(0,5,30)
yf = np.concatenate((np.zeros(20), np.ones(10))) 

In [78]:
ifig=2;plt.close(ifig);plt.figure(ifig)
plt.scatter(XF,yf)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [79]:
def Log_Regression_V1(x, y, gamma = 0.01, epsilon =None, theta_init = None):
    '''
    X = list or array. X values of data I am fitting to.
    y = list or array. Y values of the data I am fitting to.
    theta_init = initial guesses for theta...list/vector
    '''
    
    #m, n = X.shape
    m = len(y)
    n = 1
    ones_list = np.ones(m)
    X_matrix = np.vstack((ones_list,x)).T
    if theta_init is not None:  
        theta = np.array(theta_init)
    else:
        theta = np.ones(n+1)
    #now do it
    #print(X_matrix)
    k = 1
    stop_condition = False
   # while not stop_condition:
    for i in range(0,5000):
        ydiffX = np.zeros(len(theta))
        for j in range(len(theta)):
            ydiff = np.zeros(len(y))
            for m in range(len(y)):
                x = X_matrix[m,:]

                thetax = np.dot(theta,x)

                sigmoid = 1/(1+np.exp(-thetax))
                ydiff[m] = y[m] - sigmoid

            ydiffX[j] = np.sum(ydiff * X_matrix[:,j])
            
        theta = theta + gamma*ydiffX
#print(theta, ydiff)
#    print(theta.shape,ydiff.shape)
    return(theta)

In [80]:
def Log_Regression(x, y, gamma, epsilon = 0.0001, theta_init = None):
    '''
    gamma = Step size: Currently a constant. Later I will add an adaptive method to calculate it
    X = list or array. X values of data I am fitting to.
    y = list or array. Y values of the data I am fitting to.
    n = number of variables 
    theta_init = initial guesses for theta...list/vector
    '''
    m = len(y) #number of points
    ones_list = np.ones(m)
    X_matrix = np.vstack((ones_list,x)).T
    X_matrix_T = X_matrix.T #transpose of X
    n = len(X_matrix_T)

    if theta_init is not None:  
        theta = theta_init
    else:
        theta = np.zeros(n)
    #now do it
    k = 1
    J = []
    store_theta = []
    gamma_X_matrix_T = gamma*X_matrix_T
    stop_condition = False
    while not stop_condition:
        store_theta.append(theta)
        thetaTx = np.dot(theta,X_matrix.T)   
        sigmoid = 1/(1 + np.exp(-thetaTx))
        yminsig = y - sigmoid
        gamma_XT_times_yminsig = np.matmul(gamma_X_matrix_T,yminsig)
        theta = theta + gamma_XT_times_yminsig
     #   X_matrix_times_theta_miny = np.matmul(X_matrix,theta) - y
     #   theta = theta - np.dot(gamma_X_matrix_T,X_matrix_times_theta_miny)/m
        stop_condition = (np.abs(theta[1] - store_theta[-1][1])/store_theta[-1][1] < epsilon) and (np.abs(theta[0] -\
        store_theta[-1][0])/store_theta[-1][0] < epsilon)

        k +=1
    
    return(store_theta[-1])

In [81]:
ones_list = np.ones(len(XF))
XF2 = np.vstack((ones_list,XF)).T
#XF2

In [82]:
th = Log_Regression_V1(x=XF, y=yf, gamma = 0.1,theta_init = [-50,100])

In [83]:
th1 = Log_Regression(XF, yf, gamma=0.1, epsilon = 0.001, theta_init= [-50,100])

In [84]:
th1

array([-83.05795078,  24.81425082])

In [85]:
x_input = np.linspace(0, 5, 100)
y_output = 1.0 / (1.0 + np.exp(-(th[0] + th[1] * x_input)))
y_output_1 = 1.0 / (1.0 + np.exp(-(th1[0] + th1[1] * x_input)))


In [86]:
ifig=1;plt.close(ifig);plt.figure(ifig)

plt.plot(x_input,y_output,label='Pred.')
plt.scatter(XF,yf, label='index')
plt.scatter(XF,yf, label='matrix')
plt.legend(loc = 'upper left')
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Now I will try it on the flower data. First just using the petal width to determine if it is Virginica (y =1)

In [97]:
th2 = Log_Regression(X_feature, y_label, gamma=0.1, epsilon = 1e-6, theta_init= [1,10])

In [98]:
#create the sigmoid to plot
x_in = np.linspace(0, 2.5, 100)
y_out= 1.0 / (1.0 + np.exp(-(th2[0] + th2[1] * x_input)))

In [99]:
ifig=2;plt.close(ifig);plt.figure(ifig)

plt.plot(x_in,y_out,label='Pred.')
plt.scatter(X_feature,y_label, label='data')
plt.legend(loc = 'upper left')
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Add more features

In [164]:
X_features = df.drop(columns=['class', 'sepal_length', 'sepal_width'], axis=1)

In [165]:
X_features

Unnamed: 0,petal_length,petal_width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


In [166]:
X_features = X_features.to_numpy()

In [217]:
th3 = Log_Regression(X_features.T, y_label, gamma=0.1, epsilon =1e-6, theta_init= [-200,30,30])

In [218]:
#create the sigmoid to plot
x_in_multi = np.linspace(0, 2.5, 100)
y_out_multi = 1.0 / (1.0 + np.exp(-(th3[0] + th3[1] * x_in_multi  + th3[2]* x_in_multi )))# + th3[3] * x_in_multi + th3[4] * x_in_multi)) )

In [222]:
print(th3)

[-198.12248074   28.71783579   34.34634112]


In [220]:
ifig=2;plt.close(ifig);plt.figure(ifig)

plt.plot(x_in_multi,y_out_multi,label='Pred.')
plt.scatter(X_features.T[1], y_label, label='data')
plt.legend(loc = 'upper left')
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …