# Logistic Regression Extension

In [2]:
from sklearn.datasets import load_digits # The MNIST data set is in scikit learn data set
from sklearn.preprocessing import StandardScaler  # It is important in neural networks to scale the date
from sklearn.model_selection import train_test_split  # The standard - train/test to prevent overfitting and choose hyperparameters
from sklearn.metrics import accuracy_score # 
import numpy as np
import numpy.random as r # We will randomly initialize our weights
import matplotlib.pyplot as plt 

# load dataset
digits=load_digits()
X = digits.data
y = digits.target

# scalar dataset
X_scale = StandardScaler()
X = X_scale.fit_transform(digits.data)

#split original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)


# Create binary dataset

In [13]:
# rule label:0-4 count as 0, label:5-9 count as 1
y_binary = np.zeros(len(y))
for i in range(len(y)):
    if  y[i] == 0 or y[i] == 1 or y[i] == 2 or y[i] == 3 or y[i] == 4:
        y_binary[i] = 0
    else:
        y_binary[i] =1
y= np.ravel(y)

#Split the binary data set
X_binary_train, X_binary_test, y_binary_train, y_binary_test = train_test_split(X, y_binary, test_size=0.4)

# Create data set for muti classification of logistic regression

In [14]:
new_y_train_list =[] # 0-9

for j in range(10):
    new_y = np.zeros(len(y_train))
    for i in range(len(y_train)):
        if  y_train[i] == j:
            new_y[i] = 1
        else:
            new_y[i] = 0
    new_y_train_list.append(new_y)
new_y_train_list = np.mat(new_y_train_list)
print("The original train set is X:{}, y:{}".format(X_train.shape,y_train.shape))
print("We create 9 trainning data set according to class 0-9.")
print("The new train set is New y List: {}".format(new_y_train_list.shape))
print("The original y is: {}".format(y_train))
print("The 0 class: y:{}".format(new_y_train_list[0]))
print("The 1 class: y:{}".format(new_y_train_list[1]))
print("The 2 class: y:{}".format(new_y_train_list[2]))
print("The 3 class: y:{}".format(new_y_train_list[3]))
print("The 4 class: y:{}".format(new_y_train_list[4]))
print("The 5 class: y:{}".format(new_y_train_list[5]))
print("The 6 class: y:{}".format(new_y_train_list[6]))
print("The 7 class: y:{}".format(new_y_train_list[7]))
print("The 8 class: y:{}".format(new_y_train_list[8]))
print("The 9 class: y:{}".format(new_y_train_list[9]))

The original train set is X:(1078, 64), y:(1078,)
We create 9 trainning data set according to class 0-9.
The new train set is New y List: (10, 1078)
The original y is: [9 4 2 ... 1 2 3]
The 0 class: y:[[0. 0. 0. ... 0. 0. 0.]]
The 1 class: y:[[0. 0. 0. ... 1. 0. 0.]]
The 2 class: y:[[0. 0. 1. ... 0. 1. 0.]]
The 3 class: y:[[0. 0. 0. ... 0. 0. 1.]]
The 4 class: y:[[0. 1. 0. ... 0. 0. 0.]]
The 5 class: y:[[0. 0. 0. ... 0. 0. 0.]]
The 6 class: y:[[0. 0. 0. ... 0. 0. 0.]]
The 7 class: y:[[0. 0. 0. ... 0. 0. 0.]]
The 8 class: y:[[0. 0. 0. ... 0. 0. 0.]]
The 9 class: y:[[1. 0. 0. ... 0. 0. 0.]]


# Validation in Sklearn

In [15]:
# experiment in Sklearn:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(penalty ='none',class_weight= 'balanced', random_state=0, solver='newton-cg', multi_class='ovr')
logreg.fit(X_train, y_train)
prepro = logreg.predict_proba(X_test)
acc = logreg.score(X_test,y_test)
print("None: ")
print (acc)
logreg = LogisticRegression(penalty ='l2',class_weight= 'balanced', random_state=0, solver='newton-cg', multi_class='ovr')
logreg.fit(X_train, y_train)
prepro = logreg.predict_proba(X_test)
acc = logreg.score(X_test,y_test)
print ("L2:")
print (acc)

None: 
0.9360222531293463
L2:
0.9499304589707928


# Our implementation of logistic regression

In [16]:

# Some params
learning_rate = 0.1
num_iters = 3000 # The number of iteratins to run the gradient ascent algorithm

# logistc regression
def sigmoid(z):
    return 1/(1+np.exp(-z))

# Initialize parameters w
w = np.zeros((X_train.shape[1], 1))

def hypothesis(X , w):
    return 1/(1+np.exp(-np.dot(X,w)))
yhat = hypothesis(X_train, w)

def log_likelihood(X ,y , w ):
    hx = hypothesis(X , w)
    log_likelihood =0
    for i in range(X.shape[0]):
        if y[i] == 0:
            if hx[i] ==1:
                continue
            log_likelihood = log_likelihood + np.log(1-hx[i])
        else: 
            if hx[i] ==0:
                continue
            log_likelihood = log_likelihood + np.log(hx[i])
    return log_likelihood

def Logistic_Regresion_Gradient_Ascent(X, y, learning_rate, num_iters):
    #initialize
    log_likelihood_values = []
    w = np.zeros((X.shape[1], 1))
    N = X.shape[0] 
    #do iteration
    for i in range(num_iters):
        gradient = np.dot(X.transpose(),(y-hypothesis(X,w)))
        w = w + (learning_rate/N)*gradient
        if (i % 100) == 0:
            log_likelihood_values.append(log_likelihood(X,y,w))  
    return w, log_likelihood_values

def ridge_log_likelihood(X , y, w, C = 0.65):    
    hx = hypothesis(X , w)
    log_likelihood = 0
    for i in range(X.shape[0]):
        if y[i] == 0:
            if hx[i] ==1:
                continue     
            log_likelihood = log_likelihood + np.log(1-hx[i])
        else: 
            if hx[i] ==0:
                continue
            log_likelihood = log_likelihood + np.log(hx[i])
    reg_term = C*np.dot(w.T,w)
    log_likelihood = log_likelihood  - reg_term
    return log_likelihood

def Ridge_Regresion_Gradient_Ascent(X, y, learning_rate, num_iters, C = 0.65):
    #initialize
    ridge_log_likelihood_values = []
    w = np.zeros((X.shape[1], 1))
    N = X.shape[0] 
    #do iteration
    for i in range(num_iters):
        gradient = np.dot(X.transpose(),(y - hypothesis(X,w)))
        w = w + (learning_rate/N)*gradient - learning_rate*C*w 
        if (i % 100) == 0:
            ridge_log_likelihood_values.append(ridge_log_likelihood(X, y, w, C))  
    return w, ridge_log_likelihood_values



# Build multiclass model

In [17]:
# train n model (0,9)
w =[]
log_likelihood_values =[]
for i in range(10):
    w_new, log_likelihood_values_new = Ridge_Regresion_Gradient_Ascent(X_train, new_y_train_list[i].transpose(), learning_rate, num_iters, 0.65)
    w.append(w_new)
    log_likelihood_values.append(log_likelihood_values_new)

# Predict with ridge multiclass regression

In [18]:
#pretdict
result = []
for i in range(len(y_test)):
    hx_every_example = []
    for j in range(len(w)):
        hx = hypothesis(X_test[i],w[j])
        hx_every_example.append(np.linalg.det(hx))
    predict_class =np.argmax(hx_every_example)
    result.append(predict_class)
    
#caculate precision
right = 0
for i in range(len(y_test)):
    if result[i] == y_test[i]:
        right = right + 1
                    
print("The precision is: {}".format(right/len(y_test)))



The precision is: 0.8803894297635605


In [19]:
# to find best C
for C in [0.6, 0.65, 0.7, 0.75, 0.8]:
    # train n model (0,9)
    w =[]
    log_likelihood_values =[]
    for i in range(10):
        w_new, log_likelihood_values_new = Ridge_Regresion_Gradient_Ascent(X_train, new_y_train_list[i].transpose(), learning_rate, num_iters, C)
        w.append(w_new)
        log_likelihood_values.append(log_likelihood_values_new)
    #pretdict
    result = []
    for i in range(len(y_test)):
        hx_every_example = []
        for j in range(len(w)):
            hx = hypothesis(X_test[i],w[j])
            hx_every_example.append(np.linalg.det(hx))
        predict_class =np.argmax(hx_every_example)
        result.append(predict_class)
    
    #caculate precision
    right = 0
    for i in range(len(y_test)):
        if result[i] == y_test[i]:
            right = right + 1
                    
    print("In ridge regression,C = {}, the precision is: {}".format(C, right/len(y_test)))

In ridge regression,C = 0.6, the precision is: 0.8803894297635605
In ridge regression,C = 0.65, the precision is: 0.8803894297635605
In ridge regression,C = 0.7, the precision is: 0.8803894297635605
In ridge regression,C = 0.75, the precision is: 0.8803894297635605
In ridge regression,C = 0.8, the precision is: 0.8803894297635605


# Predict with logistic multiclass regression

In [20]:
# the accuracy of original one
# train n model (0,9)
w =[]
#log_likelihood_values =[]
for i in range(10):
    w_new, log_likelihood_values_new = Logistic_Regresion_Gradient_Ascent(X_train, new_y_train_list[i].transpose(), learning_rate, num_iters)
    w.append(w_new)

#pretdict
result = []
for i in range(len(y_test)):
    hx_every_example = []
    for j in range(len(w)):
        hx = hypothesis(X_test[i],w[j])
        hx_every_example.append(np.linalg.det(hx))
    predict_class =np.argmax(hx_every_example)
    result.append(predict_class)
    
#caculate precision
right = 0
for i in range(len(y_test)):
    if result[i] == y_test[i]:
        right = right + 1
                    
print("The logistic regression precision is: {}".format(right/len(y_test)))



The logistic regression precision is: 0.8706536856745479


# Test on another dataset

In [3]:
#test on another dataset
import pandas as pd

data = pd.read_csv("train.csv")

data = data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
data = data.dropna()
data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])
data_conti = pd.DataFrame(data, columns=['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], index=data.index)
data = data_conti.join(data_dummy)

#split data into X and y
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# standard
stdsc = StandardScaler()
X_train_conti_std = stdsc.fit_transform(X_train[['Age', 'SibSp', 'Parch', 'Fare']])
X_test_conti_std = stdsc.fit_transform(X_test[['Age', 'SibSp', 'Parch', 'Fare']])

# change ndarray into dataframe
X_train_conti_std = pd.DataFrame(data=X_train_conti_std, columns=['Age', 'SibSp', 'Parch', 'Fare'], index=X_train.index)
X_test_conti_std = pd.DataFrame(data=X_test_conti_std, columns=['Age', 'SibSp', 'Parch', 'Fare'], index=X_test.index)

# Pclass is an ordered categorical variable
X_train_cat = X_train[['Pclass']]
X_test_cat = X_test[['Pclass']]
#   disordered encoded categorical variable
X_train_dummy = X_train[['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
X_test_dummy = X_test[['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]

# linked them to the dataframe
X_train_set = [X_train_cat, X_train_conti_std, X_train_dummy]
X_test_set = [X_test_cat, X_test_conti_std, X_test_dummy]
X_train = pd.concat(X_train_set, axis=1)
X_test = pd.concat(X_test_set, axis=1)

#change back into the ndarray
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

y_train = np.mat(y_train)


In [29]:
# Some params
learning_rate = 0.1
num_iters = 3000 # The number of iteratins to run the gradient ascent algorithm

# train logistic regression
w1, log_likelihood_values1 = Logistic_Regresion_Gradient_Ascent(X_train, y_train.transpose(), learning_rate, num_iters)
#pretdict
result = []
for i in range(len(y_test)):
    hx = hypothesis(X_test[i],w1)
    if hx > 0.5:
        result.append(1)
    else:
        result.append(0)
    
#caculate precision
right = 0
for i in range(len(y_test)):
    if result[i] == y_test[i]:
        right = right + 1
                    
print("The logistic regression precision is: {}".format(right/len(y_test)))



# train ridge regression
for C in [0.04, 0.05,0.06, 0.03]:
    w2, log_likelihood_values2= Ridge_Regresion_Gradient_Ascent(X_train, y_train.transpose(), learning_rate, num_iters, C)
    #pretdict
    result = []
    for i in range(len(y_test)):
        hx = hypothesis(X_test[i],w2)
        if hx > 0.5:
            result.append(1)
        else:
            result.append(0)

    #caculate precision
    right = 0
    for i in range(len(y_test)):
        if result[i] == y_test[i]:
            right = right + 1

    print("C = {}, The Ridge regression precision is: {}".format(C, right/len(y_test)))


The logistic regression precision is: 0.7616822429906542
C = 0.04, The Ridge regression precision is: 0.7710280373831776
C = 0.05, The Ridge regression precision is: 0.7757009345794392
C = 0.06, The Ridge regression precision is: 0.7710280373831776
C = 0.03, The Ridge regression precision is: 0.7710280373831776
