# Testing the logistic regression using various real-world data sets.

## Author: Bojian Xu, bojianxu@ewu.edu

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logistic_regression as logic
import sys
sys.path.append('..')
from code_misc.utils import MyUtils

In [2]:
# Name of the data set; the same string is used as the directory prefix
# when the CSV files are read.
data_set = 'ionosphere'

# Quick check of how a path below that directory is spelled.
print('/'.join((data_set, 'hello')))

ionosphere/hello


In [3]:
# READ in data: four header-less CSV files that live under the data-set
# directory (features and labels, for the training and the test split).
data_dir = data_set + '/'
df_X_train = pd.read_csv(data_dir + 'X_train.csv', header=None)
df_y_train = pd.read_csv(data_dir + 'y_train.csv', header=None)
df_X_test = pd.read_csv(data_dir + 'X_test.csv', header=None)
df_y_test = pd.read_csv(data_dir + 'y_test.csv', header=None)

# save in numpy arrays
X_train = df_X_train.to_numpy()
y_train = df_y_train.to_numpy()
X_test = df_X_test.to_numpy()
y_test = df_y_test.to_numpy()

# get training set size; needed below to cut the stacked matrix back into
# its training and test parts
n_train = X_train.shape[0]

# Normalize all features. The training and test rows are stacked first so
# that both splits go through one and the same normalization call.
# NOTE(review): normalize_neg1_pos1 presumably rescales to [-1, 1] -- confirm
# against MyUtils.
if data_set == 'ionosphere':
    X_all = MyUtils.normalize_neg1_pos1(np.concatenate((X_train, X_test), axis=0))
else:
    # Without this branch X_all stays undefined for every other data set and
    # the slicing below fails with an opaque NameError.
    raise ValueError(
        "no feature normalization is configured for data set {!r}; "
        "add a branch that picks the normalizer for it".format(data_set)
    )

# split the normalized rows back into the two original sets
X_train = X_all[:n_train]
X_test = X_all[n_train:]

# sanity check: shapes of features and labels for both splits
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(280, 34)
(280, 1)
(71, 34)
(71, 1)


In [4]:
# build the model: instantiate LogisticRegression from the logistic_regression
# module (imported above as `logic`) with no constructor arguments
log = logic.LogisticRegression()

In [5]:
# Unison shuffling smoke test: hand the helper a 5x1 column and a 5x2 matrix
# with the same number of rows, and keep the two arrays it returns.
a = np.arange(5).reshape(-1, 1)
b = np.arange(1, 11).reshape(-1, 2)  # rows [1, 2], [3, 4], ..., [9, 10]

c, d = logic.LogisticRegression._unison_shuffled_copies(
    a,
    b,
)

# To inspect the result, print the inputs and outputs side by side:
#   print(a); print(b); print(c); print(d)


In [6]:
# Train the model on the training split. This run passes SGD=False; the
# alternative run is the very same call with SGD=True.
log.fit(
    X_train,
    y_train,
    lam=0,
    eta=0.1,
    iterations=50000,
    SGD=False,
    mini_batch_size=20,
    degree=3,
)

In [7]:
# Report, for each split, the value of log.error(...) divided by the number
# of samples in that split (the printed value is a fraction, not a percent).
train_error_rate = log.error(X_train, y_train) / len(X_train)
print('misclassfied percentage from training: ', train_error_rate)

test_error_rate = log.error(X_test, y_test) / len(X_test)
print('misclassfied percentage from validation: ', test_error_rate)

misclassfied percentage from training:  0.0
misclassfied percentage from validation:  0.056338028169014086


In [8]:
# Run the fitted model on the test features. Judging by the per-sample
# printout later in this notebook, each entry of preds is a one-element
# array holding the predicted probability of the +1 class.
preds = log.predict(X_test)

In [9]:
# Walk through the test set and print, for every sample, the predicted
# probability next to its label, flagging the samples the model got wrong.
for i, label in enumerate(y_test):
    prob = preds[i]
    print('test sample ', i)
    # np.sign(prob - 0.5) is +1 above the 0.5 threshold and -1 below it, so
    # it can be compared directly with the label.
    if np.sign(prob - 0.5) != label:
        print('misclassified!!')
    print('predicted probablity of being +1 is: ', prob)
    print('label is', label)
    print('\n')

test sample  0
predicted probablity of being +1 is:  [0.99999964]
label is [1.]


test sample  1
predicted probablity of being +1 is:  [0.99996858]
label is [1.]


test sample  2
predicted probablity of being +1 is:  [0.1946249]
label is [-1.]


test sample  3
predicted probablity of being +1 is:  [0.9999788]
label is [1.]


test sample  4
predicted probablity of being +1 is:  [1.]
label is [1.]


test sample  5
predicted probablity of being +1 is:  [1.]
label is [1.]


test sample  6
predicted probablity of being +1 is:  [1.]
label is [1.]


test sample  7
predicted probablity of being +1 is:  [0.99910556]
label is [1.]


test sample  8
predicted probablity of being +1 is:  [0.19515761]
label is [-1.]


test sample  9
predicted probablity of being +1 is:  [2.08030752e-08]
label is [-1.]


test sample  10
predicted probablity of being +1 is:  [1.]
label is [1.]


test sample  11
predicted probablity of being +1 is:  [1.]
label is [1.]


test sample  12
predicted probablity of being +1 