In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# turns off warnings
import warnings
warnings.filterwarnings('ignore')


def check_acc(a, b, test, correct):
    for i in range(len(test)):
        row = test.iloc[i]
        compare = list(correct)[i]
        correct_ctr = 0
        
        result = np.dot(row, a) + b
        if((result < 0 and compare < 0) or (result > 0 and compare > 0)):
            correct_ctr += 1
        
    return correct_ctr / len(test)


def calc_gradient(a, b, x, y_k, reg):
    check = np.dot(a, x) + b
    if(y_k * check >= 1):
        return (reg*a, 0)
    else:
        return ((reg*a - (y_k*x)), -1 * y_k)
    

In [2]:
df = pd.read_csv('20K_K9.csv', dtype=np.float64, header=None)
df = df.drop(df.columns[0], axis=1)
df.columns = np.arange(0, 5409)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5399,5400,5401,5402,5403,5404,5405,5406,5407,5408
0,-0.161,-0.014,0.002,-0.036,-0.033,-0.093,0.025,0.005,0.000,-0.015,...,0.006,0.013,0.021,0.020,0.016,-0.011,0.003,0.010,-0.007,-1.0
1,-0.158,-0.002,-0.012,-0.025,-0.012,-0.106,0.013,0.005,0.000,-0.002,...,0.002,-0.008,0.007,0.015,-0.008,-0.011,-0.004,0.013,0.005,-1.0
2,-0.169,-0.025,-0.010,-0.041,-0.045,-0.069,0.038,0.014,0.008,-0.014,...,0.019,0.010,0.025,0.025,0.021,-0.012,0.006,0.016,-0.018,-1.0
3,-0.183,-0.051,-0.023,-0.077,-0.092,-0.015,0.071,0.027,0.020,-0.019,...,0.051,0.012,0.050,0.038,0.051,-0.015,0.017,0.027,-0.049,-1.0
4,-0.154,0.005,-0.011,-0.013,-0.002,-0.115,0.005,0.002,-0.003,0.002,...,-0.011,0.012,0.009,0.003,-0.001,0.002,-0.006,0.009,0.013,-1.0
5,-0.150,0.016,-0.014,0.000,0.016,-0.123,-0.004,-0.002,-0.005,0.010,...,0.000,-0.033,-0.032,0.029,-0.052,-0.029,-0.006,0.004,0.023,-1.0
6,-0.158,0.002,-0.019,-0.028,-0.008,-0.101,0.011,0.005,0.001,0.003,...,0.015,-0.025,-0.003,0.017,-0.017,-0.009,-0.001,0.017,0.012,-1.0
7,-0.152,0.009,-0.015,-0.008,0.004,-0.120,-0.002,-0.002,-0.007,0.004,...,-0.014,-0.009,-0.007,0.009,-0.027,-0.012,-0.009,0.008,0.021,-1.0
8,-0.172,-0.028,0.003,-0.045,-0.055,-0.078,0.039,0.011,0.006,-0.020,...,0.016,0.031,-0.059,0.050,0.010,-0.005,0.137,0.098,-0.015,-1.0
9,-0.164,-0.019,-0.011,-0.037,-0.031,-0.081,0.029,0.012,0.007,-0.010,...,0.012,0.006,0.019,0.022,0.011,-0.012,0.002,0.014,-0.009,-1.0


In [3]:
# drop last colmun and replace active/inactive with numbers
X = df.drop(df.columns[-1], axis=1, inplace=False)
y = df.iloc[:,-1]

In [4]:
# split into training and eval
X_tru, X_gar, y_tru, y_gar = train_test_split(X, y, train_size=.1)
X_t, X_eval, y_t, y_eval = train_test_split(X_tru, y_tru, test_size=0.1)
X_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5398,5399,5400,5401,5402,5403,5404,5405,5406,5407
12440,-0.162,-0.017,0.006,-0.036,-0.044,-0.230,0.034,0.010,0.005,-0.015,...,-0.012,0.020,0.013,0.030,0.028,0.027,-0.012,0.006,0.015,-0.018
2362,-0.160,-0.005,-0.008,-0.024,-0.018,-0.108,0.015,0.004,-0.001,-0.006,...,-0.008,-0.004,0.005,0.007,0.016,-0.003,-0.010,-0.002,0.009,0.006
16748,-0.002,-0.021,0.012,-0.011,-0.032,0.037,0.011,0.002,0.003,-0.012,...,-0.009,0.008,0.018,0.015,0.005,0.028,-0.002,0.013,0.001,-0.017
15140,0.856,2.722,1.854,1.147,0.657,19.053,19.967,1.553,2.116,2.630,...,-0.015,0.013,0.020,0.028,0.024,0.027,-0.012,0.009,0.012,-0.016
17264,-0.002,-0.007,0.008,-0.002,-0.017,0.009,0.005,0.000,0.001,-0.005,...,-0.005,0.001,0.011,0.007,0.002,0.013,-0.002,0.008,0.001,-0.006
4522,-0.146,0.027,-0.014,0.013,0.039,-0.131,-0.013,-0.005,-0.009,0.014,...,0.001,-0.023,-0.019,-0.018,0.001,-0.045,-0.011,-0.013,0.008,0.033
3902,-0.167,-0.019,-0.003,-0.037,-0.037,-0.090,0.031,0.009,0.004,-0.016,...,-0.013,0.009,0.016,0.020,0.023,0.020,-0.011,0.005,0.011,-0.011
14754,-0.171,-0.028,0.000,-0.045,-0.053,-0.077,0.038,0.012,0.007,-0.020,...,-0.016,0.018,0.018,0.029,0.027,0.030,-0.012,0.009,0.012,-0.020
855,-0.154,0.006,-0.012,-0.011,-0.001,-0.117,0.004,0.002,-0.003,0.002,...,-0.001,-0.012,-0.006,-0.003,0.010,-0.019,-0.014,-0.004,0.010,0.016
8255,-0.147,0.025,-0.014,0.010,0.036,-0.132,-0.011,-0.004,-0.008,0.014,...,0.001,-0.024,-0.016,-0.018,0.002,-0.043,-0.011,-0.013,0.007,0.032


In [7]:
# split remaining into training and test data
reg_val = np.array([.0001, .001, .01, .1])

#Get a random val
random_value= int(np.random.random()*X_t[1].count())

#Generate initial values for a & b.
a_vec = X_t.iloc[random_value]
b = list(y_t)[random_value]


pk = np.full((5408,), 1)
m = 10
n = 1
tolerance = 1e-6

# (a, b, accuracy, lambda)
best = tuple()
best_list = list()

X_training, X_test, y_training, y_test = train_test_split(X_t, y_t, test_size=0.4)
#     print(type(X_test.ix[1]))
#     break

# normalize the data set X_training
cols = list(X_training.columns)
for col in cols:
    X_training[col] = (X_training[col] - X_training[col].mean()) / X_training[col].std(ddof=1)

# number of (rows * 2) / 100
#steps = (X_training[0].count() * 2) / 100
steps = 200

In [8]:
for lam in reg_val:
    #100 seasons for each lambda.
    for i in range(100):
        
        for j in range(int(steps)):
            # creates some random row from our training set. (sampling with replacement.)
            rand_val = int(np.random.random()*X_training[1].count())
            rand_row = X_training.iloc[rand_val]
        
            #target_val = y_training[rand_val]
            #Get the one value
            target_val = list(y_training)[rand_val]

            
            # grad is a tuple of the adjusted value for a and b
            grad = calc_gradient(a_vec, b, rand_row, target_val, lam)
            step_size = (m / (j + n))
            
            #Update the gradient values.
            a_vec = a_vec - (step_size * grad[0])
            b = b - (step_size * grad[1])
            accuracy = check_acc(a_vec, b, X_test, y_test)

            #Check the curr value.
            curr = tuple([a_vec, b, accuracy, lam])
            best_list.append(curr)
            if(len(best) == 0 or best[2] < accuracy):
                best = curr
                print(curr[2])
                
    

0.0
0.001402524544179523


In [None]:
#Declare an Naive Bayes implementation for comparison

gnb = GaussianNB()
#Use the ShuffleSplit cross validation module to 
cv = ShuffleSplit(n_splits=n_folds, test_size=0.1)
scores = cross_val_score(gnb, X, y, cv=cv)
    
#Output the means and std deviations.
print("The mean of the accuracy is: {0}".format(np.mean(scores)))
print("The Standard Deviation of the accuracy is: {0}".format(np.std(scores)))

In [10]:
for i in best_list:
    print(i[2])

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
