In [1]:
import math
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

import sys

# Add the sibling ../src/ directory to the import path; presumably this is
# where the `utils` and `NaCL` modules imported below live.
sys.path.append("../src/")
# Raise Python's recursion limit to 2,000,000.
# NOTE(review): which downstream code needs this depth is not visible here —
# confirm it is still required.
sys.setrecursionlimit(2000000)

import utils
from NaCL import NaCLK

In [2]:
# Column groups of the binarized covtype matrix. Each inner list names the
# columns that (presumably) one-hot encode a single original feature, so the
# experiments below can handle a feature as a unit.
#
# The matrix has 104 columns (see X_train.shape): ten blocks of 6 columns,
# one block of 4 columns (60-63) and one block of 40 columns (64-103).
# NOTE(review): these presumably correspond to the 10 binned quantitative
# attributes, the 4 wilderness-area indicators and the 40 soil-type indicators
# of the Covertype data — confirm against the preprocessing script.
#
# The 6-column blocks must start every 6 columns (0, 6, ..., 54). Starting
# them every 10 columns (0, 10, ..., 90) left columns such as 6-9 unassigned
# and made the later blocks overlap the 60-63 and 64-103 groups.
cov_encoding = [list(range(start, start + 6)) for start in range(0, 60, 6)]
cov_encoding.append(list(range(60, 64)))
cov_encoding.append(list(range(64, 104)))

# Guard against the groups drifting out of sync with the column layout.
assert sorted(col for group in cov_encoding for col in group) == list(range(104)), \
    "cov_encoding must cover each of the 104 columns exactly once"

# Loading Data

In [3]:
# Load the covtype data through the project helper; the call returns the
# training features/labels followed by the test features/labels.
X_train, y_train, X_test, y_test = utils.load_dataset("../data/covtype", "covtype")

In [4]:
# Display the dimensions of the training feature matrix.
X_train.shape

(464808, 104)

In [5]:
# Display the distinct label values that occur in the training labels.
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6])

In [6]:
%%time
# BernoulliNB is already imported in the notebook's first cell, so it is not
# imported a second time here.
#
# Fit a Bernoulli naive Bayes model on the training split. `NB` is passed to
# the experiment cells further down, so the name must stay as is.
NB = BernoulliNB().fit(X_train, y_train)

# Fraction of correct predictions: training split first, then test split.
print(np.average(NB.predict(X_train) == y_train))
print(np.average(NB.predict(X_test) == y_test))

0.6335497667854254
0.6345846026746528
CPU times: user 3.68 s, sys: 1.35 s, total: 5.03 s
Wall time: 2.62 s


### Training Logistic Regression

In [None]:
%%time
# Configure a multinomial logistic regression solved with L-BFGS, using up to
# 400 iterations and 6 parallel jobs, with solver progress output enabled.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the scikit-learn version this notebook targets.
logreg = LogisticRegression(
    solver="lbfgs",
    C=1,
    multi_class="multinomial",
    max_iter=400,
    n_jobs=6,
    verbose=True,
)

# `fit` hands back the fitted estimator, which later cells use as `clf`.
clf = logreg.fit(X_train, y_train)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


In [None]:
# Accuracy of the logistic regression, printed as: test split, training split.
test_accuracy = np.average(clf.predict(X_test) == y_test)
train_accuracy = np.average(clf.predict(X_train) == y_train)
print(test_accuracy, train_accuracy)

### Learning NaCL model

In [None]:
%%time
have_mosek = True  # set False if you don't have mosek installed

# Choose the solver name that matches the local installation.
solver_name = 'mosek_conif' if have_mosek else 'cvxopt'

# Set up the NaCL model from the trained logistic regression and the training
# split, then solve it with the chosen solver.
# (An earlier variant of this call also passed c = 1e-10 to setup.)
nacl = NaCLK().setup(clf, X_train, y_train)
nacl.solve(solver=solver_name, verbose=1)

In [None]:
# Sanity check: on fully observed data the NaCL model has to reproduce the
# logistic regression's predictions exactly, on the test and the training split.
for split in (X_test, X_train):
    agreement = np.average(nacl.predict(split) == clf.predict(split))
    assert agreement == 1.0

## Generating Table for Paper

In [None]:
# Number of feature groups in the covtype encoding; the table experiment
# below derives its grid of k values from it.
N = len(cov_encoding)

In [None]:
%%time
from sklearn.metrics import f1_score


def accuracy(x, y):
    """Return the fraction of positions at which x and y are equal."""
    # Not referenced by the settings below; kept available as an alternative metric.
    return np.mean(x == y)


def f1(x, y):
    """Return the weighted F1 score computed from x and the flattened y."""
    return f1_score(x, y.flatten(), average="weighted")


# Settings handed to the experiment runner for the paper's table.
setting = {
    "repeat": 3,  # the original note lists 10 as the alternative
    # k runs from 0 up to (excluding) N in steps of int(0.2 * N) feature groups.
    "k": list(range(0, N, int(0.2 * N))),
    "feature_encoding": cov_encoding,
    "prob": False,  # the original note lists True as the alternative
    "function": f1,  # the original note lists utils.conditional_likelihood_k as the alternative
}
data_for_table = utils.run_experiment_k_paper(X_test, y_test, clf, NB, nacl, setting)

In [None]:
methods = ["min", "max", "mean", "median", "ours"]

# Build one row per method with the rounded mean score (in percent) for the
# k indices 1..4, which are reported as 20, 40, 60 and 80.
row = []
for method in methods:
    rounded_means = []
    for k in range(1, 5):
        scores = 100 * data_for_table[method][k]
        m, s = np.mean(scores), np.std(scores)
        rounded_means.append(np.round(m, 1))
        print(method, k * 20, m, s)
    row.append(rounded_means)

row = np.array(row)
# Index of the (first) best-scoring method in every column.
bold = np.argmax(row, axis=0)

print("")
for i, values in enumerate(row):
    cells = []
    for j, value in enumerate(values):
        # Bold the column winner and every entry that ties its rounded score.
        is_best = bold[j] == i or value == row[bold[j]][j]
        if is_best:
            cells.append("\\textbf{" + str(value) + "}")
        else:
            cells.append(str(value))
    # Every cell, including the last one, is followed by the column separator.
    print("".join(cell + " & " for cell in cells))
    

## Generating Missing Data Charts

In [None]:
# Display the dimensions of the test feature matrix.
X_test.shape

In [None]:
%%time
# Settings for the chart experiment scored with utils.conditional_likelihood_k.
# k covers every value from 0 to 12; 12 is the number of groups in cov_encoding.
setting = {
    "repeat": 1,
    "k": list(range(13)),
    "prob": True,
    "function": utils.conditional_likelihood_k,
    "feature_encoding": cov_encoding,
}
data_a2 = utils.run_experiment_k_paper(X_test, y_test, clf, NB, nacl, setting)

In [None]:
# Plot options for the results in data_a2 (scored with
# utils.conditional_likelihood_k). Presumably "show" selects the methods to
# draw and "saveAs" is the output file — confirm in utils.plot_results_paper.
setting = {
    "show": ["min", "mean", "median", "ours"],
    "saveAs": "../results/covType_cross.pdf",
    "title": "CovType",
    "Ylabel": "Cross Entropy"
}
utils.plot_results_paper(data_a2, setting)

In [None]:
%%time
from sklearn.metrics import f1_score


def accuracy(x, y):
    """Return the fraction of positions at which x and y are equal."""
    # Not referenced by the settings below; kept available as an alternative metric.
    return np.mean(x == y)


def f1(x, y):
    """Return the weighted F1 score computed from x and the flattened y."""
    return f1_score(x, y.flatten(), average="weighted")


# Settings for the chart experiment scored with weighted F1.
# k covers every value from 0 to 12; 12 is the number of groups in cov_encoding.
setting = {
    "repeat": 1,
    "k": list(range(13)),
    "prob": False,
    "function": f1,
    "feature_encoding": cov_encoding,
}
data_a3 = utils.run_experiment_k_paper(X_test, y_test, clf, NB, nacl, setting)

In [None]:
# Plot options for the results in data_a3, which were scored with weighted F1.
# Note that the output file name says "accuracy" although the y-axis label
# (and the metric used to produce data_a3) is F1.
# Presumably "show" selects the methods to draw and "saveAs" is the output
# file — confirm in utils.plot_results_paper.
setting = {
    "show": ["min", "ours", "mean", "median"],
    "saveAs": "../results/covtype_accuracy.pdf",
    "title": "CovType",
    "Ylabel": "F1"
}
utils.plot_results_paper(data_a3, setting)

In [None]:
# Store both chart experiment results through the project's save helper.
# The second file name says "accuracy", but data_a3 holds the weighted-F1 results.
results_to_store = (
    ("../results/data_covtype_result_cross.pickle", data_a2),
    ("../results/data_covtype_result_accuracy.pickle", data_a3),
)
for target_path, payload in results_to_store:
    utils.save(target_path, payload)