In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, confusion_matrix, accuracy_score

from sklearn.naive_bayes import BernoulliNB 


## Bernoulli Naive Bayes
##### Setup

In [13]:
x = np.memmap('data_full/exp_test_binary_data.npy', mode='r', shape = (80000,150*150))
y = np.memmap('data/exp_test_target.npy', mode='r', shape = (80000))
print(x.shape)
print(y.shape)
x = pd.DataFrame(x)
y = pd.DataFrame(y)

(80000, 22500)
(80000,)


"target_data": {
"Inco": 2, 
"Teac": 1, 
"Cons": 0, 
"Publ": 4, 
"Econ": 3}}

In [14]:
X_train, x_test, y_train, y_test = train_test_split(x, y[0], test_size=0.2, stratify=y)
print(X_train.shape)
print(y_test.shape)

(64000, 22500)
(16000,)


In [15]:
chunk = 8000
for i in range(X_train.shape[0]//chunk):
    X_train[i*chunk:(i+1)*chunk] = X_train[i*chunk:(i+1)*chunk]/255
for i in range(x_test.shape[0]//chunk):
    x_test[i*chunk:(i+1)*chunk] = x_test[i*chunk:(i+1)*chunk]/255

In [16]:
X_train[-10:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22490,22491,22492,22493,22494,22495,22496,22497,22498,22499
33491,0,1,1,1,1,1,1,0,0,1,...,1,1,1,0,1,1,0,0,1,1
45245,1,0,0,0,0,0,0,0,0,1,...,0,0,1,1,1,1,1,1,1,1
13738,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
66415,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,0,1
52118,0,0,1,1,1,0,1,0,0,1,...,1,0,1,1,1,0,0,1,1,1
1760,1,0,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
10901,1,0,0,0,0,0,0,0,1,1,...,1,1,1,1,0,0,1,1,1,0
13725,1,1,1,1,1,1,1,0,0,0,...,1,1,1,1,1,1,1,1,1,1
38857,1,1,1,1,1,1,1,1,1,1,...,0,0,1,1,1,1,1,1,0,0
42526,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Setup up for Ensembling

In [19]:
models={}
mapping = {
    "Inco": 2, 
    "Teac": 1, 
    "Cons": 0, 
    "Publ": 4, 
    "Econ": 3,
    "TeaF":5,
    "Gene":6,
    "Reaf":7}

nb_full = BernoulliNB(alpha=10)
for key in mapping.keys():
    models[key] =  BernoulliNB(alpha=0.1)

chunk = 16000

for i in range(X_train.shape[0]//chunk):
    nb_full.partial_fit(X_train[i*chunk:(i+1)*chunk], y_train[i*chunk:(i+1)*chunk], classes=[0,1,2,3,4,5,6,7])
    for key in mapping.keys():
        target_class = mapping[key]
        models[key].partial_fit(X_train[i*chunk:(i+1)*chunk], y_train[i*chunk:(i+1)*chunk]==target_class, classes=[0,1])
    print(i,end='\r', flush=True)
print('\nDone Training!!')


3
Done Training!!


In [62]:

# results = np.zeros((y_test.shape[0],8))
# nb_full_results = np.zeros((y_test.shape[0],8))
#     # y_pred = np.zeros(shape=y_test.shape)
# for i in range(x_test.shape[0]//chunk):
#     y_pred = nb_full.predict_proba(x_test[i*chunk:(i+1)*chunk])[:]
#     nb_full_results[i*chunk:(i+1)*chunk] = y_pred
#     for key in mapping.keys():
#         y_pred = models[key].predict_proba(x_test[i*chunk:(i+1)*chunk])[:,0]
#         results[i*chunk:(i+1)*chunk,mapping[key]] = y_pred
#     print(i,end='\r', flush=True)
#     # results[:,mapping[key]] = y_pred
test = np.copy(y_test)
test2 = np.copy(y_test)
test3 = np.copy(y_test)

added_results = np.zeros(nb_full_results.shape)
full_results = np.zeros(shape=y_test.shape)

fixed_ordering = [2,1,0,4,3]
for  i in range(results[:,0].shape[0]):
    added_results[i,:] = (results[i,:].argsort()+1) + abs(nb_full_results[i,:].argsort()-8)
    max = np.where(results[i,:] == np.amin(results[i,:].reshape(8)))[0]
    max2 = np.where(added_results[i,:] == np.amin(added_results[i,:].reshape(8)))[0]
    max3 = np.where(nb_full_results[i,:]== np.amax(nb_full_results[i,:].reshape(8)))[0]
    # max4 = np.amin(nb_full_results[i,:].reshape(5))
    if len(max)>1:
        test[i] = 8
    else:
        test[i]=max[0]

    if len(max2)>1:
        test2[i] = 8
    else:
        test2[i]=max2[0]

    if len(max3)>1:
        test3[i] = 8
    else:
        test3[i]=max3[0]
y_pred = test
mask = y_pred != 8

print('Base OVR Ensemble')
print(f'\n\nData Size:\t{len(y_pred)}/{len(y_pred)}')
print(f'\tAccuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'\tPrecision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'\tRecall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'\tF1:\t{f1_score(y_test, y_pred, average="macro")}')
print(confusion_matrix(y_test, y_pred))

print("==="*20)

print('Filtered OVR Ensemble')
print(f'\n\nData Size:\t{len(y_pred[mask])}/{len(y_pred)}')
print(f'\tAccuracy:\t{accuracy_score(y_test[mask], y_pred[mask])}')
print(f'\tPrecision:\t{precision_score(y_test[mask], y_pred[mask], average="macro")}')
print(f'\tRecall:\t{recall_score(y_test[mask], y_pred[mask], average="macro")}')
print(f'\tF1:\t{f1_score(y_test[mask], y_pred[mask], average="macro")}')
print(confusion_matrix(y_test[mask], y_pred[mask]))

print("==="*20)

y_pred = test3
print('Base Multiclass')
print(f'\n\nData Size:\t{len(y_pred)}/{len(y_pred)}')
print(f'\tAccuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'\tPrecision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'\tRecall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'\tF1:\t{f1_score(y_test, y_pred, average="macro")}')
print(confusion_matrix(y_test, y_pred))

print("==="*20)

y_pred = test2
mask = y_pred != 8
print('Base Multiclass Ensemble')
print(f'\n\nData Size:\t{len(y_pred)}/{len(y_pred)}')
print(f'\tAccuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'\tPrecision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'\tRecall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'\tF1:\t{f1_score(y_test, y_pred, average="macro")}')False
print(confusion_matrix(y_test, y_pred))

print("==="*20)

print('Filtered Multiclass Ensemble')
print(f'\n\nData Size:\t{len(y_pred[mask])}/{len(y_pred)}')
print(f'\tAccuracy:\t{accuracy_score(y_test[mask], y_pred[mask])}')
print(f'\tPrecision:\t{precision_score(y_test[mask], y_pred[mask], average="macro")}')
print(f'\tRecall:\t{recall_score(y_test[mask], y_pred[mask], average="macro")}')
print(f'\tF1:\t{f1_score(y_test[mask], y_pred[mask], average="macro")}')
print(confusion_matrix(y_test[mask], y_pred[mask]))

Base OVR Ensemble


Data Size:	16000/16000
	Accuracy:	0.552625
	Precision:	0.5853080317404935
	Recall:	0.49122222222222217
	F1:	0.5227337729502552
[[1896    8   12    0   34    3    8   22   17]
 [   6  586  140   48   40  242  157  108  673]
 [   2    5 1354  137  196    3    1   52  250]
 [   0   15  349 1030   97   11   33  109  356]
 [   9    9  428   22 1159    4   13   71  285]
 [   3  441  124   54   53  405  147  107  666]
 [   0   86   67   97    8   36 1530   33  143]
 [   2  140  242   43  141   73   79  882  398]
 [   0    0    0    0    0    0    0    0    0]]
Filtered OVR Ensemble


Data Size:	13212/16000
	Accuracy:	0.6692400847714199
	Precision:	0.6584715357080552
	Recall:	0.6439788643915684
	F1:	0.6429860858322942
[[1896    8   12    0   34    3    8   22]
 [   6  586  140   48   40  242  157  108]
 [   2    5 1354  137  196    3    1   52]
 [   0   15  349 1030   97   11   33  109]
 [   9    9  428   22 1159    4   13   71]
 [   3  441  124   54   53  405  147  107]
 [

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
ranks1
abs(results[i,:].argsort()-7)

array([0, 7, 6, 5, 4, 3, 2, 1])

In [57]:
added_results[i]

array([14.,  1.,  6.,  5.,  7., 10.,  7.,  6.])