In [32]:
import numpy as np
from river import utils
from river.tree import HoeffdingAdaptiveTreeClassifier
from river import model_selection
from river import metrics
from river import datasets
from river import evaluate
from river.neighbors import KNNClassifier
from river.ensemble import AdaptiveRandomForestClassifier
from river.linear_model import PAClassifier, LogisticRegression
from river import optim
from sklearn import metrics as sk_metrics

In [33]:
n_days_lookahead = int(input('Please input the length of days lookahead in {5, 7, 15, 30, 45, 60, 90, 120}: '))

if(n_days_lookahead not in [5, 7, 15, 30, 45, 60, 90, 120]):
    print('Input does not meet requirements.')
    exit()


data_type = str(input('Please specify the coverage of the data {A - Manufacturer 1, B - Manufacturer 2, C - Manufacturer 1 & 2}: '))

if(data_type not in ['A', 'B', 'C']):
    print('Input does not meet requirements.')
    exit()

dit_str = {'A':'mc1', 'B':'mc2', 'C':'mc1_mc2'}

def loadData():

    X_train = np.load('../data/' + dit_str[data_type] + '/' + str(n_days_lookahead) + '_days_lookahead/smart_train.npy',allow_pickle=True)
    y_train = np.load('../data/' + dit_str[data_type] + '/' + str(n_days_lookahead) + '_days_lookahead/train_labels.npy',allow_pickle=True)
    X_test = np.load('../data/' + dit_str[data_type] + '/' + str(n_days_lookahead) + '_days_lookahead/smart_test.npy',allow_pickle=True)
    y_test = np.load('../data/' + dit_str[data_type] + '/' + str(n_days_lookahead) + '_days_lookahead/test_labels.npy',allow_pickle=True)
    X_train = X_train.astype('float32')
    y_train = y_train.astype('float32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('float32')

    state = np.random.get_state()
    np.random.set_state(state)
    np.random.shuffle(X_train)
    np.random.set_state(state)
    np.random.shuffle(y_train)

    return X_train, y_train, X_test, y_test
def print_all_metrics(true, predicted):
    print(sk_metrics.classification_report(true, predicted, digits=5), sk_metrics.roc_auc_score(true, predicted))
    
X_train, y_train, X_test, y_test = loadData()
print('------------------ Loading Data ------------------')
X_train = X_train.reshape((len(X_train), -1))
X_test = X_test.reshape((len(X_test), -1))
print(X_train.shape)

Please input the length of days lookahead in {5, 7, 15, 30, 45, 60, 90, 120}:  5
Please specify the coverage of the data {A - Manufacturer 1, B - Manufacturer 2, C - Manufacturer 1 & 2}:  A


------------------ Loading Data ------------------
(48000, 330)


In [6]:
print('------------------- Decision Tree -------------------')
dt = HoeffdingAdaptiveTreeClassifier()
dts = utils.expand_param_grid(dt, {
    'HoeffdingAdaptiveTreeClassifier': {
        'grace_period ': [100,150,200],
        'max_depth': [None, 10, 100],
        'split_criterion': ['gini', 'info_gain', 'hellinger'],
        'leaf_prediction': ['mc', 'nb', 'nba']

    }
})
print(len(dts))
sh_dt = model_selection.SuccessiveHalvingClassifier(
    dts,
    metric=metrics.Accuracy(),
    budget=2000,
    eta=2,
    verbose=True
)

headers = [str(i) for i in range(330)]
data_x = [dict(zip(headers, x)) for x in X_train]
data_y = [True if y == 1 else False for y in y_train]

i=0
for (Xi, yi) in zip(data_x, data_y):
    if i % 1000 == 0:
        print(i)
    i+=1
    sh_dt.learn_one(Xi, yi)

sh_dt.best_model

------------------- Decision Tree -------------------
81
0
[1]	40 removed	41 left	3 iterations	budget used: 243	budget left: 1757	best Accuracy: 33.33%
[2]	20 removed	21 left	6 iterations	budget used: 489	budget left: 1511	best Accuracy: 33.33%
[3]	10 removed	11 left	13 iterations	budget used: 762	budget left: 1238	best Accuracy: 38.46%
[4]	5 removed	6 left	25 iterations	budget used: 1037	budget left: 963	best Accuracy: 44.00%
[5]	3 removed	3 left	47 iterations	budget used: 1319	budget left: 681	best Accuracy: 53.19%
[6]	1 removed	2 left	95 iterations	budget used: 1604	budget left: 396	best Accuracy: 49.47%
[7]	1 removed	1 left	142 iterations	budget used: 1888	budget left: 112	best Accuracy: 50.00%
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000


HoeffdingAdaptiveTreeClassifier (
  grace_period=200
  max_depth=inf
  split_criterion="info_gain"
  delta=1e-07
  tau=0.05
  leaf_prediction="nba"
  nb_threshold=0
  nominal_attributes=None
  splitter=GaussianSplitter (
    n_splits=10
  )
  bootstrap_sampling=True
  drift_window_threshold=300
  drift_detector=ADWIN (
    delta=0.002
  )
  switch_significance=0.05
  binary_split=False
  max_size=100.
  memory_estimate_period=1000000
  stop_mem_management=False
  remove_poor_attrs=False
  merit_preprune=True
  seed=None
)

In [11]:
print('------------------- Random Forest -------------------')
rf = AdaptiveRandomForestClassifier()
rfs = utils.expand_param_grid(rf, {
    'AdaptiveRandomForestClassifier': {
        'n_models  ': [10,50,100, 150],
        'max_depth  ': [None, 50, 100],
        'split_criterion': ['gini', 'info_gain', 'hellinger'],
        'leaf_prediction': ['mc', 'nb', 'nba']

    }
})
print(len(rfs))
sh_rf = model_selection.SuccessiveHalvingClassifier(
    rfs,
    metric=metrics.Accuracy(),
    budget=2000,
    eta=2,
    verbose=True
)

headers = [str(i) for i in range(330)]
data_x = [dict(zip(headers, x)) for x in X_train]
data_y = [True if y == 1 else False for y in y_train]

i=0
for (Xi, yi) in zip(data_x, data_y):
    if i % 1000 == 0:
        print(i)
    i+=1
    sh_rf.learn_one(Xi, yi)

sh_rf.best_model

------------------- Random Forest -------------------
108
0
[1]	54 removed	54 left	2 iterations	budget used: 216	budget left: 1784	best Accuracy: 0.00%
[2]	27 removed	27 left	5 iterations	budget used: 486	budget left: 1514	best Accuracy: 40.00%
[3]	13 removed	14 left	10 iterations	budget used: 756	budget left: 1244	best Accuracy: 50.00%
[4]	7 removed	7 left	20 iterations	budget used: 1036	budget left: 964	best Accuracy: 45.00%
[5]	3 removed	4 left	40 iterations	budget used: 1316	budget left: 684	best Accuracy: 50.00%
[6]	2 removed	2 left	71 iterations	budget used: 1600	budget left: 400	best Accuracy: 52.11%
[7]	1 removed	1 left	142 iterations	budget used: 1884	budget left: 116	best Accuracy: 56.34%
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000


AdaptiveRandomForestClassifier([ForestMemberClassifier (
                                  index_original=0
                                  model=BaseTreeClassifier (
                                    max_features=18
                                    grace_period=50
                                    max_depth=inf
                                    split_criterion="info_gain"
                                    delta=0.01
                                    tau=0.05
                                    leaf_prediction="nba"
                                    nb_threshold=0
                                    nominal_attributes=None
                                    splitter=GaussianSplitter (
                                      n_splits=10
                                    )
                                    binary_split=False
                                    max_size=100.
                                    memory_estimate_period=2000000
                            

In [20]:
print('------------------- KNN -------------------')
knn = KNNClassifier()
knns = utils.expand_param_grid(knn, {
    'KNNClassifier': {
        'n_neighbors': [1, 5, 10, 20],
        'window_size ': [500, 1000, 1500]

    }
})
print(len(knns))
sh_knn = model_selection.SuccessiveHalvingClassifier(
    knns,
    metric=metrics.Accuracy(),
    budget=2000,
    eta=2,
    verbose=True
)

headers = [str(i) for i in range(330)]
data_x = [dict(zip(headers, x)) for x in X_train]
data_y = [True if y == 1 else False for y in y_train]

i=0
for (Xi, yi) in zip(data_x, data_y):
    if i % 1000 == 0:
        print(i)
    i+=1
    sh_knn.learn_one(Xi, yi)

sh_knn.best_model

------------------- KNN -------------------
12
0
[1]	6 removed	6 left	41 iterations	budget used: 492	budget left: 1508	best Accuracy: 60.98%
[2]	3 removed	3 left	83 iterations	budget used: 990	budget left: 1010	best Accuracy: 56.63%
[3]	1 removed	2 left	166 iterations	budget used: 1488	budget left: 512	best Accuracy: 63.25%
[4]	1 removed	1 left	250 iterations	budget used: 1988	budget left: 12	best Accuracy: 62.80%
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000


KNNClassifier (
  n_neighbors=5
  window_size=1000
  min_distance_keep=0.
  weighted=True
  cleanup_every=0
  distance_func=functools.partial(<function minkowski_distance at 0x7f697a230310>, p=2)
  softmax=False
)

In [15]:
print('------------------- Passive Aggressive Classifier -------------------')
pac = PAClassifier()
pacs = utils.expand_param_grid(pac, {
    'PAClassifier': {
        'C': [0.1, 0.5, 1, 5, 10]
    }
})
print(len(pacs))
sh_pac = model_selection.SuccessiveHalvingClassifier(
    pacs,
    metric=metrics.Accuracy(),
    budget=2000,
    eta=2,
    verbose=True
)

headers = [str(i) for i in range(330)]
data_x = [dict(zip(headers, x)) for x in X_train]
data_y = [True if y == 1 else False for y in y_train]

i=0
for (Xi, yi) in zip(data_x, data_y):
    if i % 1000 == 0:
        print(i)
    i+=1
    sh_pac.learn_one(Xi, yi)

sh_pac.best_model

------------------- Passive Aggressive Classifier -------------------
5
0
[1]	2 removed	3 left	133 iterations	budget used: 665	budget left: 1335	best Accuracy: 47.37%
[2]	1 removed	2 left	222 iterations	budget used: 1331	budget left: 669	best Accuracy: 46.85%
[3]	1 removed	1 left	333 iterations	budget used: 1997	budget left: 3	best Accuracy: 48.95%
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000


PAClassifier (
  C=1.
  mode=1
  learn_intercept=True
)

In [26]:
print('------------------- LinearRegression -------------------')
lr = LogisticRegression()
lrs = utils.expand_param_grid(lr, {
    'LogisticRegression': {
        'optimizer': [
            (optim.SGD, {'lr': [.1, .01, .005]}),
            (optim.Adam, {'beta_1': [.01, .001], 'lr': [.1, .01, .001]}),
            (optim.Adam, {'beta_1': [.1], 'lr': [.001]}),
        ]
    }
})
print(len(lrs))
sh_lr = model_selection.SuccessiveHalvingClassifier(
    lrs,
    metric=metrics.Accuracy(),
    budget=2000,
    eta=2,
    verbose=True
)
print(X_train, y_train)
headers = [str(i) for i in range(330)]
data_x = [dict(zip(headers, x)) for x in X_train]
data_y =  y_train

i=0
for (Xi, yi) in zip(data_x, data_y):
    if i % 1000 == 0:
        print(i)
    i+=1
    sh_lr.learn_one(Xi, yi)

sh_lr.best_model

------------------- LinearRegression -------------------
10
[[2.9758942e-01 1.4687101e-02 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [7.7395278e-01 1.3622819e-01 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [6.0912836e-01 8.7696895e-02 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 ...
 [2.0190072e-01 2.1285654e-04 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [6.8866503e-01 3.6398467e-02 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [5.8710283e-01 2.6607066e-02 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]] [1. 0. 0. ... 0. 1. 0.]
0
[1]	5 removed	5 left	50 iterations	budget used: 500	budget left: 1500	best Accuracy: 52.00%
[2]	2 removed	3 left	100 iterations	budget used: 1000	budget left: 1000	best Accuracy: 50.00%
[3]	1 removed	2 left	166 iterations	budget used: 1498	budget left: 502	best Accuracy: 53.61%
[4]	1 removed	1 left	250 iterations	budget used: 1998	budget left: 2	be

LogisticRegression (
  optimizer=SGD (
    lr=Constant (
      learning_rate=0.01
    )
  )
  loss=Log (
    weight_pos=1.
    weight_neg=1.
  )
  l2=0.
  l1=0.
  intercept_init=0.
  intercept_lr=Constant (
    learning_rate=0.01
  )
  clip_gradient=1e+12
  initializer=Zeros ()
)

In [34]:
headers = [str(i) for i in range(330)]
print(headers)
data_x = [dict(zip(headers, x)) for x in X_train]
data_y = [True if y == 1 else False for y in y_train]

model = LogisticRegression()
i=0
for (Xi, yi) in zip(data_x, data_y):
    if i % 1000 == 0:
        print(i)
    i+=1
    model.learn_one(Xi, yi)
    
headers = [str(i) for i in range(330)]
data_x_test = [dict(zip(headers, x)) for x in X_test]
data_y_test = [True if y == 1 else False for y in y_test]
y_true = []
y_pred = []
for Xi, yi in zip(data_x_test, data_y_test):
    yi_pred = model.predict_one(Xi)
    y_true.append(yi)
    y_pred.append(yi_pred)
print_all_metrics(np.asarray(y_test), np.asarray(y_pred))


['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '15

AttributeError: module 'river.metrics' has no attribute 'roc_auc_score'