In [1]:
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

#model
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
#al
from modAL.models import ActiveLearner,Committee
from modAL.uncertainty import uncertainty_sampling,margin_sampling,entropy_sampling
from functools import partial
from modAL.disagreement import vote_entropy_sampling

#metric
from sklearn.datasets import make_blobs
from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import matthews_corrcoef,cohen_kappa_score,balanced_accuracy_score

import warnings

warnings.filterwarnings('ignore')

# Splite Dataset

In [2]:
# train_set=pd.read_csv('./firsttrain/6903102.csv')
# test_set=pd.read_csv('./lasttest/6903102.csv')


train_set=pd.read_csv('./randomtrain/3901890.csv')
test_set=pd.read_csv('./randomtest/3901890.csv')

len(train_set),len(test_set)

(124622, 53497)

In [3]:
#this function use random grid search to find best parameters
def getPar(model,dist,data,niter):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import RandomizedSearchCV
    x,y=data.shape

    clf = model

    param_dist = dist
    grid = RandomizedSearchCV(clf,param_dist,cv = 3,scoring = "balanced_accuracy",n_iter=niter,n_jobs = -1)

    #train
    grid.fit(data.iloc[:,0:y-1],data.iloc[:,y-1])
    #get best parameter
    print(grid.best_score_)
    return grid.best_params_

In [4]:
# metric
def computeMetric(y_tru,y_pre):
    acc = accuracy_score(y_tru,y_pre)
    pre=precision_score(y_tru,y_pre)
    recall=recall_score(y_tru,y_pre)
    cm=confusion_matrix(y_tru,y_pre)
    f1 = f1_score(y_tru,y_pre)
    mcc=matthews_corrcoef(y_tru, y_pre)
    kappa=cohen_kappa_score(y_tru, y_pre)
    bac=balanced_accuracy_score(y_tru,y_pre)
    print("acc:",acc)
    print("balanced acc:",bac)
    print("precision:",pre)
    print("recall:",recall)
    print("cm:",cm)
    print("f1:",f1)
    print("MCC:", mcc)
    print("Kappa:",kappa)
    
    # confusion matrix
#     cmap1 = sns.diverging_palette(260,-10,s=50, l=75, n=5, as_cmap=True)
#     plt.subplots(figsize=(12,8))
#     cf_matrix = confusion_matrix(y_tru, y_pre)
#     sns.heatmap(cf_matrix/np.sum(cf_matrix), cmap = cmap1, annot = True, annot_kws = {'size':15})
    
    return kappa

In [5]:
def draw(performance_history):
    fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
    ax.plot(performance_history)
    ax.scatter(range(len(performance_history)), performance_history, s=13)
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=5, integer=True))
    ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=10))
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1))
    ax.set_ylim(bottom=0, top=1)
    ax.grid(True)
    ax.set_title('Incremental classification accuracy')
    ax.set_xlabel('Query iteration')
    ax.set_ylabel('Classification Accuracy')
    plt.show()

# splite train(pool), test, Labeled, Unlabeled

In [6]:
x,y=train_set.shape
#pool 
X_Pool = train_set.iloc[:,0:y-1].values
y_Pool = train_set.iloc[:,y-1].values

In [7]:
#test set
X_test=test_set.iloc[:,0:y-1]
y_tru=test_set.iloc[:,y-1]
print(len(X_test),len(y_tru))

53497 53497


In [8]:
X_initial=pd.read_csv('./randomtrain/3901890_X_initial_1.csv').values
y_initial=pd.read_csv('./randomtrain/3901890_y_initial_1.csv').values.ravel()
X_re=pd.read_csv('./randomtrain/3901890_X_re_1.csv').values
y_re=pd.read_csv('./randomtrain/3901890_y_re_1.csv').values.ravel()
print(X_re.shape,X_initial.shape)
print(y_re.shape,y_initial.shape)

(124522, 6) (100, 6)
(124522,) (100,)


# molAL

In [9]:
def random_sampling(classifier, X_pool):
    n_samples = len(X_pool)
    query_idx = np.random.choice(range(n_samples))
    return query_idx, X_pool[query_idx]

In [10]:
def al(clf,X_L,y_L):
    learner = ActiveLearner(estimator=clf,
                            X_training=X_L, y_training=y_L)
    return learner

In [11]:
# clf1 KNN
clf1 = KNeighborsClassifier()
dic1={'n_neighbors':[1,2]}
para1=getPar(clf1,dic1,test_set,10)
clf1 = KNeighborsClassifier(n_neighbors=para1['n_neighbors'],n_jobs=-1)
clf1.fit(X_Pool, y_Pool)
y_pre=clf1.predict(X_test)
computeMetric(y_tru,y_pre)

0.9920453450872895
acc: 0.9959063125035049
balanced acc: 0.9948891909629031
precision: 0.9936265137029955
recall: 0.9924247246801197
cm: [[37688   100]
 [  119 15590]]
f1: 0.9930252555813879
MCC: 0.9901286032207376
Kappa: 0.990128240084297


0.990128240084297

In [12]:
# clf2 lightgbm
clf2 = lgb.LGBMClassifier()

dic2 = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 900, 1000, 1500, 2000],
              'num_leaves': range(6, 50), 
              'min_child_samples': range(10, 200, 10), 
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
para2=getPar(clf2,dic2,test_set,10)

    
clf2 = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=para2['num_leaves'], max_depth=-1, 
                              learning_rate=para2['learning_rate'], n_estimators=para2['n_estimators'], 
                              subsample_for_bin=200000, 
                              objective=None, class_weight=None, min_split_gain=0.0, 
                              min_child_weight=para2['min_child_weight'],
                              min_child_samples=para2['min_child_samples'], 
                              subsample=1.0, 
                              subsample_freq=0, colsample_bytree=1.0, reg_alpha=para2['reg_alpha'], 
                              reg_lambda=para2['reg_lambda'], random_state=None, n_jobs=-1, importance_type='split')
clf2.fit(X_Pool, y_Pool)
y_pre=clf2.predict(X_test)
computeMetric(y_tru,y_pre)

0.9954394845130189
acc: 0.9971400265435445
balanced acc: 0.9958182755942078
precision: 0.9976327575175944
recall: 0.9926156980075116
cm: [[37751    37]
 [  116 15593]]
f1: 0.9951179042088134
MCC: 0.9931018950240861
Kappa: 0.9930955841371559


0.9930955841371559

In [13]:
# clf3 gradientBoostingClassifier
clf3 = lgb.LGBMClassifier()

dic3 = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 900, 1000, 1500, 2000],
              'num_leaves': range(6, 50), 
              'min_child_samples': range(10, 200, 10), 
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
para3=getPar(clf3,dic3,test_set,10)

    
clf3 = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=para3['num_leaves'], max_depth=-1, 
                              learning_rate=para3['learning_rate'], n_estimators=para3['n_estimators'], 
                              subsample_for_bin=200000, 
                              objective=None, class_weight=None, min_split_gain=0.0, 
                              min_child_weight=para3['min_child_weight'],
                              min_child_samples=para3['min_child_samples'], 
                              subsample=1.0, 
                              subsample_freq=0, colsample_bytree=1.0, reg_alpha=para3['reg_alpha'], 
                              reg_lambda=para3['reg_lambda'], random_state=None, n_jobs=-1, importance_type='split')
clf3.fit(X_Pool, y_Pool)
y_pre=clf3.predict(X_test)
computeMetric(y_tru,y_pre)

0.9953436238428885
acc: 0.9971400265435445
balanced acc: 0.9957624840718265
precision: 0.9978238607270865
recall: 0.9924247246801197
cm: [[37754    34]
 [  119 15590]]
f1: 0.995116969329461
MCC: 0.9931021200458625
Kappa: 0.9930948125016412


0.9930948125016412

In [14]:
#clf4 catboost
clf4 = CatBoostClassifier(loss_function='Logloss')
# dic4 = {'learning_rate': [0.03, 0.1],
#         'depth': [4, 6, 10],
#         'l2_leaf_reg': [1, 3, 5, 7, 9]}
# #para3=getPar(clf4,dic4,test_set,10)

# grid_search_result = clf4.grid_search(dic4, 
#                                        X=X_Pool, 
#                                        y=y_Pool)

clf4.fit(X_Pool, y_Pool)
y_pre=clf4.predict(X_test)
computeMetric(y_tru,y_pre)

Learning rate set to 0.080864
0:	learn: 0.4817185	total: 71.3ms	remaining: 1m 11s
1:	learn: 0.3345190	total: 81.6ms	remaining: 40.7s
2:	learn: 0.2350744	total: 92.5ms	remaining: 30.7s
3:	learn: 0.1719374	total: 103ms	remaining: 25.5s
4:	learn: 0.1301573	total: 112ms	remaining: 22.4s
5:	learn: 0.1021513	total: 122ms	remaining: 20.2s
6:	learn: 0.0826860	total: 132ms	remaining: 18.7s
7:	learn: 0.0673160	total: 142ms	remaining: 17.7s
8:	learn: 0.0561177	total: 154ms	remaining: 16.9s
9:	learn: 0.0483988	total: 165ms	remaining: 16.3s
10:	learn: 0.0419532	total: 175ms	remaining: 15.7s
11:	learn: 0.0381434	total: 186ms	remaining: 15.4s
12:	learn: 0.0352039	total: 196ms	remaining: 14.9s
13:	learn: 0.0321725	total: 208ms	remaining: 14.7s
14:	learn: 0.0300095	total: 218ms	remaining: 14.3s
15:	learn: 0.0279608	total: 228ms	remaining: 14s
16:	learn: 0.0261395	total: 239ms	remaining: 13.8s
17:	learn: 0.0249144	total: 249ms	remaining: 13.6s
18:	learn: 0.0236429	total: 258ms	remaining: 13.3s
19:	learn

161:	learn: 0.0088879	total: 1.78s	remaining: 9.21s
162:	learn: 0.0088775	total: 1.79s	remaining: 9.21s
163:	learn: 0.0088587	total: 1.8s	remaining: 9.2s
164:	learn: 0.0088453	total: 1.82s	remaining: 9.2s
165:	learn: 0.0088322	total: 1.83s	remaining: 9.19s
166:	learn: 0.0088165	total: 1.84s	remaining: 9.2s
167:	learn: 0.0088051	total: 1.85s	remaining: 9.18s
168:	learn: 0.0087957	total: 1.86s	remaining: 9.17s
169:	learn: 0.0087841	total: 1.88s	remaining: 9.16s
170:	learn: 0.0087841	total: 1.89s	remaining: 9.14s
171:	learn: 0.0087543	total: 1.9s	remaining: 9.13s
172:	learn: 0.0087363	total: 1.91s	remaining: 9.11s
173:	learn: 0.0087247	total: 1.92s	remaining: 9.1s
174:	learn: 0.0087069	total: 1.93s	remaining: 9.08s
175:	learn: 0.0086803	total: 1.94s	remaining: 9.07s
176:	learn: 0.0086637	total: 1.95s	remaining: 9.06s
177:	learn: 0.0086572	total: 1.96s	remaining: 9.04s
178:	learn: 0.0086481	total: 1.97s	remaining: 9.02s
179:	learn: 0.0086357	total: 1.98s	remaining: 9.01s
180:	learn: 0.0086

327:	learn: 0.0075581	total: 3.4s	remaining: 6.96s
328:	learn: 0.0075581	total: 3.41s	remaining: 6.95s
329:	learn: 0.0075581	total: 3.42s	remaining: 6.93s
330:	learn: 0.0075581	total: 3.42s	remaining: 6.92s
331:	learn: 0.0075581	total: 3.43s	remaining: 6.91s
332:	learn: 0.0075581	total: 3.44s	remaining: 6.89s
333:	learn: 0.0075581	total: 3.45s	remaining: 6.88s
334:	learn: 0.0075581	total: 3.46s	remaining: 6.87s
335:	learn: 0.0075581	total: 3.47s	remaining: 6.85s
336:	learn: 0.0075581	total: 3.48s	remaining: 6.84s
337:	learn: 0.0075581	total: 3.48s	remaining: 6.82s
338:	learn: 0.0075581	total: 3.49s	remaining: 6.81s
339:	learn: 0.0075581	total: 3.5s	remaining: 6.79s
340:	learn: 0.0075581	total: 3.51s	remaining: 6.78s
341:	learn: 0.0075581	total: 3.52s	remaining: 6.76s
342:	learn: 0.0075565	total: 3.52s	remaining: 6.75s
343:	learn: 0.0075565	total: 3.53s	remaining: 6.74s
344:	learn: 0.0075565	total: 3.54s	remaining: 6.72s
345:	learn: 0.0075482	total: 3.55s	remaining: 6.71s
346:	learn: 0.

499:	learn: 0.0075323	total: 4.8s	remaining: 4.8s
500:	learn: 0.0075323	total: 4.81s	remaining: 4.79s
501:	learn: 0.0075323	total: 4.82s	remaining: 4.78s
502:	learn: 0.0075323	total: 4.83s	remaining: 4.77s
503:	learn: 0.0075323	total: 4.84s	remaining: 4.76s
504:	learn: 0.0075323	total: 4.85s	remaining: 4.75s
505:	learn: 0.0075323	total: 4.86s	remaining: 4.74s
506:	learn: 0.0075323	total: 4.86s	remaining: 4.73s
507:	learn: 0.0075323	total: 4.87s	remaining: 4.72s
508:	learn: 0.0075323	total: 4.88s	remaining: 4.71s
509:	learn: 0.0075323	total: 4.89s	remaining: 4.7s
510:	learn: 0.0075323	total: 4.89s	remaining: 4.68s
511:	learn: 0.0075323	total: 4.9s	remaining: 4.67s
512:	learn: 0.0075323	total: 4.91s	remaining: 4.66s
513:	learn: 0.0075323	total: 4.92s	remaining: 4.65s
514:	learn: 0.0075323	total: 4.93s	remaining: 4.64s
515:	learn: 0.0075323	total: 4.93s	remaining: 4.63s
516:	learn: 0.0075323	total: 4.94s	remaining: 4.62s
517:	learn: 0.0075323	total: 4.95s	remaining: 4.61s
518:	learn: 0.00

673:	learn: 0.0075323	total: 6.2s	remaining: 3s
674:	learn: 0.0075323	total: 6.21s	remaining: 2.99s
675:	learn: 0.0075323	total: 6.22s	remaining: 2.98s
676:	learn: 0.0075323	total: 6.22s	remaining: 2.97s
677:	learn: 0.0075323	total: 6.23s	remaining: 2.96s
678:	learn: 0.0075323	total: 6.24s	remaining: 2.95s
679:	learn: 0.0075323	total: 6.25s	remaining: 2.94s
680:	learn: 0.0075323	total: 6.26s	remaining: 2.93s
681:	learn: 0.0075323	total: 6.27s	remaining: 2.92s
682:	learn: 0.0075323	total: 6.28s	remaining: 2.91s
683:	learn: 0.0075323	total: 6.28s	remaining: 2.9s
684:	learn: 0.0075323	total: 6.29s	remaining: 2.89s
685:	learn: 0.0075323	total: 6.3s	remaining: 2.88s
686:	learn: 0.0075323	total: 6.31s	remaining: 2.87s
687:	learn: 0.0075323	total: 6.31s	remaining: 2.86s
688:	learn: 0.0075323	total: 6.32s	remaining: 2.85s
689:	learn: 0.0075323	total: 6.33s	remaining: 2.84s
690:	learn: 0.0075323	total: 6.34s	remaining: 2.83s
691:	learn: 0.0075323	total: 6.34s	remaining: 2.82s
692:	learn: 0.0075

846:	learn: 0.0075323	total: 7.6s	remaining: 1.37s
847:	learn: 0.0075323	total: 7.61s	remaining: 1.36s
848:	learn: 0.0075323	total: 7.62s	remaining: 1.35s
849:	learn: 0.0075323	total: 7.63s	remaining: 1.34s
850:	learn: 0.0075323	total: 7.63s	remaining: 1.34s
851:	learn: 0.0075323	total: 7.64s	remaining: 1.33s
852:	learn: 0.0075323	total: 7.65s	remaining: 1.32s
853:	learn: 0.0075323	total: 7.66s	remaining: 1.31s
854:	learn: 0.0075323	total: 7.67s	remaining: 1.3s
855:	learn: 0.0075323	total: 7.67s	remaining: 1.29s
856:	learn: 0.0075323	total: 7.68s	remaining: 1.28s
857:	learn: 0.0075323	total: 7.69s	remaining: 1.27s
858:	learn: 0.0075323	total: 7.7s	remaining: 1.26s
859:	learn: 0.0075323	total: 7.71s	remaining: 1.25s
860:	learn: 0.0075323	total: 7.71s	remaining: 1.25s
861:	learn: 0.0075323	total: 7.72s	remaining: 1.24s
862:	learn: 0.0075323	total: 7.73s	remaining: 1.23s
863:	learn: 0.0075323	total: 7.74s	remaining: 1.22s
864:	learn: 0.0075323	total: 7.75s	remaining: 1.21s
865:	learn: 0.0

0.9935040249527429

# Committee 

In [15]:
learner1 = al(clf1,X_initial,y_initial)
learner2 = al(clf2,X_initial,y_initial)
learner3 = al(clf2,X_initial,y_initial)
learner4 = al(clf4,X_initial,y_initial)

Learning rate set to 0.003854
0:	learn: 0.6882381	total: 1.17ms	remaining: 1.17s
1:	learn: 0.6829336	total: 1.98ms	remaining: 986ms
2:	learn: 0.6779247	total: 2.82ms	remaining: 938ms
3:	learn: 0.6733793	total: 3.67ms	remaining: 914ms
4:	learn: 0.6685902	total: 4.48ms	remaining: 891ms
5:	learn: 0.6647157	total: 5.22ms	remaining: 865ms
6:	learn: 0.6595798	total: 6.06ms	remaining: 859ms
7:	learn: 0.6543432	total: 6.91ms	remaining: 857ms
8:	learn: 0.6501704	total: 7.72ms	remaining: 850ms
9:	learn: 0.6458704	total: 8.62ms	remaining: 853ms
10:	learn: 0.6414314	total: 9.46ms	remaining: 851ms
11:	learn: 0.6363489	total: 10.2ms	remaining: 844ms
12:	learn: 0.6312507	total: 11.1ms	remaining: 844ms
13:	learn: 0.6273616	total: 11.9ms	remaining: 838ms
14:	learn: 0.6235983	total: 12.7ms	remaining: 835ms
15:	learn: 0.6203496	total: 13.5ms	remaining: 832ms
16:	learn: 0.6169829	total: 14.3ms	remaining: 826ms
17:	learn: 0.6133022	total: 15.2ms	remaining: 829ms
18:	learn: 0.6092069	total: 16ms	remaining: 

201:	learn: 0.2074871	total: 180ms	remaining: 712ms
202:	learn: 0.2064427	total: 181ms	remaining: 712ms
203:	learn: 0.2054852	total: 182ms	remaining: 712ms
204:	learn: 0.2045981	total: 183ms	remaining: 711ms
205:	learn: 0.2034313	total: 184ms	remaining: 710ms
206:	learn: 0.2024861	total: 185ms	remaining: 708ms
207:	learn: 0.2017396	total: 186ms	remaining: 707ms
208:	learn: 0.2008043	total: 186ms	remaining: 706ms
209:	learn: 0.1998877	total: 187ms	remaining: 704ms
210:	learn: 0.1988560	total: 188ms	remaining: 703ms
211:	learn: 0.1979234	total: 189ms	remaining: 701ms
212:	learn: 0.1968465	total: 189ms	remaining: 700ms
213:	learn: 0.1961680	total: 190ms	remaining: 698ms
214:	learn: 0.1952198	total: 191ms	remaining: 697ms
215:	learn: 0.1939586	total: 192ms	remaining: 696ms
216:	learn: 0.1931366	total: 192ms	remaining: 694ms
217:	learn: 0.1922257	total: 193ms	remaining: 693ms
218:	learn: 0.1913771	total: 194ms	remaining: 692ms
219:	learn: 0.1906198	total: 195ms	remaining: 691ms
220:	learn: 

361:	learn: 0.1091218	total: 299ms	remaining: 528ms
362:	learn: 0.1089272	total: 300ms	remaining: 527ms
363:	learn: 0.1085133	total: 301ms	remaining: 526ms
364:	learn: 0.1079445	total: 302ms	remaining: 525ms
365:	learn: 0.1076263	total: 302ms	remaining: 524ms
366:	learn: 0.1072701	total: 303ms	remaining: 523ms
367:	learn: 0.1069781	total: 304ms	remaining: 522ms
368:	learn: 0.1065270	total: 305ms	remaining: 521ms
369:	learn: 0.1061196	total: 306ms	remaining: 520ms
370:	learn: 0.1058652	total: 306ms	remaining: 520ms
371:	learn: 0.1056198	total: 307ms	remaining: 519ms
372:	learn: 0.1053099	total: 308ms	remaining: 518ms
373:	learn: 0.1048785	total: 309ms	remaining: 517ms
374:	learn: 0.1044399	total: 309ms	remaining: 516ms
375:	learn: 0.1042279	total: 310ms	remaining: 515ms
376:	learn: 0.1038718	total: 311ms	remaining: 514ms
377:	learn: 0.1034793	total: 312ms	remaining: 513ms
378:	learn: 0.1030780	total: 313ms	remaining: 512ms
379:	learn: 0.1028867	total: 313ms	remaining: 511ms
380:	learn: 

610:	learn: 0.0576686	total: 482ms	remaining: 307ms
611:	learn: 0.0575544	total: 483ms	remaining: 306ms
612:	learn: 0.0573720	total: 483ms	remaining: 305ms
613:	learn: 0.0572720	total: 484ms	remaining: 304ms
614:	learn: 0.0571602	total: 485ms	remaining: 304ms
615:	learn: 0.0570176	total: 486ms	remaining: 303ms
616:	learn: 0.0569332	total: 486ms	remaining: 302ms
617:	learn: 0.0568241	total: 487ms	remaining: 301ms
618:	learn: 0.0567225	total: 488ms	remaining: 300ms
619:	learn: 0.0566329	total: 489ms	remaining: 300ms
620:	learn: 0.0565197	total: 490ms	remaining: 299ms
621:	learn: 0.0563896	total: 490ms	remaining: 298ms
622:	learn: 0.0562686	total: 491ms	remaining: 297ms
623:	learn: 0.0561511	total: 492ms	remaining: 296ms
624:	learn: 0.0560991	total: 493ms	remaining: 296ms
625:	learn: 0.0559843	total: 493ms	remaining: 295ms
626:	learn: 0.0558641	total: 494ms	remaining: 294ms
627:	learn: 0.0557276	total: 495ms	remaining: 293ms
628:	learn: 0.0556403	total: 496ms	remaining: 292ms
629:	learn: 

855:	learn: 0.0373020	total: 664ms	remaining: 112ms
856:	learn: 0.0372159	total: 665ms	remaining: 111ms
857:	learn: 0.0371118	total: 666ms	remaining: 110ms
858:	learn: 0.0370756	total: 667ms	remaining: 109ms
859:	learn: 0.0370260	total: 668ms	remaining: 109ms
860:	learn: 0.0369790	total: 668ms	remaining: 108ms
861:	learn: 0.0369322	total: 669ms	remaining: 107ms
862:	learn: 0.0368937	total: 670ms	remaining: 106ms
863:	learn: 0.0368427	total: 671ms	remaining: 106ms
864:	learn: 0.0367970	total: 671ms	remaining: 105ms
865:	learn: 0.0367620	total: 672ms	remaining: 104ms
866:	learn: 0.0367215	total: 673ms	remaining: 103ms
867:	learn: 0.0366569	total: 674ms	remaining: 102ms
868:	learn: 0.0365929	total: 675ms	remaining: 102ms
869:	learn: 0.0365325	total: 675ms	remaining: 101ms
870:	learn: 0.0364096	total: 676ms	remaining: 100ms
871:	learn: 0.0363465	total: 677ms	remaining: 99.4ms
872:	learn: 0.0363035	total: 678ms	remaining: 98.6ms
873:	learn: 0.0362583	total: 679ms	remaining: 97.8ms
874:	lear

In [16]:
# assembling the committee
def com(lists):
    X_L = X_initial.copy()
    y_L = y_initial.copy()
    X_U,y_U =X_re.copy(),y_re.copy()
    committee = Committee(learner_list=lists,
                          query_strategy=vote_entropy_sampling)
    print(len(X_L))
    y_pre=committee.predict(X_test)
    unqueried_kappa=cohen_kappa_score(y_tru, y_pre)
    unqueried_f1=f1_score(y_tru,y_pre)
    kappa_history = [unqueried_kappa]
    f1_history = [unqueried_f1]
    # query by committee
    n_queries = 200
    for idx in range(n_queries):
        query_index, query_instance = committee.query(X_U)
        X, y = X_U[query_index].reshape(1, -1), y_U[query_index].reshape(1, )
        committee.teach(X=X,y=y)
        y_pre=committee.predict(X_test)
        kappa=cohen_kappa_score(y_tru, y_pre)
        f1=f1_score(y_tru,y_pre)
        print(idx+1,y,"-------------------->",kappa)
        print(idx+1,y,"-------------------->",f1)
        kappa_history.append(kappa)
        f1_history.append(f1)
    
        # remove queried instance from pool
        X_U = np.delete(X_U, query_index, axis=0)
        y_U = np.delete(y_U, query_index)
        
    df_scores= pd.concat([pd.DataFrame(kappa_history,columns=['kappa']), 
                          pd.DataFrame(f1_history,columns=['F1'])],
                         axis=1)
    
        
    return df_scores

In [17]:
# learner_list1=[]
# learner_list1.append(learner1)
# learner_list1.append(learner2)
# knn_light=com(learner_list1)
# metric1= knn_light.rename(columns={'kappa': 'Knn_light_kappa', 'F1': 'Knn_light_F1'})



# learner_list2=[]
# learner_list2.append(learner1)
# learner_list2.append(learner3)
# Knn_gradient=com(learner_list2)
# metric2= Knn_gradient.rename(columns={'kappa': 'Knn_gradient_kappa', 'F1': 'Knn_gradient_F1'})


# learner_list3=[]
# learner_list3.append(learner1)
# learner_list3.append(learner4)
# Knn_catboost=com(learner_list3)
# metric3=Knn_catboost.rename(columns={'kappa': 'Knn_catboost_kappa', 'F1': 'Knn_catboost_F1'})

# learner_list4=[]
# learner_list4.append(learner2)
# learner_list4.append(learner3)
# light_gradient=com(learner_list4)
# metric4=light_gradient.rename(columns={'kappa': 'light_gradient_kappa', 'F1': 'light_gradient_F1'})


# learner_list5=[]
# learner_list5.append(learner2)
# learner_list5.append(learner4)
# light_catboost=com(learner_list5)
# metric5=light_catboost.rename(columns={'kappa': 'light_catboost_kappa', 'F1': 'light_catboost_F1'})


# learner_list6=[]
# learner_list6.append(learner3)
# learner_list6.append(learner4)
# gradient_catboost=com(learner_list6)
# metric6=gradient_catboost.rename(columns={'kappa': 'gradient_catboost_kappa', 'F1': 'gradient_catboost_F1'})

In [18]:
# 3
# learner_list7=[]
# learner_list7.append(learner2)
# learner_list7.append(learner3)
# learner_list7.append(learner4)
# lg_gb_cb=com(learner_list7)
# metric7=lg_gb_cb.rename(columns={'kappa': 'lg_gb_cb_kappa', 'F1': 'lg_gb_cb_F1'})


# learner_list8=[]
# learner_list8.append(learner1)
# learner_list8.append(learner3)
# learner_list8.append(learner4)
# Knn_gb_cb=com(learner_list8)
# metric8=Knn_gb_cb.rename(columns={'kappa': 'Knn_gb_cb_kappa', 'F1': 'Knn_gb_cb_F1'})

# learner_list9=[]
# learner_list9.append(learner1)
# learner_list9.append(learner2)
# learner_list9.append(learner4)
# Knn_lg_cb=com(learner_list9)
# metric9=Knn_lg_cb.rename(columns={'kappa': 'Knn_lg_cb_kappa', 'F1': 'Knn_lg_cb_F1'})


learner_list10=[]
learner_list10.append(learner1)
learner_list10.append(learner2)
learner_list10.append(learner3)
Knn_lg_gb=com(learner_list10)
metric10=Knn_lg_gb.rename(columns={'kappa': 'Knn_lg_gb_kappa', 'F1': 'Knn_lg_gb_F1'})

100
1 [1] --------------------> 0.7245437310596672
1 [1] --------------------> 0.800580277318579
2 [1] --------------------> 0.7125687391296589
2 [1] --------------------> 0.7828316610925307
3 [1] --------------------> 0.7164824755591277
3 [1] --------------------> 0.7862058758232813
4 [0] --------------------> 0.7220589257041365
4 [0] --------------------> 0.7898459018834215
5 [1] --------------------> 0.7218716864822757
5 [1] --------------------> 0.7896794370602033
6 [1] --------------------> 0.7212224714723868
6 [1] --------------------> 0.7891207153502234
7 [0] --------------------> 0.7212224714723868
7 [0] --------------------> 0.7891207153502234
8 [1] --------------------> 0.7222106865551212
8 [1] --------------------> 0.7899322362052276
9 [1] --------------------> 0.7220347368229048
9 [1] --------------------> 0.7897814024503781
10 [0] --------------------> 0.7189072996639583
10 [0] --------------------> 0.7871203914683799
11 [0] --------------------> 0.7187743048909477
11 [0] 

87 [1] --------------------> 0.8924074470752775
87 [1] --------------------> 0.9233369081277826
88 [1] --------------------> 0.8967425516072116
88 [1] --------------------> 0.9265210911682759
89 [1] --------------------> 0.899634796109849
89 [1] --------------------> 0.9286384065703108
90 [0] --------------------> 0.9006378483610948
90 [0] --------------------> 0.9293295366295625
91 [0] --------------------> 0.8963994234737742
91 [0] --------------------> 0.9263579668465441
92 [0] --------------------> 0.8947036634387702
92 [0] --------------------> 0.9250987246714573
93 [1] --------------------> 0.8949885204580625
93 [1] --------------------> 0.9253074433656958
94 [0] --------------------> 0.8982223957176768
94 [0] --------------------> 0.9275859835481571
95 [1] --------------------> 0.8967446240729308
95 [1] --------------------> 0.9265704858640098
96 [0] --------------------> 0.8966778006264542
96 [0] --------------------> 0.9265119891272693
97 [0] --------------------> 0.8953806439

172 [0] --------------------> 0.9624995875368276
172 [0] --------------------> 0.9733685839566675
173 [1] --------------------> 0.9629605278680742
173 [1] --------------------> 0.973699477857281
174 [0] --------------------> 0.9623330391471142
174 [0] --------------------> 0.9732347528492556
175 [0] --------------------> 0.9622421659001833
175 [0] --------------------> 0.9731701804797728
176 [0] --------------------> 0.9622868953137873
176 [0] --------------------> 0.9732016014464677
177 [1] --------------------> 0.9622868953137873
177 [1] --------------------> 0.9732016014464677
178 [1] --------------------> 0.9622868953137873
178 [1] --------------------> 0.9732016014464677
179 [1] --------------------> 0.9622868953137873
179 [1] --------------------> 0.9732016014464677
180 [1] --------------------> 0.9622868953137873
180 [1] --------------------> 0.9732016014464677
181 [1] --------------------> 0.9622868953137873
181 [1] --------------------> 0.9732016014464677
182 [0] -------------

In [19]:
# 4
# learner_list11=[]
# learner_list11.append(learner1)
# learner_list11.append(learner2)
# learner_list11.append(learner3)
# learner_list11.append(learner4)
# Knn_lg_gb_cb=com(learner_list11)
# metric11=Knn_lg_gb_cb.rename(columns={'kappa': 'Knn_lg_gb_cb_kappa', 'F1': 'Knn_lg_gb_cb_F1'})

# save

In [20]:
#metric11.to_csv("./result/random_3901890_committee.csv")

In [21]:
df=pd.read_csv("./result/random_3901890_committee.csv",index_col=0)
df

Unnamed: 0,Knn_lg_gb_cb_kappa,Knn_lg_gb_cb_F1,Knn_light_kappa,Knn_light_F1,Knn_gradient_kappa,Knn_gradient_F1,Knn_catboost_kappa,Knn_catboost_F1,light_gradient_kappa,light_gradient_F1,light_catboost_kappa,light_catboost_F1,gradient_catboost_kappa,gradient_catboost_F1,lg_gb_cb_kappa,lg_gb_cb_F1,Knn_gb_cb_kappa,Knn_gb_cb_F1,Knn_lg_cb_kappa,Knn_lg_cb_F1
0,0.786853,0.841095,0.724544,0.800580,0.724544,0.800580,0.724544,0.800580,0.790388,0.847683,0.932494,0.951683,0.932494,0.951683,0.636296,0.712377,0.775576,0.833860,0.844307,0.887106
1,0.785580,0.840107,0.724544,0.800580,0.724544,0.800580,0.729418,0.803715,0.790398,0.847670,0.933537,0.952447,0.933537,0.952447,0.743082,0.803716,0.775038,0.833540,0.854763,0.895663
2,0.850370,0.891083,0.723842,0.800121,0.723842,0.800121,0.753729,0.822944,0.790398,0.847670,0.935574,0.953938,0.935574,0.953938,0.831857,0.875081,0.775188,0.833682,0.857590,0.897578
3,0.840783,0.885933,0.727967,0.803375,0.727967,0.803375,0.759364,0.829264,0.803551,0.856543,0.936343,0.954507,0.936343,0.954507,0.861807,0.898275,0.779266,0.836955,0.856265,0.896682
4,0.860178,0.899266,0.732840,0.806510,0.732840,0.806510,0.757945,0.828342,0.803688,0.856721,0.934897,0.953484,0.934897,0.953484,0.883270,0.914635,0.779458,0.837040,0.854667,0.895494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0.986847,0.990711,0.846949,0.890669,0.881217,0.916185,0.927976,0.948864,0.966251,0.976070,0.988339,0.991745,0.988339,0.991745,0.988339,0.991745,0.978971,0.985107,0.988701,0.992002
197,0.989627,0.992668,0.843738,0.888490,0.881019,0.916037,0.928176,0.949016,0.966205,0.976038,0.988928,0.992163,0.988928,0.992163,0.988928,0.992163,0.981664,0.987005,0.988838,0.992100
198,0.985369,0.989672,0.843738,0.888490,0.883502,0.917860,0.926713,0.947949,0.966251,0.976070,0.988520,0.991873,0.988520,0.991873,0.988520,0.991873,0.982882,0.987865,0.988885,0.992134
199,0.988240,0.991693,0.843524,0.888345,0.885197,0.919093,0.928220,0.949047,0.966251,0.976070,0.988565,0.991905,0.988565,0.991905,0.988565,0.991905,0.981176,0.986664,0.988839,0.992100


In [22]:
merged_df = pd.concat([df, metric10], axis=1)
merged_df 

Unnamed: 0,Knn_lg_gb_cb_kappa,Knn_lg_gb_cb_F1,Knn_light_kappa,Knn_light_F1,Knn_gradient_kappa,Knn_gradient_F1,Knn_catboost_kappa,Knn_catboost_F1,light_gradient_kappa,light_gradient_F1,...,gradient_catboost_kappa,gradient_catboost_F1,lg_gb_cb_kappa,lg_gb_cb_F1,Knn_gb_cb_kappa,Knn_gb_cb_F1,Knn_lg_cb_kappa,Knn_lg_cb_F1,Knn_lg_gb_kappa,Knn_lg_gb_F1
0,0.786853,0.841095,0.724544,0.800580,0.724544,0.800580,0.724544,0.800580,0.790388,0.847683,...,0.932494,0.951683,0.636296,0.712377,0.775576,0.833860,0.844307,0.887106,0.724544,0.800580
1,0.785580,0.840107,0.724544,0.800580,0.724544,0.800580,0.729418,0.803715,0.790398,0.847670,...,0.933537,0.952447,0.743082,0.803716,0.775038,0.833540,0.854763,0.895663,0.724544,0.800580
2,0.850370,0.891083,0.723842,0.800121,0.723842,0.800121,0.753729,0.822944,0.790398,0.847670,...,0.935574,0.953938,0.831857,0.875081,0.775188,0.833682,0.857590,0.897578,0.712569,0.782832
3,0.840783,0.885933,0.727967,0.803375,0.727967,0.803375,0.759364,0.829264,0.803551,0.856543,...,0.936343,0.954507,0.861807,0.898275,0.779266,0.836955,0.856265,0.896682,0.716482,0.786206
4,0.860178,0.899266,0.732840,0.806510,0.732840,0.806510,0.757945,0.828342,0.803688,0.856721,...,0.934897,0.953484,0.883270,0.914635,0.779458,0.837040,0.854667,0.895494,0.722059,0.789846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0.986847,0.990711,0.846949,0.890669,0.881217,0.916185,0.927976,0.948864,0.966251,0.976070,...,0.988339,0.991745,0.988339,0.991745,0.978971,0.985107,0.988701,0.992002,0.969592,0.978433
197,0.989627,0.992668,0.843738,0.888490,0.881019,0.916037,0.928176,0.949016,0.966205,0.976038,...,0.988928,0.992163,0.988928,0.992163,0.981664,0.987005,0.988838,0.992100,0.969592,0.978433
198,0.985369,0.989672,0.843738,0.888490,0.883502,0.917860,0.926713,0.947949,0.966251,0.976070,...,0.988520,0.991873,0.988520,0.991873,0.982882,0.987865,0.988885,0.992134,0.970030,0.978737
199,0.988240,0.991693,0.843524,0.888345,0.885197,0.919093,0.928220,0.949047,0.966251,0.976070,...,0.988565,0.991905,0.988565,0.991905,0.981176,0.986664,0.988839,0.992100,0.971362,0.979690


In [23]:
merged_df.to_csv("./result/random_3901890_committee.csv")