In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns


from sklearn.datasets import make_blobs
from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import matthews_corrcoef,cohen_kappa_score,balanced_accuracy_score


import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling,margin_sampling,entropy_sampling
from modAL.disagreement import KL_max_disagreement
from modAL.batch import uncertainty_batch_sampling

from functools import partial


import warnings

warnings.filterwarnings('ignore')

In [2]:
data_source="./dataset/3901890_all.csv"
data= pd.read_csv(data_source)
data

Unnamed: 0,datetime,latitude,longitude,pressure,salinity,temperature,label
0,-1.873862,1.261588,0.939612,0.676593,0.967276,-0.896749,0
1,-1.873862,1.261588,0.939612,0.670563,0.967218,-0.896749,0
2,-1.873862,1.261588,0.939612,0.661519,0.967218,-0.895699,0
3,-1.873862,1.261588,0.939612,0.653336,0.967218,-0.894649,0
4,-1.873862,1.261588,0.939612,0.644723,0.967218,-0.894123,0
...,...,...,...,...,...,...,...
178316,1.563162,-1.925879,-1.540014,-1.242509,-1.141878,1.222459,0
178317,1.563162,-1.925879,-1.540014,-1.246816,-1.141065,1.221934,0
178318,1.563162,-1.925879,-1.540014,-1.251122,-1.141065,1.223509,0
178319,1.563162,-1.925879,-1.540014,-1.256291,-1.138048,1.225610,0


In [3]:
row,col=data.shape

In [4]:
#def compute ratio
def comp_ratio(data):
    instance = data[(data['label']==1)]
    rate=len(instance)/len(data)*100
    print(rate)
    return rate

In [5]:
dirty_ratio = comp_ratio(data)
dirty_ratio

29.364460719713325


29.364460719713325

In [None]:
def plotCluster(data,name1,name2,name3,label,start,end):   
    %matplotlib notebook

    data1=data.loc[data[label] == 1]

    x1 = data1.loc[:,[name1]].iloc[start:end, :]  
    y1 = data1.loc[:,[name2]].iloc[start:end, :]  
    z1 = data1.loc[:,[name3]].iloc[start:end, :]  


    data2=data.loc[data[label] == 0]

    x2 = data2.loc[:,[name1]].iloc[start:end, :]  
    y2 = data2.loc[:,[name2]].iloc[start:end, :]  
    z2 = data2.loc[:,[name3]].iloc[start:end, :]  

    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    ax.scatter(x1, y1, z1, c='r',s=10, label='Low-quality Data')
    ax.scatter(x2, y2, z2, c='g',s=10, label='Normal Data')
    
    ax.legend(loc='best')

    ax.set_zlabel(name1, fontdict={'size': 15, 'color': 'black'})
    ax.set_ylabel(name2, fontdict={'size': 15, 'color': 'black'})
    ax.set_xlabel(name3, fontdict={'size': 15, 'color': 'black'})
    plt.show()
    plt.savefig('3901890_scatter.png')
    
    #violin
#     fig = px.violin(data, y=name1,color=label)
#     fig.show()
#     fig = px.violin(data, y=name2,color=label)
#     fig.show()
#     fig = px.violin(data, y=name3,color=label)
#     fig.show()
    


In [None]:
dataplot=data.copy()

plotCluster(dataplot,'temperature','pressure','salinity','label',0,data.shape[0])

# Splite Dataset

In [6]:
train_set=pd.read_csv('./randomtrain/3901890.csv')
test_set=pd.read_csv('./randomtest/3901890.csv')
len(train_set),len(test_set)

(124622, 53497)

In [7]:
#train_set2,test_set2=getLastSplit(data,0.3,dirty_ratio)

In [8]:
comp_ratio(train_set)
comp_ratio(test_set)

29.35918216687262
29.364263416640185


29.364263416640185

In [9]:
#this function use random grid search to find best parameters
def getPar(model,dist,data,niter):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import RandomizedSearchCV
    x,y=data.shape

    clf = model

    param_dist = dist
    grid = RandomizedSearchCV(clf,param_dist,cv = 3,scoring = "balanced_accuracy",n_iter=niter,n_jobs = -1)

    #train
    grid.fit(data.iloc[:,0:y-1],data.iloc[:,y-1])
    #get best parameter
    print(grid.best_score_)
    return grid.best_params_

In [10]:
# metric
def computeMetric(y_tru,y_pre):
    acc = accuracy_score(y_tru,y_pre)
    pre=precision_score(y_tru,y_pre)
    recall=recall_score(y_tru,y_pre)
    cm=confusion_matrix(y_tru,y_pre)
    f1 = f1_score(y_tru,y_pre)
    mcc=matthews_corrcoef(y_tru, y_pre)
    kappa=cohen_kappa_score(y_tru, y_pre)
    bac=balanced_accuracy_score(y_tru,y_pre)
    print("acc:",acc)
    print("balanced acc:",bac)
    print("precision:",pre)
    print("recall:",recall)
    print("cm:",cm)
    print("f1:",f1)
    print("MCC:", mcc)
    print("Kappa:",kappa)
    
    # confusion matrix
#     cmap1 = sns.diverging_palette(260,-10,s=50, l=75, n=5, as_cmap=True)
#     plt.subplots(figsize=(12,8))
#     cf_matrix = confusion_matrix(y_tru, y_pre)
#     sns.heatmap(cf_matrix/np.sum(cf_matrix), cmap = cmap1, annot = True, annot_kws = {'size':15})
    
    return kappa

In [11]:
def draw(performance_history):
    fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
    ax.plot(performance_history)
    ax.scatter(range(len(performance_history)), performance_history, s=13)
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=5, integer=True))
    ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(nbins=10))
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=1))
    ax.set_ylim(bottom=0.80, top=1)
    ax.grid(True)
    ax.set_title('Incremental classification accuracy')
    ax.set_xlabel('Query iteration')
    ax.set_ylabel('Classification Accuracy')
    plt.show()

# splite train(pool), test, Labeled, Unlabeled

In [12]:
x,y=train_set.shape
#pool 
X_Pool = train_set.iloc[:,0:y-1].values
y_Pool = train_set.iloc[:,y-1].values

In [13]:
#test set
X_test=test_set.iloc[:,0:y-1]
y_tru=test_set.iloc[:,y-1]
print(len(X_test),len(y_tru))

53497 53497


In [None]:
#Pool
def initial_data(n_initial,X_Pool,y_Pool):
    #inital
    #n_initial=1000
    initial_idx = np.random.choice(range(len(X_Pool)), size=n_initial, replace=False)
    #initial Labeled data
    #X_initial, y_initial = X_train[], y_train[initial_idx]
    X_L = X_Pool[initial_idx]
    y_L = y_Pool[initial_idx]
    # Unlabeled data
    # X_U = Pool_X[ini_num:]
    # y_U = Pool_X[ini_num:]
    X_U, y_U = np.delete(X_Pool, initial_idx, axis=0), np.delete(y_Pool, initial_idx, axis=0)
    return X_L,y_L,X_U,y_U

In [None]:
# Set RNG seed for reproducibility.
RANDOM_STATE_SEED = 123
np.random.seed(RANDOM_STATE_SEED)

n_initial=100
N_QUERIES = 200

# Save initial training data

In [None]:
X_initial,y_initial,X_re,y_re=initial_data(n_initial,X_Pool,y_Pool)
df_X_i = pd.DataFrame(X_initial)
df_y_i = pd.DataFrame(y_initial)
df_X_r = pd.DataFrame(X_re)
df_y_r = pd.DataFrame(y_re)


df_X_i.to_csv('./randomtrain/3901890_X_initial.csv', index=False)
df_y_i.to_csv('./randomtrain/3901890_y_initial.csv', index=False)
df_X_r.to_csv('./randomtrain/3901890_X_re.csv', index=False)
df_y_r.to_csv('./randomtrain/3901890_y_re.csv', index=False)

# molAL

In [None]:
def random_sampling(classifier, X_pool):
    n_samples = len(X_pool)
    query_idx = np.random.choice(range(n_samples))
    return query_idx, X_pool[query_idx]

In [None]:
def al(clf,strategy,X_L,y_L):
    learner = ActiveLearner(estimator=clf,
                            query_strategy=strategy,
                            X_training=X_L, y_training=y_L)
    return learner

In [None]:
def al_learn(clf,sampling,X_initial,y_initial,X_re,y_re):
    X_L = X_initial.copy()
    y_L = y_initial.copy()
    X_U,y_U =X_re.copy(),y_re.copy()
    #print(len(X_U),len(y_U))
    learner = al(clf,sampling,X_L,y_L)
    y_pre=learner.predict(X_test)
    unqueried_score=cohen_kappa_score(y_tru, y_pre)
    #unqueried_score=f1_score(y_tru,y_pre)
    print("unqueried --------------------->",unqueried_score)
    performance_history = [unqueried_score]
    # Query
    for index in range(N_QUERIES):
        query_index, query_instance = learner.query(X_U)
        # Teach ActiveLearner model the record it has requested.
        X, y = X_U[query_index].reshape(1, -1), y_U[query_index].reshape(1, )
        print(index+1,"query label --------------------->",y)
        learner.teach(X=X, y=y)
        
        X_U, y_U = np.delete(X_U, query_index, axis=0), np.delete(y_U, query_index)
        y_pre=learner.predict(X_test)
        kappa=cohen_kappa_score(y_tru, y_pre)
        #f1=f1_score(y_tru,y_pre)
        print(index+1,"-------------------->",kappa)
        #print(index+1,"-------------------->",f1)
        # Recall precision F1
        #performance_history.append(f1)
        performance_history.append(kappa)
    #print(len(X_U))
    #draw(performance_history)
    return performance_history

In [14]:
# clf1 KNN
clf1 = KNeighborsClassifier()
dic1={'n_neighbors':[1,2,3,4,5,6,7,8]}
para1=getPar(clf1,dic1,test_set,10)
clf1 = KNeighborsClassifier(n_neighbors=para1['n_neighbors'],n_jobs=-1)
clf1.fit(X_Pool, y_Pool)
y_pre=clf1.predict(X_test)
computeMetric(y_tru,y_pre)

0.9920453450872895
acc: 0.9959063125035049
balanced acc: 0.9948891909629031
precision: 0.9936265137029955
recall: 0.9924247246801197
cm: [[37688   100]
 [  119 15590]]
f1: 0.9930252555813879
MCC: 0.9901286032207376
Kappa: 0.990128240084297


0.990128240084297

In [15]:
# clf2 lightgbm
clf2 = lgb.LGBMClassifier()

dic2 = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 900, 1000, 1500, 2000],
              'num_leaves': range(6, 50), 
              'min_child_samples': range(10, 200, 10), 
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
para2=getPar(clf2,dic2,test_set,10)

    
clf2 = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=para2['num_leaves'], max_depth=-1, 
                              learning_rate=para2['learning_rate'], n_estimators=para2['n_estimators'], 
                              subsample_for_bin=200000, 
                              objective=None, class_weight=None, min_split_gain=0.0, 
                              min_child_weight=para2['min_child_weight'],
                              min_child_samples=para2['min_child_samples'], 
                              subsample=1.0, 
                              subsample_freq=0, colsample_bytree=1.0, reg_alpha=para2['reg_alpha'], 
                              reg_lambda=para2['reg_lambda'], random_state=None, n_jobs=-1, importance_type='split')
clf2.fit(X_Pool, y_Pool)
y_pre=clf2.predict(X_test)
computeMetric(y_tru,y_pre)

0.9948747966809095
acc: 0.9971587191805148
balanced acc: 0.9958315073079793
precision: 0.9976965896730438
recall: 0.9926156980075116
cm: [[37752    36]
 [  116 15593]]
f1: 0.9951496585614908
MCC: 0.993147055488078
Kappa: 0.9931405832847607


0.9931405832847607

In [16]:
# clf3 gradientBoostingClassifier
clf3 = lgb.LGBMClassifier()

dic3 = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 900, 1000, 1500, 2000],
              'num_leaves': range(6, 50), 
              'min_child_samples': range(10, 200, 10), 
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
para3=getPar(clf3,dic3,test_set,10)

    
clf3 = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=para3['num_leaves'], max_depth=-1, 
                              learning_rate=para3['learning_rate'], n_estimators=para3['n_estimators'], 
                              subsample_for_bin=200000, 
                              objective=None, class_weight=None, min_split_gain=0.0, 
                              min_child_weight=para3['min_child_weight'],
                              min_child_samples=para3['min_child_samples'], 
                              subsample=1.0, 
                              subsample_freq=0, colsample_bytree=1.0, reg_alpha=para3['reg_alpha'], 
                              reg_lambda=para3['reg_lambda'], random_state=None, n_jobs=-1, importance_type='split')
clf3.fit(X_Pool, y_Pool)
y_pre=clf3.predict(X_test)
computeMetric(y_tru,y_pre)

0.9953064436763093
acc: 0.9972147970914257
balanced acc: 0.9958154109269128
precision: 0.9980793854033291
recall: 0.9924247246801197
cm: [[37758    30]
 [  119 15590]]
f1: 0.995244023109579
MCC: 0.993282853362263
Kappa: 0.9932748392113631


0.9932748392113631

In [17]:
#clf4 catboost
clf4 = CatBoostClassifier(loss_function='Logloss')
# dic4 = {'learning_rate': [0.03, 0.1],
#         'depth': [4, 6, 10],
#         'l2_leaf_reg': [1, 3, 5, 7, 9]}
# #para3=getPar(clf4,dic4,test_set,10)

# grid_search_result = clf4.grid_search(dic4, 
#                                        X=X_Pool, 
#                                        y=y_Pool)

clf4.fit(X_Pool, y_Pool)
y_pre=clf4.predict(X_test)
computeMetric(y_tru,y_pre)

Learning rate set to 0.080864
0:	learn: 0.4817185	total: 72.1ms	remaining: 1m 12s
1:	learn: 0.3345190	total: 85ms	remaining: 42.4s
2:	learn: 0.2350744	total: 98ms	remaining: 32.6s
3:	learn: 0.1719374	total: 112ms	remaining: 27.8s
4:	learn: 0.1301573	total: 156ms	remaining: 31.1s
5:	learn: 0.1021513	total: 178ms	remaining: 29.4s
6:	learn: 0.0826860	total: 190ms	remaining: 27s
7:	learn: 0.0673160	total: 202ms	remaining: 25s
8:	learn: 0.0561177	total: 213ms	remaining: 23.5s
9:	learn: 0.0483988	total: 224ms	remaining: 22.2s
10:	learn: 0.0419532	total: 235ms	remaining: 21.1s
11:	learn: 0.0381434	total: 246ms	remaining: 20.2s
12:	learn: 0.0352039	total: 255ms	remaining: 19.3s
13:	learn: 0.0321725	total: 265ms	remaining: 18.7s
14:	learn: 0.0300095	total: 274ms	remaining: 18s
15:	learn: 0.0279608	total: 285ms	remaining: 17.5s
16:	learn: 0.0261395	total: 296ms	remaining: 17.1s
17:	learn: 0.0249144	total: 305ms	remaining: 16.7s
18:	learn: 0.0236429	total: 316ms	remaining: 16.3s
19:	learn: 0.0223

163:	learn: 0.0088587	total: 2s	remaining: 10.2s
164:	learn: 0.0088453	total: 2.01s	remaining: 10.2s
165:	learn: 0.0088322	total: 2.02s	remaining: 10.2s
166:	learn: 0.0088165	total: 2.03s	remaining: 10.1s
167:	learn: 0.0088051	total: 2.04s	remaining: 10.1s
168:	learn: 0.0087957	total: 2.05s	remaining: 10.1s
169:	learn: 0.0087841	total: 2.06s	remaining: 10.1s
170:	learn: 0.0087841	total: 2.07s	remaining: 10.1s
171:	learn: 0.0087543	total: 2.09s	remaining: 10s
172:	learn: 0.0087363	total: 2.1s	remaining: 10s
173:	learn: 0.0087247	total: 2.11s	remaining: 10s
174:	learn: 0.0087069	total: 2.12s	remaining: 10s
175:	learn: 0.0086803	total: 2.14s	remaining: 10s
176:	learn: 0.0086637	total: 2.15s	remaining: 9.99s
177:	learn: 0.0086572	total: 2.16s	remaining: 9.98s
178:	learn: 0.0086481	total: 2.17s	remaining: 9.96s
179:	learn: 0.0086357	total: 2.18s	remaining: 9.95s
180:	learn: 0.0086354	total: 2.19s	remaining: 9.93s
181:	learn: 0.0086175	total: 2.21s	remaining: 9.92s
182:	learn: 0.0086094	tota

336:	learn: 0.0075581	total: 3.81s	remaining: 7.49s
337:	learn: 0.0075581	total: 3.82s	remaining: 7.47s
338:	learn: 0.0075581	total: 3.84s	remaining: 7.48s
339:	learn: 0.0075581	total: 3.85s	remaining: 7.47s
340:	learn: 0.0075581	total: 3.85s	remaining: 7.45s
341:	learn: 0.0075581	total: 3.86s	remaining: 7.43s
342:	learn: 0.0075565	total: 3.87s	remaining: 7.42s
343:	learn: 0.0075565	total: 3.88s	remaining: 7.41s
344:	learn: 0.0075565	total: 3.89s	remaining: 7.39s
345:	learn: 0.0075482	total: 3.91s	remaining: 7.38s
346:	learn: 0.0075482	total: 3.92s	remaining: 7.37s
347:	learn: 0.0075482	total: 3.93s	remaining: 7.36s
348:	learn: 0.0075481	total: 3.94s	remaining: 7.34s
349:	learn: 0.0075481	total: 3.95s	remaining: 7.33s
350:	learn: 0.0075481	total: 3.96s	remaining: 7.31s
351:	learn: 0.0075481	total: 3.96s	remaining: 7.3s
352:	learn: 0.0075481	total: 3.98s	remaining: 7.29s
353:	learn: 0.0075481	total: 3.98s	remaining: 7.27s
354:	learn: 0.0075481	total: 4s	remaining: 7.26s
355:	learn: 0.00

512:	learn: 0.0075323	total: 5.41s	remaining: 5.14s
513:	learn: 0.0075323	total: 5.42s	remaining: 5.13s
514:	learn: 0.0075323	total: 5.43s	remaining: 5.12s
515:	learn: 0.0075323	total: 5.44s	remaining: 5.11s
516:	learn: 0.0075323	total: 5.46s	remaining: 5.1s
517:	learn: 0.0075323	total: 5.47s	remaining: 5.09s
518:	learn: 0.0075323	total: 5.48s	remaining: 5.08s
519:	learn: 0.0075323	total: 5.49s	remaining: 5.07s
520:	learn: 0.0075323	total: 5.5s	remaining: 5.06s
521:	learn: 0.0075323	total: 5.51s	remaining: 5.05s
522:	learn: 0.0075323	total: 5.52s	remaining: 5.04s
523:	learn: 0.0075323	total: 5.53s	remaining: 5.03s
524:	learn: 0.0075323	total: 5.54s	remaining: 5.02s
525:	learn: 0.0075323	total: 5.55s	remaining: 5s
526:	learn: 0.0075323	total: 5.56s	remaining: 4.99s
527:	learn: 0.0075323	total: 5.57s	remaining: 4.98s
528:	learn: 0.0075323	total: 5.58s	remaining: 4.97s
529:	learn: 0.0075323	total: 5.59s	remaining: 4.96s
530:	learn: 0.0075323	total: 5.6s	remaining: 4.95s
531:	learn: 0.0075

688:	learn: 0.0075323	total: 7.04s	remaining: 3.17s
689:	learn: 0.0075323	total: 7.04s	remaining: 3.16s
690:	learn: 0.0075323	total: 7.05s	remaining: 3.15s
691:	learn: 0.0075323	total: 7.06s	remaining: 3.14s
692:	learn: 0.0075323	total: 7.07s	remaining: 3.13s
693:	learn: 0.0075323	total: 7.08s	remaining: 3.12s
694:	learn: 0.0075323	total: 7.09s	remaining: 3.11s
695:	learn: 0.0075323	total: 7.09s	remaining: 3.1s
696:	learn: 0.0075323	total: 7.1s	remaining: 3.09s
697:	learn: 0.0075323	total: 7.11s	remaining: 3.08s
698:	learn: 0.0075323	total: 7.12s	remaining: 3.06s
699:	learn: 0.0075323	total: 7.13s	remaining: 3.05s
700:	learn: 0.0075323	total: 7.13s	remaining: 3.04s
701:	learn: 0.0075323	total: 7.14s	remaining: 3.03s
702:	learn: 0.0075323	total: 7.15s	remaining: 3.02s
703:	learn: 0.0075323	total: 7.16s	remaining: 3.01s
704:	learn: 0.0075323	total: 7.17s	remaining: 3s
705:	learn: 0.0075323	total: 7.18s	remaining: 2.99s
706:	learn: 0.0075323	total: 7.18s	remaining: 2.98s
707:	learn: 0.007

848:	learn: 0.0075323	total: 8.43s	remaining: 1.5s
849:	learn: 0.0075323	total: 8.44s	remaining: 1.49s
850:	learn: 0.0075323	total: 8.45s	remaining: 1.48s
851:	learn: 0.0075323	total: 8.46s	remaining: 1.47s
852:	learn: 0.0075323	total: 8.47s	remaining: 1.46s
853:	learn: 0.0075323	total: 8.48s	remaining: 1.45s
854:	learn: 0.0075323	total: 8.49s	remaining: 1.44s
855:	learn: 0.0075323	total: 8.5s	remaining: 1.43s
856:	learn: 0.0075323	total: 8.51s	remaining: 1.42s
857:	learn: 0.0075323	total: 8.52s	remaining: 1.41s
858:	learn: 0.0075323	total: 8.53s	remaining: 1.4s
859:	learn: 0.0075323	total: 8.54s	remaining: 1.39s
860:	learn: 0.0075323	total: 8.55s	remaining: 1.38s
861:	learn: 0.0075323	total: 8.55s	remaining: 1.37s
862:	learn: 0.0075323	total: 8.56s	remaining: 1.36s
863:	learn: 0.0075323	total: 8.57s	remaining: 1.35s
864:	learn: 0.0075323	total: 8.58s	remaining: 1.34s
865:	learn: 0.0075323	total: 8.59s	remaining: 1.33s
866:	learn: 0.0075323	total: 8.6s	remaining: 1.32s
867:	learn: 0.00

0.9935040249527429

## random sampling

In [None]:
sampling=random_sampling

In [None]:
metric01 = al_learn(clf1,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric02 = al_learn(clf2,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric03 = al_learn(clf3,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric04 = al_learn(clf4,sampling,X_initial,y_initial,X_re,y_re)

## Uncertainty

In [None]:
sampling=uncertainty_sampling

In [None]:
metric11 = al_learn(clf1,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric12 = al_learn(clf2,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric13 = al_learn(clf3,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric14 = al_learn(clf4,sampling,X_initial,y_initial,X_re,y_re)

## entropy

In [None]:
sampling=entropy_sampling

In [None]:
metric21 = al_learn(clf1,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric22 = al_learn(clf2,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric23 = al_learn(clf3,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric24 = al_learn(clf4,sampling,X_initial,y_initial,X_re,y_re)

## margin

In [None]:
sampling=margin_sampling

In [None]:
metric31 = al_learn(clf1,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric32 = al_learn(clf2,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric33 = al_learn(clf3,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric34 = al_learn(clf4,sampling,X_initial,y_initial,X_re,y_re)

# visual


In [None]:
def compare_metric(col1,col2,col3,col4):
    fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
    x = np.linspace(200, 401, 201)
    l1=plt.plot(x, df[col1])
    l2=plt.plot(x, df[col2], color='blue', linestyle='--')
    l3=plt.plot(x, df[col3], color='red', linestyle='--')
    l4=plt.plot(x, df[col4], color='green')
    plt.legend(handles=[l1,l2,l3,l4],
    labels=[col1,col2,col3,col4],loc='best')
    plt.xlim((200, 401))
    plt.ylim((0, 1))
    plt.title('f1 over Time')
    plt.xlabel('Query Iteration')
    plt.ylabel('f1')

    my_x_ticks = np.arange(200, 401, 10)

    plt.xticks(my_x_ticks)
    plt.show()


In [None]:
# df = pd.DataFrame(metrics, columns=['Random_Knn', 'Random_Lighgbm',"Random_GradientBoosting","Random_Catboost",
#                                    'Uncertainty_Knn', 'Uncertainty_Lighgbm',"Uncertainty_GradientBoosting","Uncertainty_Catboost",
#                                    'Entropy_Knn', 'Entropy_Lighgbm',"Entropy_GradientBoosting","Entropy_Catboost",
#                                     'Margin_Knn', 'Margin_Lighgbm',"Margin_GradientBoosting","Margin_Catboost",
#                                    ])

In [None]:
df

In [None]:
compare_metric('Random_Knn', 'Random_Lighgbm',"Random_GradientBoosting","Random_Catboost")


compare_metric('Uncertainty_Knn', 'Uncertainty_Lighgbm',"Uncertainty_GradientBoosting","Uncertainty_Catboost")

In [None]:
compare_metric('Entropy_Knn', 'Entropy_Lighgbm',"Entropy_GradientBoosting","Entropy_Catboost")

In [None]:
compare_metric('Margin_Knn', 'Margin_Lighgbm',"Margin_GradientBoosting","Margin_Catboost")

In [None]:
#save
def con_format(data):
    metrics_arr=np.transpose(metrics)# array T
    df = pd.DataFrame(metrics_arr, columns=['Random_Knn', 'Random_Lighgbm',"Random_GradientBoosting","Random_Catboost",
                                   'Uncertainty_Knn', 'Uncertainty_Lighgbm',"Uncertainty_GradientBoosting","Uncertainty_Catboost",
                                   'Entropy_Knn', 'Entropy_Lighgbm',"Entropy_GradientBoosting","Entropy_Catboost",
                                    'Margin_Knn', 'Margin_Lighgbm',"Margin_GradientBoosting","Margin_Catboost",
                                   ])
    return df
    

In [None]:
metrics=[metric01,metric02,metric03,metric04,
        metric11,metric12,metric13,metric14,
        metric21,metric22,metric23,metric24,
        metric31,metric32,metric33,metric34,]



In [None]:
df=con_format(metrics)
df

In [None]:
df.to_csv("./result/random_f1_3901890.csv")