In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

#unsupervised
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM
from tensorflow import keras

#supervised
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier


#al
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling,margin_sampling,entropy_sampling
from modAL.disagreement import KL_max_disagreement
from modAL.batch import uncertainty_batch_sampling

#metric
from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import matthews_corrcoef,cohen_kappa_score,balanced_accuracy_score
from sklearn.metrics import make_scorer

#warnings
import warnings

warnings.filterwarnings('ignore')

2023-03-10 17:51:27.368432: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_set=pd.read_csv('./randomtrain/3902110.csv')
test_set=pd.read_csv('./randomtest/3902110.csv')#0.16588
print(len(train_set),len(test_set))

x,y=train_set.shape
#test set
X_test=test_set.iloc[:,0:y-1]
y_tru=test_set.iloc[:,y-1]
print(len(X_test),len(y_tru))

44307 18990
18990 18990


In [3]:
train_set1=train_set.iloc[:,:6]
test_set1=test_set.iloc[:,:6]

In [4]:
#this function use random grid search to find best parameters
def getPar(model,dist,data,niter):
    x,y=data.shape
    clf = model
    param_dist = dist
    scoring = make_scorer(f1_score)
    grid_search = GridSearchCV(clf, param_dist, cv=5, scoring=scoring)
    grid_search.fit(data)
    print("Best parameters:", grid_search.best_params_)
    print("Best score:", grid_search.best_score_)
    
    return grid_search.best_params_

# iforest

In [5]:
iforest = IsolationForest(n_estimators=100, contamination='auto')
params = {'n_estimators': [10, 50, 100],
          'max_samples': [0.1, 0.5, 1.0],
          'contamination': [0.01, 0.05, 0.1]}

para=getPar(iforest,params,train_set1,10)


iforest = IsolationForest(n_estimators=para['n_estimators'],max_samples=para['max_samples'],
                                  contamination=para['contamination'])

Best parameters: {'contamination': 0.01, 'max_samples': 0.1, 'n_estimators': 10}
Best score: nan


In [6]:
# compute score
iforest.fit(train_set1)
if_scores = -iforest.score_samples(train_set1)

In [7]:
# combine
if_result = pd.concat([train_set,pd.Series(if_scores, name='score')], axis=1)

# ascending by score
if_data=if_result.sort_values(by='score', ascending=False)
if_data.to_csv("if_score_low_top_3902110.csv")

In [8]:
train_data_if=if_data.iloc[:,:7]
train_data_if

Unnamed: 0,datetime,latitude,longitude,pressure,salinity,temperature,label
32740,-1.689269,-1.352247,-3.518201,1.533916,5.784250,0.168473,0
13580,-1.689269,-1.352247,-3.518201,1.561147,5.790447,0.168473,0
4134,-1.689269,-1.352247,-3.518201,1.588378,5.829007,0.168717,0
23019,-1.689269,-1.352247,-3.518201,1.659861,5.829696,0.168717,0
41094,-1.689269,-1.352247,-3.518201,1.462434,5.738805,0.167006,0
...,...,...,...,...,...,...,...
17736,0.518275,0.010505,-0.328021,-0.178253,-0.436243,-0.723579,0
12747,0.518275,0.010505,-0.328021,-0.242927,-0.436243,-0.723579,0
38696,0.518275,0.010505,-0.328021,-0.208888,-0.436243,-0.723579,0
20650,0.518161,0.010505,-0.328021,-0.164637,-0.435554,-0.719912,0


In [None]:
train_data_if

# autoencoder

In [9]:
# normalization
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train_set1)

#  Autoencoder model
input_dim = scaled_data.shape[1]
encoding_dim = 1
hidden_dim = 2
output_dim = input_dim
input_layer = keras.layers.Input(shape=(input_dim,))
encoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(input_layer)
encoder_layer2 = keras.layers.Dense(encoding_dim, activation='relu')(encoder_layer1)
decoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(encoder_layer2)
decoder_layer2 = keras.layers.Dense(output_dim, activation=None)(decoder_layer1)
autoencoder = keras.models.Model(inputs=input_layer, outputs=decoder_layer2)

2023-03-10 17:54:46.654161: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# compile and fit
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(train_set1, train_set1, epochs=50, batch_size=16, verbose=0)

<keras.callbacks.History at 0x7fc3336f1ca0>

In [11]:
# compute
reconstructed_data = autoencoder.predict(train_set1)
mse = np.mean(np.power(train_set1 - reconstructed_data, 2), axis=1)
anomaly_scores = pd.Series(mse, name='anomaly_score')



In [12]:
# combine
auto_result = pd.concat([train_set, anomaly_scores], axis=1)

auto_data = auto_result.sort_values(by='anomaly_score', ascending=False)
auto_data.to_csv("auto_score_low_top_3902110.csv")

In [13]:
train_data_auto=auto_data.iloc[:,:7]
train_data_auto

Unnamed: 0,datetime,latitude,longitude,pressure,salinity,temperature,label
21578,1.896719,2.130023,0.666384,4.791462,2.014359,-0.215337,0
11548,1.896719,2.130023,0.666384,4.764230,2.015047,-0.215092,0
30720,0.223804,1.394365,0.085388,4.859540,2.866113,-0.094327,0
9666,1.896719,2.130023,0.666384,4.658709,2.015047,-0.215337,0
6807,0.223804,1.394365,0.085388,4.798270,2.852342,-0.095060,0
...,...,...,...,...,...,...,...
20934,1.718168,1.224987,1.397360,-0.484605,-0.681372,-0.162777,0
11038,1.718289,1.224987,1.397360,-0.453970,-0.679306,-0.162777,0
14453,1.718168,1.224987,1.397360,-0.443758,-0.679995,-0.162777,0
2624,1.482203,1.208701,1.161052,-0.147618,-0.532642,-0.085282,0


# OneClassSVM

In [14]:
ocsvm = OneClassSVM(kernel='rbf', nu=0.1)
ocsvm.fit(train_set1)

In [15]:
scores3 = -ocsvm.decision_function(train_set1)

In [16]:
# combine
oc_result = pd.concat([train_set, pd.Series(scores3, name='score')], axis=1)

oc_data=oc_result.sort_values(by='score', ascending=False)
oc_data.to_csv("oc_score_low_top_3902110.csv")
train_data_oc=oc_data.iloc[:,:7]
train_data_oc


# auto_data = auto_result.sort_values(by='anomaly_score', ascending=False)
# train_data_auto=auto_data.iloc[:,:7]
# train_data_auto

Unnamed: 0,datetime,latitude,longitude,pressure,salinity,temperature,label
23019,-1.689269,-1.352247,-3.518201,1.659861,5.829696,0.168717,0
10610,-1.682008,-1.281885,-3.651659,1.595186,5.823499,0.168717,0
37609,-1.682008,-1.281885,-3.651659,1.557743,5.822121,0.168717,0
4134,-1.689269,-1.352247,-3.518201,1.588378,5.829007,0.168717,0
13580,-1.689269,-1.352247,-3.518201,1.561147,5.790447,0.168473,0
...,...,...,...,...,...,...,...
21441,0.422659,0.048005,0.055053,-0.341641,-0.520936,-0.516029,0
38944,0.422514,0.048005,0.055053,-0.307602,-0.520936,-0.512606,0
44177,0.422514,0.048005,0.055053,-0.379084,-0.520936,-0.512606,0
12553,0.422514,0.048005,0.055053,-0.338237,-0.520936,-0.512606,0


# active learning

In [31]:
#n_initial= 100
n_initial = 1000
N_QUERIES = 1000

In [22]:
#Pool
def initial_data(n_initial,X_Pool,y_Pool):
    #inital
    #initial Labeled data
    #X_initial, y_initial = X_train[], y_train[initial_idx]
    X_L = X_Pool[:n_initial]
    y_L = y_Pool[:n_initial]
    # Unlabeled data
    X_U = X_Pool[n_initial:]
    y_U = y_Pool[n_initial:]
    return X_L,y_L,X_U,y_U

In [18]:
def al_learn(clf,sampling,X_initial,y_initial,X_re,y_re):
    X_L = X_initial.copy()
    y_L = y_initial.copy()
    X_U,y_U =X_re.copy(),y_re.copy()
    #print(len(X_U),len(y_U))
    learner = al(clf,sampling,X_L,y_L)
    y_pre=learner.predict(X_test)
    unqueried_kappa=cohen_kappa_score(y_tru, y_pre)
    unqueried_f1=f1_score(y_tru,y_pre)
    #print("unqueried --------------------->",unqueried_score)
    kappa_history = [unqueried_kappa]
    f1_history = [unqueried_f1]
    
    # Query
    for index in range(N_QUERIES):
        query_index=0
        # Teach ActiveLearner model the record it has requested.
        X, y = X_U[query_index].reshape(1, -1), y_U[query_index].reshape(1, )
        #print(index+1,"query label --------------------->",y)
        learner.teach(X=X, y=y)
        
        X_U, y_U = np.delete(X_U, query_index, axis=0), np.delete(y_U, query_index)
        y_pre=learner.predict(X_test)
        kappa=cohen_kappa_score(y_tru, y_pre)
        f1=f1_score(y_tru,y_pre)
        print(index+1,"-------------------->",kappa)
        print(index+1,"-------------------->",f1)
        # Recall precision F1
        kappa_history.append(kappa)
        f1_history.append(f1)
    #print(len(X_U))
    #draw(performance_history)
    df_scores= pd.concat([pd.DataFrame(kappa_history,columns=['kappa']), 
                          pd.DataFrame(f1_history,columns=['f1'])],
                         axis=1)
    return df_scores 

In [19]:
# metric
def computeMetric(y_tru,y_pre):
    acc = accuracy_score(y_tru,y_pre)
    pre=precision_score(y_tru,y_pre)
    recall=recall_score(y_tru,y_pre)
    cm=confusion_matrix(y_tru,y_pre)
    f1 = f1_score(y_tru,y_pre)
    mcc=matthews_corrcoef(y_tru, y_pre)
    kappa=cohen_kappa_score(y_tru, y_pre)
    bac=balanced_accuracy_score(y_tru,y_pre)
    print("acc:",acc)
    print("balanced acc:",bac)
    print("precision:",pre)
    print("recall:",recall)
    print("cm:",cm)
    print("f1:",f1)
    print("MCC:", mcc)
    print("Kappa:",kappa)
    
    # confusion matrix
#     cmap1 = sns.diverging_palette(260,-10,s=50, l=75, n=5, as_cmap=True)
#     plt.subplots(figsize=(12,8))
#     cf_matrix = confusion_matrix(y_tru, y_pre)
#     sns.heatmap(cf_matrix/np.sum(cf_matrix), cmap = cmap1, annot = True, annot_kws = {'size':15})
    
    return kappa

In [20]:
#clf4 catboost
clf4 = CatBoostClassifier(loss_function='Logloss')
clf4.fit(train_set1.values, train_set.iloc[:,6].values)
y_pre=clf4.predict(X_test)
computeMetric(y_tru,y_pre)

Learning rate set to 0.051997
0:	learn: 0.5308256	total: 75.8ms	remaining: 1m 15s
1:	learn: 0.4043105	total: 82.1ms	remaining: 40.9s
2:	learn: 0.3146712	total: 88.8ms	remaining: 29.5s
3:	learn: 0.2449088	total: 95.5ms	remaining: 23.8s
4:	learn: 0.1934451	total: 102ms	remaining: 20.2s
5:	learn: 0.1517692	total: 107ms	remaining: 17.8s
6:	learn: 0.1207332	total: 114ms	remaining: 16.1s
7:	learn: 0.0965690	total: 119ms	remaining: 14.8s
8:	learn: 0.0781812	total: 126ms	remaining: 13.8s
9:	learn: 0.0645772	total: 132ms	remaining: 13s
10:	learn: 0.0538813	total: 138ms	remaining: 12.4s
11:	learn: 0.0455537	total: 144ms	remaining: 11.8s
12:	learn: 0.0391380	total: 150ms	remaining: 11.4s
13:	learn: 0.0336367	total: 156ms	remaining: 11s
14:	learn: 0.0293973	total: 163ms	remaining: 10.7s
15:	learn: 0.0258458	total: 169ms	remaining: 10.4s
16:	learn: 0.0233412	total: 175ms	remaining: 10.1s
17:	learn: 0.0211118	total: 181ms	remaining: 9.86s
18:	learn: 0.0194391	total: 188ms	remaining: 9.68s
19:	learn:

171:	learn: 0.0046279	total: 1.18s	remaining: 5.68s
172:	learn: 0.0046142	total: 1.19s	remaining: 5.68s
173:	learn: 0.0046004	total: 1.19s	remaining: 5.67s
174:	learn: 0.0045903	total: 1.2s	remaining: 5.66s
175:	learn: 0.0045785	total: 1.21s	remaining: 5.65s
176:	learn: 0.0045723	total: 1.21s	remaining: 5.65s
177:	learn: 0.0045570	total: 1.22s	remaining: 5.63s
178:	learn: 0.0045249	total: 1.23s	remaining: 5.63s
179:	learn: 0.0044924	total: 1.23s	remaining: 5.62s
180:	learn: 0.0044752	total: 1.24s	remaining: 5.61s
181:	learn: 0.0044461	total: 1.25s	remaining: 5.61s
182:	learn: 0.0044351	total: 1.25s	remaining: 5.6s
183:	learn: 0.0044251	total: 1.26s	remaining: 5.6s
184:	learn: 0.0044052	total: 1.27s	remaining: 5.59s
185:	learn: 0.0043978	total: 1.27s	remaining: 5.58s
186:	learn: 0.0043880	total: 1.28s	remaining: 5.58s
187:	learn: 0.0043611	total: 1.29s	remaining: 5.57s
188:	learn: 0.0043423	total: 1.3s	remaining: 5.57s
189:	learn: 0.0043085	total: 1.3s	remaining: 5.56s
190:	learn: 0.004

352:	learn: 0.0027659	total: 2.37s	remaining: 4.35s
353:	learn: 0.0027557	total: 2.38s	remaining: 4.34s
354:	learn: 0.0027452	total: 2.38s	remaining: 4.33s
355:	learn: 0.0027404	total: 2.39s	remaining: 4.33s
356:	learn: 0.0027317	total: 2.4s	remaining: 4.32s
357:	learn: 0.0027273	total: 2.41s	remaining: 4.32s
358:	learn: 0.0027195	total: 2.41s	remaining: 4.31s
359:	learn: 0.0027097	total: 2.42s	remaining: 4.3s
360:	learn: 0.0027054	total: 2.42s	remaining: 4.29s
361:	learn: 0.0027036	total: 2.43s	remaining: 4.29s
362:	learn: 0.0026972	total: 2.44s	remaining: 4.28s
363:	learn: 0.0026895	total: 2.45s	remaining: 4.27s
364:	learn: 0.0026819	total: 2.45s	remaining: 4.27s
365:	learn: 0.0026764	total: 2.46s	remaining: 4.26s
366:	learn: 0.0026667	total: 2.47s	remaining: 4.25s
367:	learn: 0.0026601	total: 2.47s	remaining: 4.25s
368:	learn: 0.0026562	total: 2.48s	remaining: 4.24s
369:	learn: 0.0026525	total: 2.49s	remaining: 4.24s
370:	learn: 0.0026452	total: 2.49s	remaining: 4.23s
371:	learn: 0.

532:	learn: 0.0018799	total: 3.57s	remaining: 3.13s
533:	learn: 0.0018762	total: 3.58s	remaining: 3.12s
534:	learn: 0.0018713	total: 3.58s	remaining: 3.12s
535:	learn: 0.0018697	total: 3.59s	remaining: 3.11s
536:	learn: 0.0018650	total: 3.6s	remaining: 3.1s
537:	learn: 0.0018602	total: 3.6s	remaining: 3.1s
538:	learn: 0.0018558	total: 3.61s	remaining: 3.09s
539:	learn: 0.0018460	total: 3.62s	remaining: 3.08s
540:	learn: 0.0018394	total: 3.62s	remaining: 3.07s
541:	learn: 0.0018293	total: 3.63s	remaining: 3.07s
542:	learn: 0.0018257	total: 3.63s	remaining: 3.06s
543:	learn: 0.0018237	total: 3.64s	remaining: 3.05s
544:	learn: 0.0018197	total: 3.65s	remaining: 3.04s
545:	learn: 0.0018089	total: 3.65s	remaining: 3.04s
546:	learn: 0.0018068	total: 3.66s	remaining: 3.03s
547:	learn: 0.0018019	total: 3.67s	remaining: 3.02s
548:	learn: 0.0017946	total: 3.67s	remaining: 3.02s
549:	learn: 0.0017901	total: 3.68s	remaining: 3.01s
550:	learn: 0.0017873	total: 3.68s	remaining: 3s
551:	learn: 0.00178

716:	learn: 0.0012833	total: 4.77s	remaining: 1.88s
717:	learn: 0.0012829	total: 4.77s	remaining: 1.87s
718:	learn: 0.0012816	total: 4.78s	remaining: 1.87s
719:	learn: 0.0012805	total: 4.79s	remaining: 1.86s
720:	learn: 0.0012799	total: 4.79s	remaining: 1.85s
721:	learn: 0.0012777	total: 4.8s	remaining: 1.85s
722:	learn: 0.0012717	total: 4.8s	remaining: 1.84s
723:	learn: 0.0012692	total: 4.81s	remaining: 1.83s
724:	learn: 0.0012686	total: 4.82s	remaining: 1.83s
725:	learn: 0.0012676	total: 4.82s	remaining: 1.82s
726:	learn: 0.0012646	total: 4.83s	remaining: 1.81s
727:	learn: 0.0012618	total: 4.83s	remaining: 1.81s
728:	learn: 0.0012585	total: 4.84s	remaining: 1.8s
729:	learn: 0.0012560	total: 4.84s	remaining: 1.79s
730:	learn: 0.0012555	total: 4.85s	remaining: 1.78s
731:	learn: 0.0012535	total: 4.86s	remaining: 1.78s
732:	learn: 0.0012529	total: 4.86s	remaining: 1.77s
733:	learn: 0.0012510	total: 4.87s	remaining: 1.76s
734:	learn: 0.0012487	total: 4.87s	remaining: 1.76s
735:	learn: 0.0

884:	learn: 0.0009739	total: 5.77s	remaining: 750ms
885:	learn: 0.0009731	total: 5.77s	remaining: 743ms
886:	learn: 0.0009721	total: 5.78s	remaining: 736ms
887:	learn: 0.0009714	total: 5.79s	remaining: 730ms
888:	learn: 0.0009682	total: 5.79s	remaining: 723ms
889:	learn: 0.0009668	total: 5.8s	remaining: 717ms
890:	learn: 0.0009661	total: 5.81s	remaining: 710ms
891:	learn: 0.0009642	total: 5.81s	remaining: 704ms
892:	learn: 0.0009637	total: 5.82s	remaining: 697ms
893:	learn: 0.0009627	total: 5.82s	remaining: 691ms
894:	learn: 0.0009611	total: 5.83s	remaining: 684ms
895:	learn: 0.0009604	total: 5.83s	remaining: 677ms
896:	learn: 0.0009596	total: 5.84s	remaining: 671ms
897:	learn: 0.0009575	total: 5.85s	remaining: 664ms
898:	learn: 0.0009563	total: 5.85s	remaining: 658ms
899:	learn: 0.0009558	total: 5.86s	remaining: 651ms
900:	learn: 0.0009533	total: 5.87s	remaining: 644ms
901:	learn: 0.0009520	total: 5.87s	remaining: 638ms
902:	learn: 0.0009518	total: 5.88s	remaining: 631ms
903:	learn: 0

0.2851406630361132

In [21]:
def al(clf,strategy,X_L,y_L):
    learner = ActiveLearner(estimator=clf,
                            query_strategy=strategy,
                            X_training=X_L, y_training=y_L)
    return learner

# auto

In [32]:
x,y=train_set.shape


#pool 
X_Pool = train_data_auto.iloc[:,0:y-1].values
y_Pool = train_data_auto.iloc[:,y-1].values


X_in_au,y_in_au,X_re_au,y_re_au=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_au),len(X_re_au))

1000 43307


In [33]:
sampling=uncertainty_sampling

In [34]:
metric1 = al_learn(clf4,sampling,X_in_au,y_in_au,X_re_au,y_re_au)

CatBoostError: catboost/private/libs/target/target_converter.cpp:375: Target contains only one unique value

In [None]:
metric1.rename(columns = {'kappa' : 'Auto_Kappa', 'f1' : 'Auto_F1'}, inplace = True)

In [None]:
metric1

In [None]:
fig, ax = plt.subplots(figsize=(8.5, 10))
x = np.linspace(1000, 2001, 1001)
plt.plot(x,metric1)
plt.legend(loc = "best")
plt.title('Kappa of AL over Time')#Kappa F1
plt.xlabel('Number of Queried Instance')
plt.ylabel('Kappa')
my_x_ticks = np.arange(1000, 2001, 50)
plt.xticks(my_x_ticks)
plt.show()

# iforest

In [None]:
#pool 
X_Pool = train_data_if.iloc[:,0:y-1].values
y_Pool = train_data_if.iloc[:,y-1].values

n_initial=300
X_in_if,y_in_if,X_re_if,y_re_if=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_if),len(X_re_if))

In [None]:
metric2 = al_learn(clf4,sampling,X_in_if,y_in_if,X_re_if,y_re_if)

In [None]:
metric2.rename(columns = {'kappa' : 'IF_Kappa', 'f1' : 'IF_F1'}, inplace = True)

In [None]:
metric2

# oneclasssvm

In [None]:
#pool 
X_Pool = train_data_oc.iloc[:,0:y-1].values
y_Pool = train_data_oc.iloc[:,y-1].values


X_in_oc,y_in_oc,X_re_oc,y_re_oc=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_oc),len(X_re_oc))

In [None]:
metric3 = al_learn(clf4,sampling,X_in_oc,y_in_oc,X_re_oc,y_re_oc)

In [None]:
metric3.rename(columns = {'kappa' : 'OCSVM_Kappa', 'f1' : 'OCSVM_F1'}, inplace = True)

# all

In [None]:
metrics=pd.concat([metric1,metric2,metric3],axis=1)
metrics

In [None]:
metrics.to_csv("random_low_un_50_top.csv")