In [None]:
#!pip3 install tensorflow

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

#unsupervised
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM
from tensorflow import keras

#supervised
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier


#al
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling,margin_sampling,entropy_sampling
from modAL.disagreement import KL_max_disagreement
from modAL.batch import uncertainty_batch_sampling

#metric
from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import matthews_corrcoef,cohen_kappa_score,balanced_accuracy_score
from sklearn.metrics import make_scorer

#warnings
import warnings

warnings.filterwarnings('ignore')

In [None]:
train_set=pd.read_csv('./randomtrain/3901890.csv')
test_set=pd.read_csv('./randomtest/3901890.csv')
len(train_set),len(test_set)

#test set
X_test=test_set.iloc[:,0:y-1]
y_tru=test_set.iloc[:,y-1]
print(len(X_test),len(y_tru))

In [None]:
train_set1=train_set.iloc[:,:6]
test_set1=test_set.iloc[:,:6]

In [None]:
#initial_data
X_initial=pd.read_csv('./randomtrain/3901890_X_initial.csv').values
y_initial=pd.read_csv('./randomtrain/3901890_y_initial.csv').values.ravel()
X_re=pd.read_csv('./randomtrain/3901890_X_re.csv').values
y_re=pd.read_csv('./randomtrain/3901890_y_re.csv').values.ravel()
print(X_re.shape,X_initial.shape)
print(y_re.shape,y_initial.shape)

In [None]:
#this function use random grid search to find best parameters
def getPar(model,dist,data,niter):
    x,y=data.shape

    clf = model

    param_dist = dist
    scoring = make_scorer(f1_score)
    grid_search = GridSearchCV(clf, param_dist, cv=5, scoring=scoring)
    grid_search.fit(data)
    print("Best parameters:", grid_search.best_params_)
    print("Best score:", grid_search.best_score_)
    
    return grid_search.best_params_

# iforest

In [None]:
iforest = IsolationForest(n_estimators=100, contamination='auto')
params = {'n_estimators': [10, 50, 100],
          'max_samples': [0.1, 0.5, 1.0],
          'contamination': [0.01, 0.05, 0.1]}

para=getPar(iforest,params,train_set1,10)


iforest = IsolationForest(n_estimators=para['n_estimators'],max_samples=para['max_samples'],
                                  contamination=para['contamination'])

In [None]:
# compute score
iforest.fit(X_re)
if_scores = -iforest.score_samples(X_re)

In [127]:
# combine
if_result = pd.concat([pd.DataFrame(X_re, columns=['datetime', 'latitude', 'longitude', 'pressure', 'salinity',
       'temperature']), pd.Series(y_re, name='label'),pd.Series(if_scores, name='score')], axis=1)

# ascending by score
if_data=if_result.sort_values(by='score', ascending=False)
if_data.to_csv("if_score_high.csv")

In [None]:
train_data_if=if_data.iloc[:,:7]
train_data_if

# autoencoder

In [None]:
# normalization
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train_set1)

#  Autoencoder model
input_dim = scaled_data.shape[1]
encoding_dim = 1
hidden_dim = 2
output_dim = input_dim
input_layer = keras.layers.Input(shape=(input_dim,))
encoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(input_layer)
encoder_layer2 = keras.layers.Dense(encoding_dim, activation='relu')(encoder_layer1)
decoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(encoder_layer2)
decoder_layer2 = keras.layers.Dense(output_dim, activation=None)(decoder_layer1)
autoencoder = keras.models.Model(inputs=input_layer, outputs=decoder_layer2)

In [None]:
# compile and fit
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(scaled_data, scaled_data, epochs=50, batch_size=16, verbose=0)

In [None]:
# compute
reconstructed_data = autoencoder.predict(X_re)
mse = np.mean(np.power(X_re - reconstructed_data, 2), axis=1)
anomaly_scores = pd.Series(mse, name='anomaly_score')

In [125]:
# combine
auto_result = pd.concat([pd.DataFrame(X_re, columns=['datetime', 'latitude', 'longitude', 'pressure', 'salinity',
       'temperature']), pd.Series(y_re, name='label'),anomaly_scores,], axis=1)

auto_data = auto_result.sort_values(by='anomaly_score', ascending=False)
auto_data.to_csv("auto_score_high.csv")
train_data_auto=auto_data.iloc[:,:7]
train_data_auto

Unnamed: 0,datetime,latitude,longitude,pressure,salinity,temperature,label
107158,-0.012921,0.007223,0.388787,1.298061,0.586663,-0.156916,0
71534,-1.429395,1.265347,1.001266,0.519826,0.967276,-0.880471,0
103608,0.842185,-0.272558,-0.691385,0.497861,-1.195546,-0.250380,1
72958,-0.592390,0.392775,1.079425,1.377737,0.970699,-0.505042,0
91376,-0.527999,0.440708,1.096360,0.860923,0.970815,-0.513968,0
...,...,...,...,...,...,...,...
80671,-1.501837,1.343735,0.910565,-0.703300,0.968320,-0.197872,0
61481,0.825411,-0.181853,-0.639377,-0.439725,-1.217362,-0.121211,1
16018,0.695303,0.039079,-0.778532,2.505683,-1.206918,-0.911451,1
63978,0.494105,-0.046720,-0.737961,-0.645158,-1.240628,0.294649,1


# oneclasssvm

In [None]:
ocsvm = OneClassSVM(kernel='rbf', nu=0.1)
ocsvm.fit(train_set1)

In [None]:
scores3 = -ocsvm.decision_function(X_re)

In [126]:
# combine
oc_result = pd.concat([pd.DataFrame(X_re, columns=['datetime', 'latitude', 'longitude', 'pressure', 'salinity',
       'temperature']),pd.Series(y_re, name='label'), pd.Series(scores3, name='score')], axis=1)

oc_data=oc_result.sort_values(by='score', ascending=False)
oc_data.to_csv("oc_score_high.csv")
train_data_oc=oc_data.iloc[:,:7]
train_data_oc

Unnamed: 0,datetime,latitude,longitude,pressure,salinity,temperature,label
107158,-0.012921,0.007223,0.388787,1.298061,0.586663,-0.156916,0
71534,-1.429395,1.265347,1.001266,0.519826,0.967276,-0.880471,0
103608,0.842185,-0.272558,-0.691385,0.497861,-1.195546,-0.250380,1
72958,-0.592390,0.392775,1.079425,1.377737,0.970699,-0.505042,0
91376,-0.527999,0.440708,1.096360,0.860923,0.970815,-0.513968,0
...,...,...,...,...,...,...,...
102515,0.850576,-0.500787,-0.675920,-0.313105,-1.197809,-0.077630,1
79360,-0.519967,0.385320,1.148352,-0.640852,0.981955,0.337181,0
59187,0.333137,-0.639724,-0.286320,-0.858775,-1.226935,1.580561,1
29743,0.139989,-0.788897,0.267663,-0.532751,0.593509,0.321953,0


# active learning

In [None]:
n_initial= 100
N_QUERIES = 1000
#N_QUERIES = 200

In [None]:
def al(clf,strategy,X_L,y_L):
    learner = ActiveLearner(estimator=clf,
                            query_strategy=strategy,
                            X_training=X_L, y_training=y_L)
    return learner

In [None]:
def al_learn(clf,sampling,X_initial,y_initial,X_re,y_re):
    X_L = X_initial.copy()
    y_L = y_initial.copy()
    X_U,y_U =X_re.copy(),y_re.copy()
    #print(len(X_U),len(y_U))
    learner = al(clf,sampling,X_L,y_L)
    y_pre=learner.predict(X_test)
    unqueried_kappa=cohen_kappa_score(y_tru, y_pre)
    unqueried_f1=f1_score(y_tru,y_pre)
    #print("unqueried --------------------->",unqueried_score)
    kappa_history = [unqueried_kappa]
    f1_history = [unqueried_f1]
    
    # Query
    for index in range(N_QUERIES):
        query_index=0
        # Teach ActiveLearner model the record it has requested.
        X, y = X_U[query_index].reshape(1, -1), y_U[query_index].reshape(1, )
        #print(index+1,"query label --------------------->",y)
        learner.teach(X=X, y=y)
        
        X_U, y_U = np.delete(X_U, query_index, axis=0), np.delete(y_U, query_index)
        y_pre=learner.predict(X_test)
        kappa=cohen_kappa_score(y_tru, y_pre)
        f1=f1_score(y_tru,y_pre)
        #print(index+1,"-------------------->",kappa)
        #print(index+1,"-------------------->",f1)
        # Recall precision F1
        kappa_history.append(kappa)
        f1_history.append(f1)
    #print(len(X_U))
    #draw(performance_history)
    df_scores= pd.concat([pd.DataFrame(kappa_history,columns=['kappa']), 
                          pd.DataFrame(f1_history,columns=['f1'])],
                         axis=1)
    return df_scores 

In [None]:
# metric
def computeMetric(y_tru,y_pre):
    acc = accuracy_score(y_tru,y_pre)
    pre=precision_score(y_tru,y_pre)
    recall=recall_score(y_tru,y_pre)
    cm=confusion_matrix(y_tru,y_pre)
    f1 = f1_score(y_tru,y_pre)
    mcc=matthews_corrcoef(y_tru, y_pre)
    kappa=cohen_kappa_score(y_tru, y_pre)
    bac=balanced_accuracy_score(y_tru,y_pre)
    print("acc:",acc)
    print("balanced acc:",bac)
    print("precision:",pre)
    print("recall:",recall)
    print("cm:",cm)
    print("f1:",f1)
    print("MCC:", mcc)
    print("Kappa:",kappa)
    
    # confusion matrix
#     cmap1 = sns.diverging_palette(260,-10,s=50, l=75, n=5, as_cmap=True)
#     plt.subplots(figsize=(12,8))
#     cf_matrix = confusion_matrix(y_tru, y_pre)
#     sns.heatmap(cf_matrix/np.sum(cf_matrix), cmap = cmap1, annot = True, annot_kws = {'size':15})
    
    return kappa

In [None]:
#clf4 catboost
clf4 = CatBoostClassifier(loss_function='Logloss')
# dic4 = {'learning_rate': [0.03, 0.1],
#         'depth': [4, 6, 10],
#         'l2_leaf_reg': [1, 3, 5, 7, 9]}
# #para3=getPar(clf4,dic4,test_set,10)

# grid_search_result = clf4.grid_search(dic4, 
#                                        X=X_Pool, 
#                                        y=y_Pool)

clf4.fit(X_Pool, y_Pool)
y_pre=clf4.predict(X_test)
computeMetric(y_tru,y_pre)

# auto

In [None]:
x,y=train_set.shape
#pool 
X_re = train_data_auto.iloc[:,0:y-1].values
y_re = train_data_auto.iloc[:,y-1].values
print(len(X_re),len(X_initial))

In [None]:
sampling=uncertainty_sampling

In [None]:
metric1 = al_learn(clf4,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric1.rename(columns = {'kappa' : 'Auto_Kappa', 'f1' : 'Auto_F1'}, inplace = True)

In [None]:
fig, ax = plt.subplots(figsize=(8.5, 2), dpi=130)
x = np.linspace(100, 1101, 1001)
l1=plt.plot(x, metric1)
plt.legend(metric1.columns)
plt.title('Kappa over Queried Instance Amount')
plt.xlabel('Number of Queried Instance')
plt.ylabel('Kappa')
plt.show()

# iforest

In [None]:
x,y=train_set.shape
#pool 
X_re = train_data_if.iloc[:,0:y-1].values
y_re = train_data_if.iloc[:,y-1].values
print(len(X_re),len(X_initial))

In [None]:
metric2 = al_learn(clf4,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric2.rename(columns = {'kappa' : 'IF_Kappa', 'f1' : 'IF_F1'}, inplace = True)

In [None]:
metric2

# oneclasssvm

In [None]:
#pool 
X_re = train_data_oc.iloc[:,0:y-1].values
y_re = train_data_oc.iloc[:,y-1].values

In [None]:
metric3 = al_learn(clf4,sampling,X_initial,y_initial,X_re,y_re)

In [None]:
metric3.rename(columns = {'kappa' : 'OCSVM_Kappa', 'f1' : 'OCSVM_F1'}, inplace = True)

In [None]:
metric3

# all

In [None]:
metrics=pd.concat([metric1,metric2,metric3],axis=1)
metrics

In [None]:
metrics.to_csv("./result/random_high_un_1000.csv")