In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

#unsupervised
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM
from tensorflow import keras

#supervised
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier


#al
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling,margin_sampling,entropy_sampling
from modAL.disagreement import KL_max_disagreement
from modAL.batch import uncertainty_batch_sampling

#metric
from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import matthews_corrcoef,cohen_kappa_score,balanced_accuracy_score
from sklearn.metrics import make_scorer

#warnings
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Load the train / test CSVs for one dataset id and report their row counts.
train_set=pd.read_csv('./randomtrain/6903102.csv')
test_set=pd.read_csv('./randomtest/6903102.csv')
print(len(train_set),len(test_set))

# x = number of rows, y = number of columns of the TRAIN frame.
x,y=train_set.shape
# Test split: every column except the last is a feature, the last column is the label.
# NOTE(review): the column count comes from train_set, so this assumes test_set
# has the same column layout -- confirm.
X_test=test_set.iloc[:,0:y-1]
y_tru=test_set.iloc[:,y-1]
print(len(X_test),len(y_tru))

In [None]:
# Keep only the first six columns of each frame.
# NOTE(review): presumably these are the feature columns (label in column 6) -- confirm.
# test_set1 is not referenced again in the visible notebook.
train_set1=train_set.iloc[:,:6]
test_set1=test_set.iloc[:,:6]

In [None]:
# Exhaustive grid search (GridSearchCV, 5-fold, F1 scoring) for the best parameters.
def getPar(model,dist,data,niter,labels=None):
    """Grid-search ``dist`` for ``model`` and return the best parameter set.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator to tune.
    dist : dict
        Parameter grid handed to ``GridSearchCV`` (every combination is tried).
    data : array-like or DataFrame
        Samples used for the 5-fold search.
    niter : int
        Unused. The search is exhaustive, not randomized; the argument is kept
        only so existing positional calls keep working.
    labels : array-like, optional
        Ground-truth labels forwarded to ``fit``. The F1 scorer compares
        ``model.predict`` against these, so they must use the same encoding as
        the estimator's predictions. When omitted the scorer has nothing to
        compare against and the search cannot rank candidates.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    grid_search = GridSearchCV(model, dist, cv=5, scoring=make_scorer(f1_score))
    # fit(data, None) is the same call the unlabeled version made.
    grid_search.fit(data, labels)

    best_score = grid_search.best_score_
    print("Best parameters:", grid_search.best_params_)
    print("Best score:", best_score)
    if np.isnan(best_score):
        # Global warning filters may hide scikit-learn's own scoring warnings,
        # so make the failure visible here instead of silently returning an
        # arbitrary parameter set.
        print("WARNING: every candidate scored NaN; the returned parameters "
              "were not selected by F1. Pass `labels` to score the search.")

    return grid_search.best_params_

# iforest

In [None]:
# Candidate values explored for the Isolation Forest.
params = dict(
    n_estimators=[10, 50, 100],
    max_samples=[0.1, 0.5, 1.0],
    contamination=[0.01, 0.05, 0.1],
)

# Base estimator handed to the search; its own settings are overridden by the grid.
iforest = IsolationForest(n_estimators=100, contamination='auto')
para = getPar(iforest, params, train_set1, 10)

# Rebuild the forest with the parameter set returned by the search.
iforest = IsolationForest(
    n_estimators=para['n_estimators'],
    max_samples=para['max_samples'],
    contamination=para['contamination'],
)

In [None]:
# Fit the tuned forest on the training features and score every training row.
# The sign of score_samples is flipped; a later cell sorts on this value in
# descending order.
# NOTE(review): presumably the flip makes larger = more anomalous -- confirm
# against the IsolationForest.score_samples documentation.
iforest.fit(train_set1)
if_scores = -iforest.score_samples(train_set1)

In [None]:
# Append the scores as a 'score' column next to the full training frame
# (column-wise concat, aligned on the row index).
if_result = pd.concat([train_set,pd.Series(if_scores, name='score')], axis=1)

# Sort DESCENDING by score (largest score first) and persist the ranked frame.
if_data=if_result.sort_values(by='score', ascending=False)
if_data.to_csv("if_score_low_top.csv")

In [None]:
# Keep the first seven columns of the ranked frame, in ranked row order.
# NOTE(review): presumably train_set has exactly 7 columns, so this drops the
# appended 'score' column -- confirm.
train_data_if=if_data.iloc[:,:7]
train_data_if

# autoencoder

In [None]:
# normalization: MinMax-scale the training features
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train_set1)

#  Autoencoder model: input -> 2 -> 1 -> 2 -> input (dense layers)
input_dim = scaled_data.shape[1]  # one input unit per feature column
encoding_dim = 1                  # width of the bottleneck layer
hidden_dim = 2                    # width of the layers around the bottleneck
output_dim = input_dim            # reconstruct the same number of features
input_layer = keras.layers.Input(shape=(input_dim,))
encoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(input_layer)
encoder_layer2 = keras.layers.Dense(encoding_dim, activation='relu')(encoder_layer1)
decoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(encoder_layer2)
# Linear output layer (no activation) for the reconstruction.
decoder_layer2 = keras.layers.Dense(output_dim, activation=None)(decoder_layer1)
autoencoder = keras.models.Model(inputs=input_layer, outputs=decoder_layer2)

In [None]:
# compile and fit
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(train_set1, train_set1, epochs=50, batch_size=16, verbose=0)

In [None]:
# compute
reconstructed_data = autoencoder.predict(train_set1)
mse = np.mean(np.power(train_set1 - reconstructed_data, 2), axis=1)
anomaly_scores = pd.Series(mse, name='anomaly_score')

In [None]:
# Append the reconstruction error as an 'anomaly_score' column next to the
# full training frame (column-wise concat, aligned on the row index).
auto_result = pd.concat([train_set, anomaly_scores], axis=1)

# Rank rows by descending anomaly score and persist the ranked frame.
auto_data = auto_result.sort_values(by='anomaly_score', ascending=False)
auto_data.to_csv("auto_score_low_top.csv")

In [None]:
# Keep the first seven columns of the ranked frame, in ranked row order.
# NOTE(review): presumably this drops the appended 'anomaly_score' column -- confirm
# that train_set has exactly 7 columns.
train_data_auto=auto_data.iloc[:,:7]
train_data_auto

# OneClassSVM

In [None]:
# One-class SVM with an RBF kernel and nu=0.1, fitted on the (unscaled)
# training features.
ocsvm = OneClassSVM(kernel='rbf', nu=0.1)
ocsvm.fit(train_set1)

In [None]:
# Negated decision-function value per training row; the next cell sorts on it
# in descending order.
scores3 = -ocsvm.decision_function(train_set1)

In [None]:
# Append the OCSVM scores as a 'score' column next to the full training frame.
oc_result = pd.concat([train_set, pd.Series(scores3, name='score')], axis=1)

# Rank rows by descending score, persist, then keep the first seven columns.
# NOTE(review): unlike the other score files this one is written under
# ./other/ -- the directory presumably has to exist already; confirm.
oc_data=oc_result.sort_values(by='score', ascending=False)
oc_data.to_csv("./other/oc_score_low_top.csv")
train_data_oc=oc_data.iloc[:,:7]
train_data_oc


# auto_data = auto_result.sort_values(by='anomaly_score', ascending=False)
# train_data_auto=auto_data.iloc[:,:7]
# train_data_auto

# active learning

In [None]:
# Active-learning configuration.
#n_initial= 100
# Size of the initially labelled seed set taken from the top of each ranked pool.
# NOTE: a later cell (the iforest pool) reassigns n_initial to 300.
n_initial = 50
# Number of pool records taught to the learner, one per iteration, in al_learn.
N_QUERIES = 1000

In [None]:
#Pool
def initial_data(n_initial,X_Pool,y_Pool):
    """Split an ordered pool into a labelled seed set and the unlabelled rest.

    The first ``n_initial`` entries of ``X_Pool`` / ``y_Pool`` form the labelled
    set; everything after them forms the unlabelled pool. Order is preserved.

    Returns
    -------
    tuple
        ``(X_L, y_L, X_U, y_U)``.
    """
    seed = slice(None, n_initial)
    rest = slice(n_initial, None)
    return X_Pool[seed], y_Pool[seed], X_Pool[rest], y_Pool[rest]

In [None]:
def al_learn(clf,sampling,X_initial,y_initial,X_re,y_re):
    """Teach an ActiveLearner the pool records one by one and track test scores.

    The learner is seeded with ``X_initial`` / ``y_initial``. Records from
    ``X_re`` / ``y_re`` are then taught in their given order, one per
    iteration. ``learner.query`` is never called, so ``sampling`` is only
    stored on the learner and the pool ordering alone decides what is labelled
    next. After the seed fit and after every taught record the learner is
    evaluated on the notebook-level ``X_test`` / ``y_tru``.

    Parameters
    ----------
    clf : estimator
        Classifier wrapped by the ActiveLearner.
    sampling : callable
        Query strategy passed to the ActiveLearner.
    X_initial, y_initial : ndarray
        Initially labelled samples and labels.
    X_re, y_re : ndarray
        Remaining pool samples and labels, in teaching order.

    Returns
    -------
    pandas.DataFrame
        Columns ``kappa`` and ``f1``; row 0 is the score before any record is
        taught, row ``i`` the score after ``i`` records. There are
        ``min(N_QUERIES, len(X_re)) + 1`` rows.
    """
    learner = al(clf, sampling, X_initial.copy(), y_initial.copy())

    def _evaluate():
        # Score the current learner on the held-out test split.
        y_pre = learner.predict(X_test)
        return cohen_kappa_score(y_tru, y_pre), f1_score(y_tru, y_pre)

    kappa, f1 = _evaluate()
    kappa_history = [kappa]
    f1_history = [f1]

    # Never run past the end of the pool (indexing an exhausted pool raised
    # IndexError before).
    n_queries = min(N_QUERIES, len(X_re))
    if n_queries < N_QUERIES:
        print("pool holds only", n_queries, "records; stopping after that many queries")

    # Walk the pool by index instead of deleting row 0 each round: the records
    # are taught in the same order, without copying the whole pool every time.
    for index in range(n_queries):
        X, y = X_re[index].reshape(1, -1), y_re[index].reshape(1, )
        learner.teach(X=X, y=y)

        kappa, f1 = _evaluate()
        print(index+1,"-------------------->",kappa)
        print(index+1,"-------------------->",f1)
        kappa_history.append(kappa)
        f1_history.append(f1)

    return pd.DataFrame({'kappa': kappa_history, 'f1': f1_history})

In [None]:
# metric
def computeMetric(y_tru,y_pre):
    """Print a set of classification metrics and return Cohen's kappa.

    Computes accuracy, precision, recall, confusion matrix, F1, Matthews
    correlation coefficient, Cohen's kappa and balanced accuracy for the given
    true / predicted labels, prints each one, and returns the kappa value.
    """
    # Compute everything first so nothing is printed if a metric call fails.
    scores = {
        "acc:": accuracy_score(y_tru, y_pre),
        "precision:": precision_score(y_tru, y_pre),
        "recall:": recall_score(y_tru, y_pre),
        "cm:": confusion_matrix(y_tru, y_pre),
        "f1:": f1_score(y_tru, y_pre),
        "MCC:": matthews_corrcoef(y_tru, y_pre),
        "Kappa:": cohen_kappa_score(y_tru, y_pre),
        "balanced acc:": balanced_accuracy_score(y_tru, y_pre),
    }

    report_order = ("acc:", "balanced acc:", "precision:", "recall:",
                    "cm:", "f1:", "MCC:", "Kappa:")
    for label in report_order:
        print(label, scores[label])

    return scores["Kappa:"]

In [None]:
#clf4 catboost
# Baseline: fit CatBoost (Logloss) on the full training set, using the first
# six columns as features and column index 6 as the label, then report its
# metrics on the test split. The cell's last expression displays the kappa
# returned by computeMetric.
clf4 = CatBoostClassifier(loss_function='Logloss')
clf4.fit(train_set1.values, train_set.iloc[:,6].values)
y_pre=clf4.predict(X_test)
computeMetric(y_tru,y_pre)

In [None]:
def al(clf,strategy,X_L,y_L):
    """Build a modAL ``ActiveLearner`` around ``clf``.

    ``strategy`` becomes the learner's query strategy and ``X_L`` / ``y_L`` are
    supplied as its initial training data.
    """
    return ActiveLearner(
        estimator=clf,
        query_strategy=strategy,
        X_training=X_L,
        y_training=y_L,
    )

# auto

In [None]:
# Re-read the training frame's shape; y (column count) drives the slicing below.
x,y=train_set.shape


#pool: autoencoder-ranked training rows, features = all but the last column,
# label = last column, converted to NumPy arrays
X_Pool = train_data_auto.iloc[:,0:y-1].values
y_Pool = train_data_auto.iloc[:,y-1].values


# Top n_initial ranked rows seed the learner; the rest stay in the pool.
X_in_au,y_in_au,X_re_au,y_re_au=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_au),len(X_re_au))

In [None]:
# Query strategy handed to every ActiveLearner built below.
sampling=uncertainty_sampling

In [None]:
# Run the teaching loop on the autoencoder-ranked pool (reuses the clf4 estimator).
metric1 = al_learn(clf4,sampling,X_in_au,y_in_au,X_re_au,y_re_au)

In [None]:
# Prefix the score columns so they stay distinguishable after the final concat.
# In-place rename: re-running this cell alone is a no-op once the columns are renamed.
metric1.rename(columns = {'kappa' : 'Auto_Kappa', 'f1' : 'Auto_F1'}, inplace = True)

In [None]:
# Display the autoencoder-run score history.
metric1

In [None]:
fig, ax = plt.subplots(figsize=(8.5, 10))
# x axis: number of labelled instances. It starts at the size of the seed set
# actually used for this run and grows by one per taught record, so it always
# matches the length of metric1 (the old hard-coded 1000..2001 range did not
# match the seed size and produced non-integer steps).
x = len(X_in_au) + np.arange(len(metric1))
# One labelled line per score column so the legend has entries to show.
for column in metric1.columns:
    ax.plot(x, metric1[column], label=column)
ax.legend(loc = "best")
# Both Kappa and F1 are drawn, so the title and y label name both.
ax.set_title('Kappa and F1 of AL over Time')
ax.set_xlabel('Number of Queried Instance')
ax.set_ylabel('Score')
ax.set_xticks(np.arange(x[0], x[-1] + 1, 50))
plt.show()

# iforest

In [None]:
#pool: Isolation-Forest-ranked training rows, features = all but the last
# column, label = last column (y is the column count set in an earlier cell)
X_Pool = train_data_if.iloc[:,0:y-1].values
y_Pool = train_data_if.iloc[:,y-1].values

# NOTE(review): this overrides the n_initial = 50 set in the configuration
# cell, and the new value also applies to every later cell. The autoencoder
# run above used the earlier value, so the runs are seeded with different
# sizes although their scores are later combined into one table and saved as
# "random_low_un_50_top.csv" -- confirm this is intended.
n_initial=300
X_in_if,y_in_if,X_re_if,y_re_if=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_if),len(X_re_if))

In [None]:
# Run the teaching loop on the Isolation-Forest-ranked pool (reuses the clf4 estimator).
metric2 = al_learn(clf4,sampling,X_in_if,y_in_if,X_re_if,y_re_if)

In [None]:
# Prefix the score columns so they stay distinguishable after the final concat (in place).
metric2.rename(columns = {'kappa' : 'IF_Kappa', 'f1' : 'IF_F1'}, inplace = True)

In [None]:
# Display the Isolation-Forest-run score history.
metric2

# oneclasssvm

In [None]:
#pool: OCSVM-ranked training rows, features = all but the last column,
# label = last column
X_Pool = train_data_oc.iloc[:,0:y-1].values
y_Pool = train_data_oc.iloc[:,y-1].values


# Uses whatever n_initial currently holds; when the notebook runs top to
# bottom that is the 300 assigned in the iforest pool cell, not the 50 from
# the configuration cell.
X_in_oc,y_in_oc,X_re_oc,y_re_oc=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_oc),len(X_re_oc))

In [None]:
# Run the teaching loop on the OCSVM-ranked pool (reuses the clf4 estimator).
metric3 = al_learn(clf4,sampling,X_in_oc,y_in_oc,X_re_oc,y_re_oc)

In [None]:
# Prefix the score columns so they stay distinguishable after the final concat (in place).
metric3.rename(columns = {'kappa' : 'OCSVM_Kappa', 'f1' : 'OCSVM_F1'}, inplace = True)

# all

In [None]:
# Place the three score histories side by side (one row per teaching step,
# aligned on the row index) and display the combined table.
metrics=pd.concat([metric1,metric2,metric3],axis=1)
metrics

In [None]:
# Persist the combined score table to the working directory.
metrics.to_csv("random_low_un_50_top.csv")