In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

#unsupervised
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM
from tensorflow import keras

#supervised
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier


#al
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling,margin_sampling,entropy_sampling
from modAL.disagreement import KL_max_disagreement
from modAL.batch import uncertainty_batch_sampling

#metric
from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import matthews_corrcoef,cohen_kappa_score,balanced_accuracy_score
from sklearn.metrics import make_scorer

#warnings
import warnings

warnings.filterwarnings('ignore')

2023-04-26 12:24:22.888889: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
float_cd='PR_PF_4903220'


train_set=pd.read_csv('./randomtrain/'+float_cd+'.csv')
test_set=pd.read_csv('./randomtest/'+float_cd+'.csv')
print(len(train_set),len(test_set))

x,y=train_set.shape
#test set
X_test=test_set.iloc[:,0:y-2]
y_tru=test_set.iloc[:,y-1]
print(len(X_test),len(y_tru))

196984 84422
84422 84422


In [66]:
test_set['label'].unique()

array([0, 1])

In [69]:
train_set1=train_set.iloc[:,:y-2]
test_set1=train_set.iloc[:,:y-1]

In [5]:
#this function use random grid search to find best parameters
def getPar(model,dist,data,niter):
    x,y=data.shape
    clf = model
    param_dist = dist
    scoring = make_scorer(f1_score)
    grid_search = GridSearchCV(clf, param_dist, cv=5, scoring=scoring)
    grid_search.fit(data)
    print("Best parameters:", grid_search.best_params_)
    print("Best score:", grid_search.best_score_)
    
    return grid_search.best_params_

# iforest

In [6]:
iforest = IsolationForest(n_estimators=100, contamination='auto')
params = {'n_estimators': [10, 50, 100],
          'max_samples': [0.1, 0.5, 1.0],
          'contamination': [0.01, 0.05, 0.1]}

para=getPar(iforest,params,train_set1,10)


iforest = IsolationForest(n_estimators=para['n_estimators'],max_samples=para['max_samples'],
                                  contamination=para['contamination'])

Best parameters: {'contamination': 0.01, 'max_samples': 0.1, 'n_estimators': 10}
Best score: nan


In [7]:
# compute score
iforest.fit(train_set1)
if_scores = -iforest.score_samples(train_set1)

In [8]:
# combine
if_result = pd.concat([train_set,pd.Series(if_scores, name='score')], axis=1)

# ascending by score
if_data=if_result.sort_values(by='score', ascending=False)
if_data.to_csv("if_score_"+float_cd+"_top.csv")

In [9]:
train_data_if=if_data.iloc[:,:7]
train_data_if

Unnamed: 0,Date,Latitude,longitude,Pressure,Salinity,Temperature,QC
112621,-1.599904,-1.507198,-2.358760,2.247687,-1.357423,-1.537409,0
77384,-1.599904,-1.507198,-2.358760,-0.593580,0.577220,0.617612,0
196135,-1.577288,-1.570301,-2.242643,-0.584182,0.544927,0.584851,0
173222,-1.599904,-1.507198,-2.358760,-0.590447,0.559606,0.599520,0
6083,-1.599904,-1.507198,-2.358760,2.244555,-1.356445,-1.536920,0
...,...,...,...,...,...,...,...
79718,-0.764106,-0.149850,0.242207,-0.801020,0.851221,0.797187,0
55575,-0.764106,-0.149850,0.242207,-0.800143,0.849264,0.796821,0
181690,-0.764106,-0.149850,0.242207,-0.801396,0.854157,0.797799,0
162503,-0.764106,-0.149850,0.242207,-0.799955,0.858071,0.795354,0


# autoencoder

In [10]:
# normalization
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train_set1)

#  Autoencoder model
input_dim = scaled_data.shape[1]
encoding_dim = 1
hidden_dim = 2
output_dim = input_dim
input_layer = keras.layers.Input(shape=(input_dim,))
encoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(input_layer)
encoder_layer2 = keras.layers.Dense(encoding_dim, activation='relu')(encoder_layer1)
decoder_layer1 = keras.layers.Dense(hidden_dim, activation='relu')(encoder_layer2)
decoder_layer2 = keras.layers.Dense(output_dim, activation=None)(decoder_layer1)
autoencoder = keras.models.Model(inputs=input_layer, outputs=decoder_layer2)

2023-04-26 12:51:56.000412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# compile and fit
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(train_set1, train_set1, epochs=50, batch_size=16, verbose=0)

<keras.callbacks.History at 0x7fbd90a9b7c0>

In [12]:
# compute
reconstructed_data = autoencoder.predict(train_set1)
mse = np.mean(np.power(train_set1 - reconstructed_data, 2), axis=1)
anomaly_scores = pd.Series(mse, name='anomaly_score')



In [18]:
# combine
auto_result = pd.concat([train_set, anomaly_scores], axis=1)

auto_data = auto_result.sort_values(by='anomaly_score', ascending=False)
auto_data.to_csv("auto_score_"+float_cd+"_top.csv")

In [14]:
train_data_auto=auto_data.iloc[:,:7]
train_data_auto

Unnamed: 0,Date,Latitude,longitude,Pressure,Salinity,Temperature,QC
160468,-1.667736,-1.815928,-2.436509,2.246685,-1.356445,-1.534842,0
52410,-1.667736,-1.815928,-2.436509,2.244680,-1.355466,-1.534475,0
110982,-1.667736,-1.815928,-2.436509,2.241422,-1.355466,-1.534231,0
18487,-1.667736,-1.815928,-2.436509,2.238289,-1.355466,-1.533497,0
155942,-1.667736,-1.815928,-2.436509,2.235157,-1.355466,-1.533131,0
...,...,...,...,...,...,...,...
118592,-0.289869,0.047667,0.298992,-0.684425,0.598749,0.600254,0
117972,-0.289869,0.047667,0.298992,-0.706353,0.722049,0.719808,0
2692,-0.289869,0.047667,0.298992,-0.687620,0.621256,0.618590,0
185266,-0.289869,0.047667,0.298992,-0.703221,0.693671,0.696704,0


# OneClassSVM

In [15]:
ocsvm = OneClassSVM(kernel='rbf', nu=0.1)
ocsvm.fit(train_set1)

In [16]:
scores3 = -ocsvm.decision_function(train_set1)

In [17]:
# combine
oc_result = pd.concat([train_set, pd.Series(scores3, name='score')], axis=1)

oc_data=oc_result.sort_values(by='score', ascending=False)
oc_data.to_csv("./other/oc_score_"+float_cd+"_top.csv")
train_data_oc=oc_data.iloc[:,:7]
train_data_oc


# auto_data = auto_result.sort_values(by='anomaly_score', ascending=False)
# train_data_auto=auto_data.iloc[:,:7]
# train_data_auto

Unnamed: 0,Date,Latitude,longitude,Pressure,Salinity,Temperature,QC
164831,0.748975,1.971231,0.530896,2.247311,-1.322195,-1.495113,0
174379,0.748975,1.971231,0.530896,2.244617,-1.321216,-1.494624,0
161085,0.748975,1.971231,0.530896,2.241359,-1.321216,-1.493524,0
73079,0.748975,1.971231,0.530896,2.238227,-1.320238,-1.493035,0
129721,0.748975,1.971231,0.530896,2.235157,-1.320238,-1.492179,0
...,...,...,...,...,...,...,...
34442,1.110278,0.795591,0.313014,-0.596775,0.326705,0.357724,0
161525,1.110278,0.795591,0.313014,-0.615445,0.394227,0.397453,0
124859,1.110278,0.795591,0.313014,-0.593642,0.318876,0.352590,0
49630,1.110278,0.795591,0.313014,-0.562316,0.254290,0.314694,0


# active learning

In [51]:
#n_initial= 100
n_initial = 1100
N_QUERIES = 1000

In [52]:
#Pool
def initial_data(n_initial,X_Pool,y_Pool):
    #inital
    #initial Labeled data
    #X_initial, y_initial = X_train[], y_train[initial_idx]
    X_L = X_Pool[:n_initial]
    y_L = y_Pool[:n_initial]
    # Unlabeled data
    X_U = X_Pool[n_initial:]
    y_U = y_Pool[n_initial:]
    return X_L,y_L,X_U,y_U

In [53]:
def al_learn(clf,sampling,X_initial,y_initial,X_re,y_re):
    X_L = X_initial.copy()
    y_L = y_initial.copy()
    X_U,y_U =X_re.copy(),y_re.copy()
    #print(len(X_U),len(y_U))
    learner = al(clf,sampling,X_L,y_L)
    y_pre=learner.predict(X_test)
    unqueried_kappa=cohen_kappa_score(y_tru, y_pre)
    unqueried_f1=f1_score(y_tru,y_pre)
    #print("unqueried --------------------->",unqueried_score)
    kappa_history = [unqueried_kappa]
    f1_history = [unqueried_f1]
    
    # Query
    for index in range(N_QUERIES):
        query_index=0
        # Teach ActiveLearner model the record it has requested.
        X, y = X_U[query_index].reshape(1, -1), y_U[query_index].reshape(1, )
        #print(index+1,"query label --------------------->",y)
        learner.teach(X=X, y=y)
        
        X_U, y_U = np.delete(X_U, query_index, axis=0), np.delete(y_U, query_index)
        y_pre=learner.predict(X_test)
        kappa=cohen_kappa_score(y_tru, y_pre)
        f1=f1_score(y_tru,y_pre)
        print(index+1,"-------------------->",kappa)
        print(index+1,"-------------------->",f1)
        # Recall precision F1
        kappa_history.append(kappa)
        f1_history.append(f1)
    #print(len(X_U))
    #draw(performance_history)
    df_scores= pd.concat([pd.DataFrame(kappa_history,columns=['kappa']), 
                          pd.DataFrame(f1_history,columns=['f1'])],
                         axis=1)
    return df_scores 

In [54]:
# metric
def computeMetric(y_tru,y_pre):
    acc = accuracy_score(y_tru,y_pre)
    pre=precision_score(y_tru,y_pre)
    recall=recall_score(y_tru,y_pre)
    cm=confusion_matrix(y_tru,y_pre)
    f1 = f1_score(y_tru,y_pre)
    mcc=matthews_corrcoef(y_tru, y_pre)
    kappa=cohen_kappa_score(y_tru, y_pre)
    bac=balanced_accuracy_score(y_tru,y_pre)
    print("acc:",acc)
    print("balanced acc:",bac)
    print("precision:",pre)
    print("recall:",recall)
    print("cm:",cm)
    print("f1:",f1)
    print("MCC:", mcc)
    print("Kappa:",kappa)
    
    # confusion matrix
#     cmap1 = sns.diverging_palette(260,-10,s=50, l=75, n=5, as_cmap=True)
#     plt.subplots(figsize=(12,8))
#     cf_matrix = confusion_matrix(y_tru, y_pre)
#     sns.heatmap(cf_matrix/np.sum(cf_matrix), cmap = cmap1, annot = True, annot_kws = {'size':15})
    
    return kappa

In [58]:
train_set1

Unnamed: 0,Date,Latitude,longitude,Pressure,Salinity,Temperature
0,-0.696323,-0.838387,0.342557,-0.864737,0.724985,0.923831
1,-1.170596,-1.355354,-1.381377,-0.887167,1.212316,1.334934
2,-0.064092,0.172312,0.902710,1.539720,-1.194980,-1.338397
3,0.071427,0.769677,1.053827,-0.875952,0.910914,0.821391
4,1.607057,0.589603,-0.269866,-0.885475,0.917764,0.771150
...,...,...,...,...,...,...
196979,0.229547,-0.440313,0.968438,1.702552,-1.245866,-1.405509
196980,1.358658,0.104388,0.231578,-0.820129,1.226016,1.240318
196981,-1.035138,-1.011625,-0.641814,-0.804090,1.136965,1.071990
196982,-0.109225,-0.263815,0.622315,0.321139,-0.955229,-0.741485


In [61]:
train_set.iloc[:,]

0         0.923831
1         1.334934
2        -1.338397
3         0.821391
4         0.771150
            ...   
196979   -1.405509
196980    1.240318
196981    1.071990
196982   -0.741485
196983   -0.276473
Name: Temperature, Length: 196984, dtype: float64

In [70]:
#clf4 catboost
clf4 = CatBoostClassifier(loss_function='Logloss')
clf4.fit(train_set1.values, test_set1.values)
y_pre=clf4.predict(X_test)
computeMetric(y_tru,y_pre)

CatBoostError: catboost/private/libs/target/data_providers.cpp:612: Currently only multi-regression, multilabel and survival objectives work with multidimensional target

In [45]:
def al(clf,strategy,X_L,y_L):
    learner = ActiveLearner(estimator=clf,
                            query_strategy=strategy,
                            X_training=X_L, y_training=y_L)
    return learner

# auto

In [None]:
x,y=train_set.shape


#pool 
X_Pool = train_data_auto.iloc[:,0:y-1].values
y_Pool = train_data_auto.iloc[:,y-1].values


X_in_au,y_in_au,X_re_au,y_re_au=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_au),len(X_re_au))

In [None]:
sampling=uncertainty_sampling

In [None]:
metric1 = al_learn(clf4,sampling,X_in_au,y_in_au,X_re_au,y_re_au)

In [None]:
metric1.rename(columns = {'kappa' : 'Auto_Kappa', 'f1' : 'Auto_F1'}, inplace = True)

In [None]:
metric1

In [None]:
fig, ax = plt.subplots(figsize=(8.5, 10))
x = np.linspace(1000, 2001, 1001)
plt.plot(x,metric1)
plt.legend(loc = "best")
plt.title('Kappa of AL over Time')#Kappa F1
plt.xlabel('Number of Queried Instance')
plt.ylabel('Kappa')
my_x_ticks = np.arange(1000, 2001, 50)
plt.xticks(my_x_ticks)
plt.show()

# iforest

In [None]:
#pool 
X_Pool = train_data_if.iloc[:,0:y-1].values
y_Pool = train_data_if.iloc[:,y-1].values

n_initial=300
X_in_if,y_in_if,X_re_if,y_re_if=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_if),len(X_re_if))

In [None]:
metric2 = al_learn(clf4,sampling,X_in_if,y_in_if,X_re_if,y_re_if)

In [None]:
metric2.rename(columns = {'kappa' : 'IF_Kappa', 'f1' : 'IF_F1'}, inplace = True)

In [None]:
metric2

# oneclasssvm

In [None]:
#pool 
X_Pool = train_data_oc.iloc[:,0:y-1].values
y_Pool = train_data_oc.iloc[:,y-1].values


X_in_oc,y_in_oc,X_re_oc,y_re_oc=initial_data(n_initial,X_Pool,y_Pool)
print(len(X_in_oc),len(X_re_oc))

In [None]:
metric3 = al_learn(clf4,sampling,X_in_oc,y_in_oc,X_re_oc,y_re_oc)

In [None]:
metric3.rename(columns = {'kappa' : 'OCSVM_Kappa', 'f1' : 'OCSVM_F1'}, inplace = True)

# all

In [None]:
metrics=pd.concat([metric1,metric2,metric3],axis=1)
metrics

In [None]:
metrics.to_csv("random_low_un_50_top.csv")