In [None]:
from tf_ann_model.processing.data_management import load_dataset
# Load the two AWID CSV files by name through the project's load_dataset helper.
# `train` and `test` are kernel-global names reused by the cells below.
train = load_dataset(file_name='AWID-CLS-R-Trn.csv')
test = load_dataset(file_name='AWID-CLS-R-Tst.csv')

In [None]:
from tf_ann_model.processing.data_management import prepare_data

# Split each loaded dataset into model inputs and targets via prepare_data.
# presumably the targets come back one-hot encoded (a later cell takes
# np.argmax(..., axis=1) on them) — TODO confirm against prepare_data.
X_train, y_train = prepare_data(train)
X_test, y_test = prepare_data(test)

In [None]:
from tf_ann_model.processing.data_management import load_pipeline_keras
# Obtain the pipeline object from load_pipeline_keras() (called with no arguments);
# later cells call .predict() on it and slice its .steps.
tf_ann_pipe = load_pipeline_keras()

In [None]:
# Run the full pipeline on both splits.
# presumably the predictions are integer class indices, since a later cell passes
# them straight to encoder.inverse_transform — TODO confirm.
y_train_pred = tf_ann_pipe.predict(X_train)
y_test_pred = tf_ann_pipe.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import numpy as np
# Turn targets and predictions back into label values so the reports below compare
# like with like.
# NOTE(review): `encoder` is not defined in any cell above this one; in this notebook
# it is only assigned in the later training cell (`encoder = LabelEncoder()`), so on a
# fresh kernel (Restart & Run All) this cell raises NameError. Confirm where the
# encoder is meant to come from and that it matches the encoding used by prepare_data.
# NOTE(review): every name is overwritten in place, so the cell is not idempotent —
# re-running it applies argmax / inverse_transform to already-decoded values.

# Collapse each target row to the index of its largest entry (axis=1), then decode.
y_train = np.argmax(y_train, axis=1)
y_train = encoder.inverse_transform(y_train)
y_test = np.argmax(y_test, axis=1)
y_test = encoder.inverse_transform(y_test)

# Predictions are decoded directly, without an argmax step.
y_train_pred = encoder.inverse_transform(y_train_pred)
y_test_pred = encoder.inverse_transform(y_test_pred)

In [None]:
# Text classification report for the training split (true labels vs. predictions).
print(classification_report(y_train, y_train_pred))

In [None]:
# Confusion matrix for the training split, drawn as an annotated heatmap.
labels = ['flooding', 'impersonation', 'injection', 'normal']
# `labels` has to be passed by keyword: in current scikit-learn every argument
# after (y_true, y_pred) is keyword-only, so the positional form raises TypeError.
cm = confusion_matrix(y_train, y_train_pred, labels=labels)
fig, ax = plt.subplots(figsize=(10, 10))
# annot=True writes the count in each cell; fmt='d' prints the integer counts in
# full instead of seaborn's default '.2g', which abbreviates large counts to
# scientific notation.
sns.heatmap(cm, annot=True, fmt='d', ax=ax);

# labels, title and ticks
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(labels); ax.yaxis.set_ticklabels(labels);

In [None]:
# Text classification report for the test split (true labels vs. predictions).
print(classification_report(y_test, y_test_pred))

In [None]:
# Confusion matrix for the test split, drawn as an annotated heatmap.
labels = ['flooding', 'impersonation', 'injection', 'normal']
# `labels` has to be passed by keyword: in current scikit-learn every argument
# after (y_true, y_pred) is keyword-only, so the positional form raises TypeError.
cm = confusion_matrix(y_test, y_test_pred, labels=labels)
fig, ax = plt.subplots(figsize=(10, 10))
# annot=True writes the count in each cell; fmt='d' prints the integer counts in
# full instead of seaborn's default '.2g', which abbreviates large counts to
# scientific notation.
sns.heatmap(cm, annot=True, fmt='d', ax=ax);

# labels, title and ticks
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(labels); ax.yaxis.set_ticklabels(labels);

In [None]:
from sklearn.pipeline import Pipeline

# Build a second pipeline from every step of tf_ann_pipe except the last two.
# presumably the dropped steps are the scaler/estimator tail, leaving only the
# feature-engineering stages — TODO confirm against the pipeline definition.
data_pipe = Pipeline(tf_ann_pipe.steps[:-2])
# Bare last expression: shows the retained steps as the cell output.
data_pipe.steps

In [None]:
# get_target is imported here because its only other import in this notebook sits
# in the later training cell, which has not run yet on a fresh kernel.
from tf_ann_model.processing.data_management import get_target

# Reload the raw test file and rebuild the (features, raw label) split by hand.
test = load_dataset(file_name='AWID-CLS-R-Tst.csv')
# NOTE(review): `downsample` is not imported or defined in any visible cell, so this
# line raises NameError on a fresh kernel — add the import from its real module.
# NOTE(review): if downsample() samples rows randomly, test_ds may not contain the
# same rows, in the same order, as the data y_test_pred was computed from — confirm.
test_ds = downsample(test)
# Rebinds X_test / y_test; y_test now holds whatever get_target returns for test_ds.
X_test, y_test = get_target(test_ds)

In [None]:
# Collect the test rows whose true label differs from the predicted label.
# NOTE(review): y_test comes from the downsample()/get_target() cell, while
# y_test_pred was predicted from prepare_data(test); the comparison assumes both
# describe the same rows in the same order — confirm.
X_test_bad = test_ds[y_test != y_test_pred]
y_test_bad = y_test[y_test != y_test_pred]
# Push the misclassified rows through the truncated (preprocessing-only) pipeline.
X_test_bad = data_pipe.transform(X_test_bad)
# Attach the true label and the capture timestamp for the profiling plots below.
# presumably transform() returns a DataFrame that keeps the original row index,
# otherwise these column assignments would not align — TODO confirm.
X_test_bad['class'] = y_test_bad
X_test_bad['frame.time_epoch'] = test_ds['frame.time_epoch']

In [None]:
import matplotlib.pyplot as plt
from IPython.core.display import HTML
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd

In [None]:
from tf_ann_model.processing.feat_eng_categ import one_hot_encoder
# One-hot encode the 'class' column with the project's encoder; a later cell
# expects the resulting class_normal / class_injection / class_impersonation /
# class_flooding columns.
ohe = one_hot_encoder(features='class')
# NOTE(review): X_test_bad is overwritten with its own transform, so this cell is
# not idempotent — re-running it feeds the already-encoded frame back in.
X_test_bad = ohe.fit_transform(X_test_bad)

In [None]:
def h(content):
    """Render the HTML string ``content`` in the notebook output area."""
    markup = HTML(content)
    display(markup)
    
def timehist(df, tcol, target, col, target_first, clipping=9999999999999999, concat_df = False, odf = None):
    """Scatter ``col`` against ``tcol`` as two point series, one per target value.

    Rows with ``target == 1`` and rows with ``target == 0`` are plotted as
    separate dot series; ``target_first`` decides which of the two is drawn
    first. Values of ``col`` are clipped to ``[0, clipping]`` before plotting.

    Parameters
    ----------
    df : DataFrame holding ``tcol``, ``target`` and ``col``.
    tcol : column used as the x axis (set as the index before plotting).
    target : column compared against 1 and 0.
    col : column whose values are plotted.
    target_first : ``True`` draws the ``target == 1`` series first, ``False``
        draws ``target == 0`` first. Any value equal to neither draws nothing.
    clipping : upper bound passed to ``Series.clip``.
    concat_df : when true, ``odf`` is concatenated onto ``df`` before plotting.
    odf : optional second DataFrame used only when ``concat_df`` is true.
    """
    if concat_df == True:
        df = pd.concat([df, odf])

    title = target + ' Hist ' + col

    # The equality tests (rather than truthiness) are deliberate: a value that is
    # neither True nor False falls through and nothing is plotted.
    if target_first == True:
        draw_order, extra = (1, 0), {}
    elif target_first == False:
        # Only this ordering also hands the title to pandas' plot call.
        draw_order, extra = (0, 1), {'title': title}
    else:
        return

    for target_value in draw_order:
        subset = df[df[target] == target_value]
        series = subset.set_index(tcol)[col].clip(0, clipping)
        series.plot(style='.', figsize=(15, 3), **extra)
    plt.title(title)
    plt.show()

In [None]:
def _desc(data, col, label):
    d0 = data.describe().reset_index()
    d0.columns = [col, label]
    return d0.append({col:'unique values', label:data.unique().shape[0]}, ignore_index=True) \
             .append({col:'NaNs', label:data.isnull().sum()}, ignore_index=True) \
             .append({col:'NaNs share', label:np.round(data.isnull().sum() / data.shape[0], 4)}, ignore_index=True) \

def desc(df_train, col, target, include_test=False, df_test=None):
    """Display summary tables for ``col``, overall and split by ``target``.

    Two kinds of output are shown:

    1. One merged table of ``_desc`` statistics for the whole column, for rows
       with ``target == 1`` and for rows with ``target == 0``. With
       ``include_test`` the same three summaries of ``df_test`` are merged in.
    2. The 10 most frequent values of ``col`` with the row count, mean and sum
       of ``target`` for each, for ``df_train`` and (with ``include_test``)
       for ``df_test``.

    Parameters
    ----------
    df_train : DataFrame that is always summarised.
    col : column being described.
    target : column compared against 1 and 0 and aggregated per value.
    include_test : whether ``df_test`` is summarised as well.
    df_test : second DataFrame; only read when ``include_test`` is true.
    """
    top_n = 10

    def _split_stats(frame, all_label, pos_label, neg_label):
        # Whole column first, then the target == 1 and target == 0 subsets.
        return [
            _desc(frame[col], col, all_label),
            _desc(frame.loc[frame[target] == 1, col], col, pos_label),
            _desc(frame.loc[frame[target] == 0, col], col, neg_label),
        ]

    def _top_values(frame, renames):
        # Per-value row count plus mean and sum of the target, largest count first.
        counts = frame[[target, col]].groupby(col)[target].agg(['size', 'mean', 'sum'])
        counts = counts.reset_index().sort_values('size', ascending=False).reset_index(drop=True)
        return counts.head(top_n).rename(renames, axis=1)

    tables = _split_stats(df_train, 'Train', 'Train normal', 'Train not normal')
    if include_test:
        tables += _split_stats(df_test, 'Test', 'Test normal', 'Test not normal')

    # Successive merges join the tables on the column they share.
    summary = tables[0]
    for table in tables[1:]:
        summary = summary.merge(table)
    display(summary)

    h('<b>Most popular values (NaN = -999):</b>')
    train_renames = {'size':'Count in train (desc)','mean':'Mean target train','sum':'Sum target train'}
    display(_top_values(df_train, train_renames))

    if include_test:
        test_renames = {'size':'Count in test (desc)','mean':'Mean target test','sum':'Sum target test'}
        display(_top_values(df_test, test_renames))

In [None]:
def hist1(df,col):
    """Show a 70-bin histogram of ``df[col]`` on a wide 15x3 figure."""
    axes = plt.figure(figsize=(15, 3)).gca()
    axes.hist(df[col], bins=70)
    axes.set_title('Train histogram: ' + col)
    plt.show()

In [None]:
def corr1(df,col):
    """Display the columns most and least correlated with ``col`` as HTML.

    Correlations of every non-object column with ``df[col]`` are sorted in
    descending order. The first 6 rows and the last 5 non-NaN rows are shown,
    with each column name rendered as a link to the ``#c_<name>`` anchor.
    """
    row_limit = None #10000
    numeric_cols = [name for name in df.columns if df[name].dtype != 'object']
    if row_limit is not None:
        sample = df.head(row_limit)
    else:
        sample = df.copy()

    corrs = sample[numeric_cols].corrwith(sample[col]).reset_index()
    corrs = corrs.sort_values(0, ascending=False).reset_index(drop=True)
    corrs = corrs.rename({'index':'Column', 0:'Correlation with ' + col}, axis=1)

    h('<b>Most correlated values with ' + col + ':</b>')

    # Top of the ranking plus the bottom of it once NaN correlations are dropped.
    extremes = pd.concat([corrs.head(6), corrs.dropna().tail(5)])
    extremes['Column'] = extremes['Column'].apply(
        lambda name: '<a href="#c_{}">{}</a>'.format(name, name)
    )
    h(extremes.to_html(escape=False))

In [None]:
def numeric(df_input, tcol_input, target_input, col_input, target_first_input, df_test, include_test):
    """Run the full profile for a numeric column.

    In order: the time scatter (``timehist``), the histogram (``hist1``), the
    summary tables (``desc``) and the correlation ranking (``corr1``).
    ``include_test`` doubles as ``timehist``'s ``concat_df`` flag and
    ``df_test`` as its ``odf`` frame.
    """
    timehist(
        df_input,
        tcol_input,
        target_input,
        col_input,
        target_first_input,
        concat_df=include_test,
        odf=df_test,
    )
    hist1(df=df_input, col=col_input)
    desc(
        df_train=df_input,
        col=col_input,
        target=target_input,
        include_test=include_test,
        df_test=df_test,
    )
    corr1(df=df_input, col=col_input)

In [None]:
def categorical(df, col, target, df_test, include_test):
    """Profile a categorical column: only the ``desc`` summary tables are shown."""
    desc(
        df_train=df,
        col=col,
        target=target,
        include_test=include_test,
        df_test=df_test,
    )

In [None]:
def proc(df, tcol, target, col, target_first, df_test, include_test):
    """Emit the section header for ``col`` and dispatch to the right profiler.

    Columns named ``'isFraud'`` or ``'TransactionDT'`` are skipped entirely.
    Otherwise an ``<h3>`` header carrying the ``c_<col>`` anchor is displayed,
    then object-dtype columns go to ``categorical`` and all others to ``numeric``.
    """
    if col in ['isFraud','TransactionDT']:
        return

    heading = '<h3 id="c_' + col + '">' + col + '</h3>'
    jump_link = '<a style="font-size:11px" href="#home">(Jump to top)</a>'
    h(heading + jump_link)

    if df[col].dtype == 'object':
        categorical(df, col, target, df_test, include_test)
    else:
        numeric(df, tcol, target, col, target_first, df_test, include_test)

In [None]:
# Columns to profile: every column of X_test_bad except the timestamp and the
# four class indicator columns.
columns = list(X_test_bad.columns)
# list.remove raises ValueError if one of these columns is missing, so this also
# acts as a check that all five are present.
for x in ['frame.time_epoch', 'class_normal', 'class_injection', 'class_impersonation', 'class_flooding']:
    columns.remove(x)

In [None]:
# Profile each remaining column of the misclassified rows, with frame.time_epoch as
# the time axis and class_normal as the 0/1 target. No second frame is merged in.
# presumably class_normal == 1 marks rows whose true class is 'normal' — TODO confirm
# against the one_hot_encoder output.
for col in columns:
    proc(df=X_test_bad, tcol='frame.time_epoch', target='class_normal', col=col, target_first=True, df_test=None, include_test=False)

In [None]:
# Bare expression: shows (rows, columns) of the misclassified-rows frame.
X_test_bad.shape

In [None]:
# Print every column name of X_test_bad, one per line.
# Note: this rebinds the kernel-global name `col`, which the profiling loop also uses.
for col in X_test_bad.columns:
    print(col)

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tf_ann_model.config import config
from tf_ann_model.processing.data_management import load_dataset, get_target
from tf_ann_model import pipeline
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [None]:
# Load the training and validation files named in the project config.
data = load_dataset(file_name=config.TRAINING_DATA_FILE)
val = load_dataset(file_name=config.TESTING_DATA_FILE)

# Split each frame into features and target.
# Note: this rebinds X_train / y_train, replacing the values the evaluation cells
# above worked with.
X_train, y_train = get_target(data)
X_val, y_val = get_target(val)
    
# Fit the label encoder on the training targets only and reuse it for validation.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_val = encoder.transform(y_val)

# One-hot encode the integer labels into 4 columns.
# NOTE(review): num_classes=4 is hard-coded; presumably it matches the four AWID
# classes listed in the confusion-matrix cells — confirm it equals len(encoder.classes_).
y_train = tf.keras.utils.to_categorical( y_train, num_classes=4)
y_val = tf.keras.utils.to_categorical( y_val, num_classes=4)

In [None]:
# Fit the feature-engineering pipeline on the training features only, then apply
# the fitted pipeline to the validation features.
# Note: X_val is overwritten with its transformed version, so re-running this cell
# transforms already-transformed data.
pipeline.fe_pipe.fit(X_train)
X_val = pipeline.fe_pipe.transform(X_val)

In [None]:
# Bare expression: shows the shape of the transformed validation features.
X_val.shape