In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import HTML
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import Pipeline

In [2]:
from classification_model.processing.data_management import load_dataset

In [3]:
train_orig = load_dataset(file_name='AWID-CLS-R-Trn.csv')
test_orig = load_dataset(file_name='AWID-CLS-R-Tst.csv')

In [4]:
from classification_model.processing.feat_eng_categ import one_hot_encoder
ohe = one_hot_encoder(features='class')
ohe.fit(train_orig)
train = ohe.transform(train_orig)
test = ohe.transform(test_orig)

In [5]:
from classification_model.processing.feat_eng_categ import discrete_to_categ, one_hot_encoder, categ_missing_encoder, rare_label_encoder, label_encoder
from classification_model.processing.feat_eng_num import outlier_capping, ArbitraryNumberImputer
from classification_model.processing.feat_creation import feature_creation
from classification_model.processing.feat_selection import remove_constant, remove_quasi_constant, remove_duplicates, selected_drop_features
from sklearn.preprocessing import StandardScaler

In [6]:
eda_pipe = Pipeline([
                ('fc', feature_creation()),
                ('d2c', discrete_to_categ()),
                ('rce', rare_label_encoder(tol=0.001)),
                ('cme', categ_missing_encoder()),
               ])

In [7]:
train = eda_pipe.fit_transform(train_orig)

In [8]:
le = label_encoder()
train_label = le.fit_transform(X=train)

In [9]:
rc  = remove_constant()
rqc = remove_quasi_constant() 
rd  = remove_duplicates()

train_label = rc.fit_transform(train_label)
train_label = rqc.fit_transform(train_label)
train_label = rd.fit_transform(train_label)

In [10]:
columns_to_drop = [col for col in train.columns if col not in train_label.columns ]
train.drop(columns_to_drop, axis=1, inplace=True)

In [11]:
ohe = one_hot_encoder(features='class')
train = ohe.fit_transform(train)

In [12]:
train.head()

Unnamed: 0,frame.time_epoch,frame.time_delta,frame.time_relative,frame.len,radiotap.length,radiotap.mactime,radiotap.datarate,radiotap.channel.freq,radiotap.channel.type.cck,radiotap.dbm_antsignal,...,wlan.qos.tid,wlan.qos.eosp,wlan.qos.ack,wlan.qos.amsdupresent,wlan.qos.bit4,data.len,class_normal,class_injection,class_impersonation,class_flooding
0,0.0,0.024271,0.024271,185,26,2101623000.0,1.0,2437.0,1,-47.0,...,Rare,Rare,Rare,Rare,Rare,,1,0,0,0
1,0.001631,0.001631,0.025902,185,26,2101625000.0,1.0,2437.0,1,-64.0,...,Rare,Rare,Rare,Rare,Rare,,1,0,0,0
2,0.056956,0.055325,0.081227,159,26,2101680000.0,1.0,2437.0,1,-32.0,...,Rare,Rare,Rare,Rare,Rare,,1,0,0,0
3,0.057371,0.000415,0.081642,54,26,2101682000.0,48.0,2437.0,0,-21.0,...,Rare,Rare,Rare,Rare,Rare,,1,0,0,0
4,0.057376,5e-06,0.081647,40,26,2101682000.0,24.0,2437.0,0,-24.0,...,Rare,Rare,Rare,Rare,Rare,,1,0,0,0


In [13]:
def diagnostic_plots(df, variable):
    # function takes a dataframe (df) and
    # the variable of interest as arguments

    # define figure size
    plt.figure(figsize=(16, 4))

    # histogram
    plt.subplot(1, 3, 1)
    sns.distplot(df[variable], bins=30)
    plt.title('Histogram')

    # Q-Q plot
    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.ylabel('Variable quantiles')

    # boxplot
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')

    plt.show()

In [14]:
y = train[['classl_normal', 'class_injection', 'class_impersonation', 'class_flooding']]
train.drop(['classl_normal', 'class_injection', 'class_impersonation', 'class_flooding'], axis=1, inplace=True)

KeyError: "['classl_norma'] not in index"

In [None]:
from classification_model.processing.data_management import partition_features
NUMERIC, NUMERIC_NA, CATEG, CATEG_NA, DISCRETE, DISCRETE_NA = partition_features(train)


In [None]:
#### Numeric variables ####
for feat in NUMERIC:
    print( 'Feature:',feat)
    print('')
    print(train[feat].describe())
    print(' ')
    diagnostic_plots(train, feat)

In [None]:
for feat in CATEG:
    print( 'Feature:',feat)
    print('')
    print('Number of unique values:')
    print(train[feat].nunique())
    print('')
    print('Value distribution:')
    print((train[feat].value_counts().head(20)))
    print('')
    print('')

In [None]:
def h(content):
    display(HTML(content))
    
def timehist(df, tcol, target, col, target_first, clipping=9999999999999999, concat_df = False, odf = None):
    if concat_df == True:
        df = pd.concat([df, odf])
        
    title = target + ' Hist ' + col
    if( target_first==True):
        df[df[target] == 1].set_index(tcol)[col].clip(0, clipping).plot(style='.', figsize=(15, 3))
        df[df[target] == 0].set_index(tcol)[col].clip(0, clipping).plot(style='.', figsize=(15, 3))
        plt.title(title)
        plt.show()
    elif( target_first==False):
        df[df[target] == 0].set_index(tcol)[col].clip(0, clipping).plot(style='.', title= title, figsize=(15, 3))
        df[df[target] == 1].set_index(tcol)[col].clip(0, clipping).plot(style='.', title= title, figsize=(15, 3))
        plt.title(title)
        plt.show()

In [None]:
def _desc(data, col, label):
    d0 = data.describe().reset_index()
    d0.columns = [col, label]
    return d0.append({col:'unique values', label:data.unique().shape[0]}, ignore_index=True) \
             .append({col:'NaNs', label:data.isnull().sum()}, ignore_index=True) \
             .append({col:'NaNs share', label:np.round(data.isnull().sum() / data.shape[0], 4)}, ignore_index=True) \

def desc(df_train, col, target, include_test=False, df_test=None):
    d0 = _desc(df_train[col], col, 'Train')
    d1 = _desc(df_train.loc[df_train[target] == 1, col], col, 'Train normal')
    d2 = _desc(df_train.loc[df_train[target] == 0, col], col, 'Train not normal')
    if( include_test):
        d3 = _desc(df_test[col], col, 'Test')
        d4 = _desc(df_test.loc[df_test[target] == 1, col], col, 'Test normal')
        d5 = _desc(df_test.loc[df_test[target] == 0, col], col, 'Test not normal')
    if( include_test):
        dd = d0.merge(d1).merge(d2).merge(d3).merge(d4).merge(d5)
    else:
        dd = d0.merge(d1).merge(d2)
    display(dd)
    
    h('<b>Most popular values (NaN = -999):</b>')
    N = 10
    d0 = df_train[[target,col]].groupby(col)[target].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    d0 = d0.head(N)
    d0 = d0.rename({'size':'Count in train (desc)','mean':'Mean target train','sum':'Sum target train'}, axis=1)
    display(d0)
        
    d1 = df_test[[target,col]].groupby(col)[target].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    d1 = d1.head(N)
    d1 = d1.rename({'size':'Count in test (desc)','mean':'Mean target test','sum':'Sum target test'}, axis=1)
    display(d1)

In [None]:
def hist1(df,col):
    plt.figure(figsize=(15, 3))
    plt.hist(df[col], bins=70);
    plt.title('Train histogram: ' + col);
    plt.show()

In [None]:
def corr1(df,col):
    N = None #10000
    num_vars = [f for f in train.columns if train[f].dtype != 'object']
    trx = df.head(N) if N is not None else df.copy()
    corrs = trx[num_vars].corrwith(trx[col]).reset_index().sort_values(0, ascending=False).reset_index(drop=True).rename({'index':'Column',0:'Correlation with ' + col}, axis=1)
    h('<b>Most correlated values with ' + col + ':</b>')
    trx = pd.concat([corrs.head(6), corrs.dropna().tail(5)])
    def linkx(val):
        return '<a href="#c_{}">{}</a>'.format(val, val)
    trx['Column'] = trx['Column'].apply(linkx)
    h(trx.to_html(escape=False))

In [None]:
def numeric(df_input, tcol_input, target_input, col_input, target_first_input, df_test, include_test):
    timehist(df=df_input, tcol=tcol_input, target=target_input, col=col_input, target_first=target_first_input, odf=df_test, concat_df=include_test)
    hist1(df_input,col_input)
    desc(df_input, col_input, target_input, include_test, df_test)
    corr1(df_input,col_input)

In [None]:
def categorical(df, col, target, df_test, include_test):
    desc(df, col, target, include_test, df_test)

In [None]:
def proc(df, tcol, target, col, target_first, df_test, include_test):
    if col not in ['isFraud','TransactionDT']:
        h('<h3 id="c_' + col + '">' + col + '</h3>' + '<a style="font-size:11px" href="#home">(Jump to top)</a>')
        categorical(df, col, target, df_test, include_test) if train[col].dtype == 'object' else numeric(df, tcol, target, col, target_first, df_test, include_test)

In [None]:
columns = list(train.columns)
for x in ['frame.time_epoch', 'class_normal', 'class_injection', 'class_impersonation', 'class_flooding']:
    columns.remove(x)

In [None]:
for col in columns:
    proc(df=train, tcol='frame.time_epoch', target='class_normal', col=col, target_first=True, df_test=test, include_test=True)

In [None]:
def h(content):
    display(HTML(content))
    
def timehist(df, tcol, target, col, target_first, clipping=9999999999999999, concat_df = False, odf = None):
    if concat_df == True:
        df = pd.concat([df, odf])
        
    title = target + ' Hist ' + col
    if( target_first==True):
        df[df[target] == 1].set_index(tcol)[col].clip(0, clipping).plot(style='.', figsize=(15, 3))
        df[df[target] == 0].set_index(tcol)[col].clip(0, clipping).plot(style='.', figsize=(15, 3))
        plt.title(title)
        plt.show()
    elif( target_first==False):
        df[df[target] == 0].set_index(tcol)[col].clip(0, clipping).plot(style='.', title= title, figsize=(15, 3))
        df[df[target] == 1].set_index(tcol)[col].clip(0, clipping).plot(style='.', title= title, figsize=(15, 3))
        plt.title(title)
        plt.show()

def _desc(data, col, label):
    d0 = data.describe().reset_index()
    d0.columns = [col, label]
    return d0.append({col:'unique values', label:data.unique().shape[0]}, ignore_index=True) \
             .append({col:'NaNs', label:data.isnull().sum()}, ignore_index=True) \
             .append({col:'NaNs share', label:np.round(data.isnull().sum() / data.shape[0], 4)}, ignore_index=True) \

def desc(df_train, col, target, include_test=False, df_test=None):
    d0 = _desc(df_train[col], col, 'Train')
    d1 = _desc(df_train.loc[df_train[target] == 1, col], col, 'Train normal')
    d2 = _desc(df_train.loc[df_train[target] == 0, col], col, 'Train not normal')
    if( include_test):
        d3 = _desc(df_test[col], col, 'Test')
        d4 = _desc(df_test.loc[df_test[target] == 1, col], col, 'Test normal')
        d5 = _desc(df_test.loc[df_test[target] == 0, col], col, 'Test not normal')
    if( include_test):
        dd = d0.merge(d1).merge(d2).merge(d3).merge(d4).merge(d5)
    else:
        dd = d0.merge(d1).merge(d2)
    display(dd)
    
    h('<b>Most popular values (NaN = -999):</b>')
    N = 10
    d0 = df_train[[target,col]].groupby(col)[target].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    d0 = d0.head(N)
    d0 = d0.rename({'size':'Count in train (desc)','mean':'Mean target train','sum':'Sum target train'}, axis=1)
    display(d0)
        
    d1 = df_test[[target,col]].groupby(col)[target].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    d1 = d1.head(N)
    d1 = d1.rename({'size':'Count in test (desc)','mean':'Mean target test','sum':'Sum target test'}, axis=1)
    display(d1)

def hist1(df,col):
    plt.figure(figsize=(15, 3))
    plt.hist(df[col], bins=70);
    plt.title('Train histogram: ' + col);
    plt.show()

def corr1(df,col):
    N = None #10000
    num_vars = [f for f in train.columns if train[f].dtype != 'object']
    trx = df.head(N) if N is not None else df.copy()
    corrs = trx[num_vars].corrwith(trx[col]).reset_index().sort_values(0, ascending=False).reset_index(drop=True).rename({'index':'Column',0:'Correlation with ' + col}, axis=1)
    h('<b>Most correlated values with ' + col + ':</b>')
    trx = pd.concat([corrs.head(6), corrs.dropna().tail(5)])
    def linkx(val):
        return '<a href="#c_{}">{}</a>'.format(val, val)
    trx['Column'] = trx['Column'].apply(linkx)
    h(trx.to_html(escape=False))

def numeric(df_input, tcol_input, target_input, col_input, target_first_input, df_test, include_test):
    timehist(df=df_input, tcol=tcol_input, target=target_input, col=col_input, target_first=target_first_input, odf=df_test, concat_df=include_test)
    hist1(df_input,col_input)
    desc(df_input, col_input, target_input, include_test, df_test)
    corr1(df_input,col_input)

def categorical(df, col, target, df_test, include_test):
    desc(df, col, target, include_test, df_test)

def proc(df, tcol, target, col, target_first, df_test, include_test):
    if col not in ['isFraud','TransactionDT']:
        h('<h3 id="c_' + col + '">' + col + '</h3>' + '<a style="font-size:11px" href="#home">(Jump to top)</a>')
        categorical(df, col, target, df_test, include_test) if train[col].dtype == 'object' else numeric(df, tcol, target, col, target_first, df_test, include_test)

columns = list(train.columns)
for x in ['frame.time_epoch', 'class_normal', 'class_injection', 'class_impersonation', 'class_flooding']:
    columns.remove(x)

for col in columns:
    proc(df=train, tcol='frame.time_epoch', target='class_normal', col=col, target_first=True, df_test=test, include_test=True)def h(content):
    display(HTML(content))
    
def timehist_normal(col):
    N =  9999999999999999 # clip trans amount for better view
    train[train['class_normal'] == 1].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    train[train['class_normal'] == 0].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    plt.title('class_normal')
    plt.show()
    
def timehist_injection(col):
    N =  9999999999999999 # clip trans amount for better view
    train[train['class_injection'] == 0].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    train[train['class_injection'] == 1].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    plt.title('class_injection')
    plt.show()
    
def timehist_impersonation(col):
    N =  9999999999999999 # clip trans amount for better view
    train[train['class_impersonation'] == 0].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    train[train['class_impersonation'] == 1].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    plt.title('class_impersonation')
    plt.show()
    
def timehist_flooding(col):
    N =  9999999999999999 # clip trans amount for better view
    train[train['class_flooding'] == 0].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    train[train['class_flooding'] == 1].set_index('frame.time_epoch')[col].clip(0, N).plot(style='.', title='Hist ' + col, figsize=(15, 3))
    plt.title('class_flooding')
    plt.show()

def _desc(data, col, label):
    d0 = data.describe().reset_index()
    d0.columns = [col, label]
    return d0.append({col:'unique values', label:data.unique().shape[0]}, ignore_index=True) \
             .append({col:'NaNs', label:data.isnull().sum()}, ignore_index=True) \
             .append({col:'NaNs share', label:np.round(data.isnull().sum() / data.shape[0], 4)}, ignore_index=True) \

def desc_normal(col):
    d0 = _desc(train[col], col, 'Train')
    d1 = _desc(train.loc[train['class_normal'] == 1, col], col, 'Train normal')
    d2 = _desc(train.loc[train['class_normal'] == 0, col], col, 'Train not normal')
    dd = d0.merge(d1).merge(d2)
    display(dd)
    
    h('<b>Most popular values (NaN = -999):</b>')
    N = 10
    d0 = train[['class_normal',col]].fillna(-999).groupby(col)['class_normal'].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    dd = d0.head(N)
    dd = dd.rename({'size':'Count in train (desc)','mean':'Mean normal target','sum':'Sum target','TransactionID':'Count in test'}, axis=1)
    display(dd)

    h('<b>Biggest normal sum values in train (NaN = -999):</b>')
    dd = d0.sort_values('sum', ascending=False).reset_index(drop=True).head(N).merge(d1, how='left', on=col)
    dd = dd.rename({'size':'Count in train','mean':'Mean target','sum':'Sum target (desc)','TransactionID':'Count in test'}, axis=1)
    display(dd)
    
def desc_injection(col):
    d0 = _desc(train[col], col, 'Train')
    d1 = _desc(train.loc[train['class_injection'] == 1, col], col, 'Train injection')
    d2 = _desc(train.loc[train['class_injection'] == 0, col], col, 'Train not injection')
    dd = d0.merge(d1).merge(d2)
    display(dd)
    
    h('<b>Most popular values (NaN = -999):</b>')
    N = 10
    d0 = train[['class_injection',col]].fillna(-999).groupby(col)['class_injection'].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    dd = d0.head(N)
    dd = dd.rename({'size':'Count in train (desc)','mean':'Mean target','sum':'Sum target','TransactionID':'Count in test'}, axis=1)
    display(dd)

    h('<b>Biggest fraud sum values in train (NaN = -999):</b>')
    dd = d0.sort_values('sum', ascending=False).reset_index(drop=True).head(N).merge(d1, how='left', on=col)
    dd = dd.rename({'size':'Count in train','mean':'Mean target','sum':'Sum target (desc)','TransactionID':'Count in test'}, axis=1)
    display(dd)
    
def desc_impersonation(col):
    d0 = _desc(train[col], col, 'Train')
    d1 = _desc(train.loc[train['class_impersonation'] == 1, col], col, 'Train impersonatin')
    d2 = _desc(train.loc[train['class_impersonation'] == 0, col], col, 'Train not impersonation')
    dd = d0.merge(d1).merge(d2)
    display(dd)
    
    h('<b>Most popular values (NaN = -999):</b>')
    N = 10
    d0 = train[['class_impersonation',col]].fillna(-999).groupby(col)['class_impersonation'].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    dd = d0.head(N)
    dd = dd.rename({'size':'Count in train (desc)','mean':'Mean target','sum':'Sum target','TransactionID':'Count in test'}, axis=1)
    display(dd)

    h('<b>Biggest fraud sum values in train (NaN = -999):</b>')
    dd = d0.sort_values('sum', ascending=False).reset_index(drop=True).head(N).merge(d1, how='left', on=col)
    dd = dd.rename({'size':'Count in train','mean':'Mean target','sum':'Sum target (desc)','TransactionID':'Count in test'}, axis=1)
    display(dd)
    
def desc_flooding(col):
    d0 = _desc(train[col], col, 'Train')
    d1 = _desc(train.loc[train['class_flooding'] == 1, col], col, 'Train impersonatin')
    d2 = _desc(train.loc[train['class_flooding'] == 0, col], col, 'Train not impersonation')
    dd = d0.merge(d1).merge(d2)
    display(dd)
    
    h('<b>Most popular values (NaN = -999):</b>')
    N = 10
    d0 = train[['class_flooding',col]].fillna(-999).groupby(col)['class_flooding'].agg(['size','mean','sum']).reset_index().sort_values('size', ascending=False).reset_index(drop=True)
    dd = d0.head(N)
    dd = dd.rename({'size':'Count in train (desc)','mean':'Mean target','sum':'Sum target','TransactionID':'Count in test'}, axis=1)
    display(dd)

    h('<b>Biggest fraud sum values in train (NaN = -999):</b>')
    dd = d0.sort_values('sum', ascending=False).reset_index(drop=True).head(N).merge(d1, how='left', on=col)
    dd = dd.rename({'size':'Count in train','mean':'Mean target','sum':'Sum target (desc)','TransactionID':'Count in test'}, axis=1)
    display(dd)

def hist1(col):
    plt.figure(figsize=(15, 3))
    plt.hist(train[col], bins=70);
    plt.title('Train histogram: ' + col);
    plt.show()

def corr1(col):
    N = None #10000
    num_vars = [f for f in train.columns if train[f].dtype != 'object']
    trx = train.head(N) if N is not None else tr.copy()
    corrs = trx[num_vars].corrwith(trx[col]).reset_index().sort_values(0, ascending=False).reset_index(drop=True).rename({'index':'Column',0:'Correlation with ' + col}, axis=1)
    h('<b>Most correlated values with ' + col + ':</b>')
    trx = pd.concat([corrs.head(6), corrs.dropna().tail(5)])
    def linkx(val):
        return '<a href="#c_{}">{}</a>'.format(val, val) if val in included_cols else val
    trx['Column'] = trx['Column'].apply(linkx)
    h(trx.to_html(escape=False))
    
def numeric(col):
    timehist_normal(col)
    timehist_injection(col)
    timehist_impersonation(col)
    timehist_flooding(col)
    hist1(col)
    desc_normal(col)
    desc_injection(col)
    desc_impersonation(col)
    desc_flooding(col)
    corr1(col)
    
def categorical(col):
    desc_normal(col)
    desc_injection(col)
    desc_impersonation(col)
    desc_flooding(col)
    
def proc(col):
    if col not in ['isFraud','TransactionDT']:
        h('<h3 id="c_' + col + '">' + col + '</h3>' + '<a style="font-size:11px" href="#home">(Jump to top)</a>')
        categorical(col) if train[col].dtype == 'object' else numeric(col)

In [None]:
columns = list(train.columns)
drop_list = ['class_normal', 'class_injection', 'class_impersonation', 'class_flooding', 'frame.time_epoch' ]
columns = [ val for val in columns if val not in drop_list ]
for col in columns:
    proc(col)