In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import glob

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to keep the row order of df_frag -- and every result that
# depends on row position, such as a seeded train/test split -- reproducible.
df_frag = pd.concat(
    map(pd.read_csv, sorted(glob.glob('data/csv_fragmentedV3/*.csv')))
)

In [None]:
# Load the single reference capture that the fragment frame is merged into later.
df = pd.read_csv(
    filepath_or_buffer='data/test/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv',
)

In [None]:
# Column labels of both frames, then the set relations between them:
# only_frag_cols -> columns present only in the fragment frame,
# common_cols    -> columns shared by both frames.
df_cols, frag_cols = df.columns, df_frag.columns

only_frag_cols = frag_cols.difference(df_cols)
common_cols = df_cols.intersection(frag_cols)

In [None]:
# Discard the columns that exist only in the fragment frame.
df_frag = df_frag.drop(columns=only_frag_cols)

In [None]:
# Rows tagged 'No Label' in the fragment frame are relabelled as 'Bot'.
df_frag['Label'] = df_frag['Label'].replace(to_replace='No Label', value='Bot')

In [None]:
# Inspect the fragment frame; the notebook renders the last expression of a cell.
df_frag

In [None]:
# Stack the fragment rows under the reference rows and renumber the index.
# NOTE: re-running this cell appends df_frag again, because df is overwritten.
df = pd.concat(
    objs=[df, df_frag],
    axis=0,
    ignore_index=True,
    sort=False,
)

In [None]:
# The Timestamp column is removed from the combined frame.
df = df.drop(columns='Timestamp')

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

In [None]:
# List the distinct class labels present in the combined frame.
df.Label.unique()

In [None]:
# Fit an integer encoder on the label column; the fitted encoder is reused
# below to transform the labels and is stored alongside the data splits.
le = LabelEncoder()
le.fit(df['Label'])

In [None]:
def reduce_anomalies(df, pct_anomalies=.01, label_col=None,
                     normal_label='BENIGN', random_state=None):
    """Down-sample the anomalous rows of ``df`` relative to its normal rows.

    Every row whose label equals ``normal_label`` is kept. From the remaining
    (anomalous) rows a random subset of ``int(pct_anomalies * num_normal)``
    rows is kept, capped at the number of anomalous rows actually available.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a label column.
    pct_anomalies : float, default 0.01
        Number of anomalies to keep, expressed as a fraction of the number of
        normal rows.
    label_col : str, optional
        Name of the label column. When omitted, ``'label'`` is used if the
        frame has it (the name this function originally hard-coded);
        otherwise ``'Label'``, the name used by the rest of this notebook.
    normal_label : str, default 'BENIGN'
        Label value that marks a normal (non-anomalous) row.
        NOTE(review): confirm the spelling used in the data (for example with
        ``df['Label'].unique()``) before relying on the default.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted, NumPy's
        global random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The normal rows followed by the sampled anomalous rows; the original
        index labels are preserved.

    Raises
    ------
    KeyError
        If the label column is not present in ``df``.
    """
    if label_col is None:
        label_col = 'label' if 'label' in df.columns else 'Label'

    # Work with positions rather than index labels so the selection is correct
    # for any index (non-default, non-unique, string-valued, ...).
    is_anomaly = (df[label_col] != normal_label).to_numpy()
    anomaly_positions = np.flatnonzero(is_anomaly)

    num_normal = len(df) - len(anomaly_positions)
    # Sampling without replacement cannot return more rows than exist.
    num_anomalies = min(int(pct_anomalies * num_normal), len(anomaly_positions))

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Cast so the positions have an integer dtype even for an empty selection.
    kept_positions = np.asarray(
        rng.choice(anomaly_positions, size=num_anomalies, replace=False),
        dtype=np.intp,
    )

    normal_data = df[~is_anomaly].copy()
    anomalous_data = df.iloc[kept_positions].copy()
    return pd.concat([normal_data, anomalous_data], axis=0)
# reduce_anomalies(df)

In [None]:
# Separate the target from the features: keep a copy of the string labels,
# encode them as integers with the fitted encoder, then drop the column.
# NOTE: not re-runnable on its own -- 'Label' no longer exists in df afterwards.
labels = df.Label.copy()
int_labels = le.transform(labels)
df = df.drop(columns=['Label'])

In [None]:
# Shape of the encoded label array (one entry per row of df is expected).
int_labels.shape

In [None]:
# Shape of the feature frame now that the Label column has been dropped.
df.shape

In [None]:
# Remove infinities and NaNs
def remove_infs(df, labels):
    """Drop every row of ``df`` that contains NaN or +/-infinity.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to clean.
    labels : array-like
        Labels aligned row-for-row with ``df``; must support indexing with a
        boolean mask (e.g. a NumPy array).

    Returns
    -------
    tuple
        ``(df_clean, labels_clean)`` -- the rows of ``df`` without any NaN or
        infinite value, and the labels filtered with the same mask.
    """
    assert isinstance(df, pd.DataFrame)
    # The axis must be given by keyword: the positional form ``any(1)`` is
    # rejected by pandas >= 2.0.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep], labels[indices_to_keep]

In [None]:
# Record the row count, filter out rows with NaN / infinite values (labels are
# filtered with the same mask), then report how much of the data survived.
before_removal = df.shape[0]
df, int_labels = remove_infs(df, int_labels)
print(
    f'Length before NaN drop: {before_removal}, after NaN drop: {len(df)}\n '
    f'The df is now {len(df)/before_removal} of its original size'
)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    int_labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Bundle the four splits together with the fitted label encoder so that the
# integer labels can be mapped back to their names after loading.
preprocessed_data = dict(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    le=le,
)

In [None]:
# Persist the bundle to disk for the downstream notebooks/scripts.
with open('data/preprocessed_data.pickle', mode='wb') as file:
    pickle.dump(obj=preprocessed_data, file=file)