In [None]:
# see http://icsdweb.aegean.gr/awid for more info

In [None]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential, load_model


In [None]:
# helper funcs
def class_distribution_bar(series, title, ax=None):
    """Draw a bar chart of the relative frequency of each value in the 'class' column.

    Parameters
    ----------
    series : object indexable by 'class' (the notebook passes the awid DataFrame)
        Source of the labels; only ``series['class']`` is read.
    title : str
        Title placed above the chart.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, pandas picks the axes itself, exactly as
        before this parameter existed.
    """
    frequencies = series['class'].value_counts(normalize=True)

    # Label the axes object that was actually drawn on instead of relying on
    # pyplot's notion of the "current" axes.
    ax = frequencies.plot(kind="bar", ax=ax)
    ax.set_xlabel("Class")
    ax.set_ylabel("Frequency")
    ax.set_title(title)
    
def correlated_features(corr, threshold=0.95):
    """Print every pair of distinct features whose correlation reaches `threshold`.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix (e.g. the result of ``DataFrame.corr()``).
    threshold : float, default 0.95
        Minimum correlation for a pair to be reported. The comparison is on the
        signed value, so strongly negative correlations are not reported.

    Returns
    -------
    list
        For each reported pair, the second feature of the pair, in ascending
        order of correlation. A feature appears once per pair it belongs to,
        so the list may contain repeats.
    """
    # Flatten the matrix into a Series keyed by (feature 1, feature 2),
    # sorted ascending; NaN entries are discarded.
    sorted_mat = corr.unstack().sort_values().dropna()

    # Unordered pairs already reported; frozenset makes (a, b) and (b, a) equal.
    seen = set()
    to_drop = []

    print(f"{'Feature 1':30} {'Feature 2':30} {'Correlation':10}")
    for (first, second), value in sorted_mat.items():
        if value < threshold:
            continue
        if first == second:
            # the diagonal: a feature compared with itself
            continue
        pair = frozenset((first, second))
        if pair in seen:
            continue
        seen.add(pair)
        print(f"{first:30} {second:30} {value:02}")
        to_drop.append(second)
    return to_drop
# simple function to test the accuracy of a model fitted on training data on our testing data
def get_test_accuracy_of(model, X=None, y=None):
    """Return the accuracy of an already fitted `model` on held-out data.

    Parameters
    ----------
    model : object with a ``predict`` method
        The fitted estimator to evaluate.
    X, y : optional
        Features and true labels to evaluate on. When omitted, the notebook
        globals ``X_test`` and ``y_test`` are used, which must exist by the
        time the function is called.

    Returns
    -------
    float
        The value returned by ``accuracy_score`` for the predictions.
    """
    features = X_test if X is None else X
    true_labels = y_test if y is None else y
    # accuracy_score takes (y_true, y_pred) in that order
    return accuracy_score(true_labels, model.predict(features))

In [None]:
# Column names for the AWID data set.
# Attribute reference: http://icsdweb.aegean.gr/awid/features.html
#
# The list is handed to pd.read_csv(..., header=None, names=features) when the
# training file is loaded, so entries are matched to CSV columns by position.
# The final entry, 'class', is the label column used as the response variable.
features = ['frame.interface_id',
 'frame.dlt',
 'frame.offset_shift',
 'frame.time_epoch',
 'frame.time_delta',
 'frame.time_delta_displayed',
 'frame.time_relative',
 'frame.len',
 'frame.cap_len',
 'frame.marked',
 'frame.ignored',
 'radiotap.version',
 'radiotap.pad',
 'radiotap.length',
 'radiotap.present.tsft',
 'radiotap.present.flags',
 'radiotap.present.rate',
 'radiotap.present.channel',
 'radiotap.present.fhss',
 'radiotap.present.dbm_antsignal',
 'radiotap.present.dbm_antnoise',
 'radiotap.present.lock_quality',
 'radiotap.present.tx_attenuation',
 'radiotap.present.db_tx_attenuation',
 'radiotap.present.dbm_tx_power',
 'radiotap.present.antenna',
 'radiotap.present.db_antsignal',
 'radiotap.present.db_antnoise',
 'radiotap.present.rxflags',
 'radiotap.present.xchannel',
 'radiotap.present.mcs',
 'radiotap.present.ampdu',
 'radiotap.present.vht',
 'radiotap.present.reserved',
 'radiotap.present.rtap_ns',
 'radiotap.present.vendor_ns',
 'radiotap.present.ext',
 'radiotap.mactime',
 'radiotap.flags.cfp',
 'radiotap.flags.preamble',
 'radiotap.flags.wep',
 'radiotap.flags.frag',
 'radiotap.flags.fcs',
 'radiotap.flags.datapad',
 'radiotap.flags.badfcs',
 'radiotap.flags.shortgi',
 'radiotap.datarate',
 'radiotap.channel.freq',
 'radiotap.channel.type.turbo',
 'radiotap.channel.type.cck',
 'radiotap.channel.type.ofdm',
 'radiotap.channel.type.2ghz',
 'radiotap.channel.type.5ghz',
 'radiotap.channel.type.passive',
 'radiotap.channel.type.dynamic',
 'radiotap.channel.type.gfsk',
 'radiotap.channel.type.gsm',
 'radiotap.channel.type.sturbo',
 'radiotap.channel.type.half',
 'radiotap.channel.type.quarter',
 'radiotap.dbm_antsignal',
 'radiotap.antenna',
 'radiotap.rxflags.badplcp',
 'wlan.fc.type_subtype',
 'wlan.fc.version',
 'wlan.fc.type',
 'wlan.fc.subtype',
 'wlan.fc.ds',
 'wlan.fc.frag',
 'wlan.fc.retry',
 'wlan.fc.pwrmgt',
 'wlan.fc.moredata',
 'wlan.fc.protected',
 'wlan.fc.order',
 'wlan.duration',
 'wlan.ra',
 'wlan.da',
 'wlan.ta',
 'wlan.sa',
 'wlan.bssid',
 'wlan.frag',
 'wlan.seq',
 'wlan.bar.type',
 'wlan.ba.control.ackpolicy',
 'wlan.ba.control.multitid',
 'wlan.ba.control.cbitmap',
 'wlan.bar.compressed.tidinfo',
 'wlan.ba.bm',
 'wlan.fcs_good',
 'wlan_mgt.fixed.capabilities.ess',
 'wlan_mgt.fixed.capabilities.ibss',
 'wlan_mgt.fixed.capabilities.cfpoll.ap',
 'wlan_mgt.fixed.capabilities.privacy',
 'wlan_mgt.fixed.capabilities.preamble',
 'wlan_mgt.fixed.capabilities.pbcc',
 'wlan_mgt.fixed.capabilities.agility',
 'wlan_mgt.fixed.capabilities.spec_man',
 'wlan_mgt.fixed.capabilities.short_slot_time',
 'wlan_mgt.fixed.capabilities.apsd',
 'wlan_mgt.fixed.capabilities.radio_measurement',
 'wlan_mgt.fixed.capabilities.dsss_ofdm',
 'wlan_mgt.fixed.capabilities.del_blk_ack',
 'wlan_mgt.fixed.capabilities.imm_blk_ack',
 'wlan_mgt.fixed.listen_ival',
 'wlan_mgt.fixed.current_ap',
 'wlan_mgt.fixed.status_code',
 'wlan_mgt.fixed.timestamp',
 'wlan_mgt.fixed.beacon',
 'wlan_mgt.fixed.aid',
 'wlan_mgt.fixed.reason_code',
 'wlan_mgt.fixed.auth.alg',
 'wlan_mgt.fixed.auth_seq',
 'wlan_mgt.fixed.category_code',
 'wlan_mgt.fixed.htact',
 'wlan_mgt.fixed.chanwidth',
 'wlan_mgt.fixed.fragment',
 'wlan_mgt.fixed.sequence',
 'wlan_mgt.tagged.all',
 'wlan_mgt.ssid',
 'wlan_mgt.ds.current_channel',
 'wlan_mgt.tim.dtim_count',
 'wlan_mgt.tim.dtim_period',
 'wlan_mgt.tim.bmapctl.multicast',
 'wlan_mgt.tim.bmapctl.offset',
 'wlan_mgt.country_info.environment',
 'wlan_mgt.rsn.version',
 'wlan_mgt.rsn.gcs.type',
 'wlan_mgt.rsn.pcs.count',
 'wlan_mgt.rsn.akms.count',
 'wlan_mgt.rsn.akms.type',
 'wlan_mgt.rsn.capabilities.preauth',
 'wlan_mgt.rsn.capabilities.no_pairwise',
 'wlan_mgt.rsn.capabilities.ptksa_replay_counter',
 'wlan_mgt.rsn.capabilities.gtksa_replay_counter',
 'wlan_mgt.rsn.capabilities.mfpr',
 'wlan_mgt.rsn.capabilities.mfpc',
 'wlan_mgt.rsn.capabilities.peerkey',
 'wlan_mgt.tcprep.trsmt_pow',
 'wlan_mgt.tcprep.link_mrg',
 'wlan.wep.iv',
 'wlan.wep.key',
 'wlan.wep.icv',
 'wlan.tkip.extiv',
 'wlan.ccmp.extiv',
 'wlan.qos.tid',
 'wlan.qos.priority',
 'wlan.qos.eosp',
 'wlan.qos.ack',
 'wlan.qos.amsdupresent',
 'wlan.qos.buf_state_indicated',
 'wlan.qos.bit4',
 'wlan.qos.txop_dur_req',
 'wlan.qos.buf_state_indicated1',
 'data.len',
 'class']

In [None]:
# import a training set
awid = pd.read_csv("data/AWID-CLS-R-Trn.csv", header=None, names=features)

# see the number of rows/columns
awid.shape

In [None]:
# they use ? as a null attribute. We will eventually have to replace them with None values
awid.head(n=9)

In [None]:
# see the distribution of response vars
awid['class'].value_counts(normalize=True)

In [None]:
# claims there are no null values because of the ?'s'
awid.isna().sum()

In [None]:
# replace the ? marks with None
awid.replace({"?": None}, inplace=True)

In [None]:
# Many missing pieces of data!
awid.isna().sum()

In [None]:
# goal is to remove columns that have over 50% of missing data
columns_with_mostly_null_data = awid.columns[awid.isnull().mean() >= 0.4]
# 72 columns are going to be affected!
columns_with_mostly_null_data.shape

In [None]:
# drop the columns with over half missing data
awid.drop(columns_with_mostly_null_data, axis=1, inplace=True)

In [None]:
awid.shape

In [None]:
# now drop the rows that have missing values
awid.dropna(inplace=True)  # drop rows with null data

In [None]:
# lost 456,169 rows
awid.shape

In [None]:
# doesn't affect our distribution too much

# 0.878763  is our null accuracy. Our model must be better than this number to be a contender
awid['class'].value_counts(normalize=True)

In [None]:
# only select numerical columns for our ML algorithms, there should be more..
awid.select_dtypes(['number']).shape

In [None]:
# transform all columns into numerical dtypes
for col in awid.columns:
    awid[col] = pd.to_numeric(awid[col], errors='ignore')

In [None]:
# that makes more sense
awid.select_dtypes(['number']).shape

In [None]:
# basic descroptive statistics
awid.describe()

In [None]:
# run correlation matrix and plot
f, ax = plt.subplots(figsize=(16, 12))

# Correlate the numeric columns only: the frame still holds string columns
# (at least 'class'), and DataFrame.corr() raises on those in pandas >= 2.0.
corr = awid.select_dtypes(['number']).corr()

# No mask is passed: the previous all-False mask hid nothing.
sns.heatmap(corr,
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

# print the highly correlated pairs and collect one feature from each
features_corr = correlated_features(corr)

In [None]:
# Features removed by hand.
# NOTE(review): presumably one member of each highly correlated pair printed by correlated_features — confirm.
features_to_drop = [
    "frame.cap_len",
    "frame.time_delta_displayed",
    "frame.time_epoch",
    "radiotap.mactime",
    "radiotap.datarate",
    "wlan.fc.protected",
]

print("Before drop:")
print(awid.shape)

awid = awid.drop(columns=features_to_drop)

print("After drop:")
print(awid.shape)

class_distribution_bar(awid, "Initial Class Distribution")

In [None]:


# Undersample every class to the size of the smallest one so the classes are balanced.
BALANCE_SEED = 2  # fixed seed so the balanced sample is reproducible
CLASS_LABELS = ['injection', 'impersonation', 'flooding', 'normal']

rows_by_class = {label: awid[awid['class'] == label] for label in CLASS_LABELS}

# Size of the smallest class. 'normal' is included so that sampling can never
# ask for more rows than a class holds.
samples = min(len(rows) for rows in rows_by_class.values())
assert samples > 0, "at least one class has no rows left after cleaning"

awid = pd.concat([rows.sample(n=samples, random_state=BALANCE_SEED)
                  for rows in rows_by_class.values()])

# Shuffle, and discard the old row labels: without drop=True they would come
# back as a numeric 'index' column and be picked up as a model feature.
awid = awid.sample(frac=1, random_state=BALANCE_SEED).reset_index(drop=True)

class_distribution_bar(awid, "Balanced Class Distribution")

In [None]:
print(awid.shape)
X, y = awid.select_dtypes(['number']), awid['class']

# Hold out 20% of the rows for evaluation, keeping the class proportions equal
# in both parts. The later cells read X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2
)

In [None]:
# do a basic naive bayes fitting
nb = GaussianNB()

# Fit on the training split only, so the rows used for scoring stay unseen.
nb.fit(X_train, y_train)

In [None]:
# basic metric, accuracy
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the naive bayes model on the testing data.
# get_test_accuracy_of is defined once, with the other helper funcs at the top of the notebook.
get_test_accuracy_of(nb)

In [None]:
lr = LogisticRegression()

# Fit on the training split only, so the rows used for scoring stay unseen.
lr.fit(X_train, y_train)

get_test_accuracy_of(lr)

In [None]:
tree = DecisionTreeClassifier()

# Fit on the training split only, so the rows used for scoring stay unseen.
tree.fit(X_train, y_train)

get_test_accuracy_of(tree)

In [None]:
# Ten features the fitted decision tree ranks highest by importance.
numeric_columns = awid.select_dtypes(['number']).columns
importance_table = pd.DataFrame({'feature': numeric_columns,
                                 'importance': tree.feature_importances_})
importance_table.sort_values('importance', ascending=False).head(10)

In [None]:
forest = RandomForestClassifier()

# Fit on the training split only, so the rows used for scoring stay unseen.
forest.fit(X_train, y_train)

get_test_accuracy_of(forest)

In [None]:
# Create a pipeline that will scale the numerical data and then feed the resulting data into a decision tree

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

preprocessing = Pipeline([
    ("scale", StandardScaler()),
])

pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("classifier", DecisionTreeClassifier())
])

# try varying levels of depth
params = {
    "classifier__max_depth": [None, 3, 5, 10], 
         }

# instantiate a gridsearch module
grid = GridSearchCV(pipeline, params)
# fit the module
grid.fit(X, y)

# test the best model
get_test_accuracy_of(grid.best_estimator_)

In [None]:
# try the same thing with a random forest

preprocessing = Pipeline([
    ("scale", StandardScaler()),
])

pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("classifier", RandomForestClassifier())
])

# try varying levels of depth
params = {
    "classifier__max_depth": [None, 3, 5, 10], 
         }

grid = GridSearchCV(pipeline, params)
grid.fit(X, y)
# best accuracy so far!
get_test_accuracy_of(grid.best_estimator_)

# Deep Learning Implementation

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

In [None]:
def _encode_labels(y_train, y_test):
    """Map non-numeric class labels to integer indices; numeric labels pass through untouched."""
    train = np.asarray(y_train)
    test = np.asarray(y_test)
    if np.issubdtype(train.dtype, np.number) and np.issubdtype(test.dtype, np.number):
        return y_train, y_test
    train, test = train.astype(str), test.astype(str)
    # np.unique returns the distinct labels sorted, so searchsorted yields each label's index
    classes = np.unique(np.concatenate([train, test]))
    return np.searchsorted(classes, train), np.searchsorted(classes, test)


def train_predict_and_save_model(model, model_name, X_train, y_train, X_test, y_test, epochs = 10, batch_size=32):
    """Fit `model`, print its accuracy on the test data and save it under `model_name`.

    Parameters
    ----------
    model : object with ``fit``, ``predict`` and ``save`` methods
        ``predict`` must return one row of per-class scores for each sample.
    model_name : str
        Path handed to ``model.save``.
    X_train, y_train : training features and labels
    X_test, y_test : evaluation features and labels
        Non-numeric labels are converted to integer indices (sorted label
        order) before fitting and scoring; numeric labels are used as given.
    epochs : int, default 10
        Number of epochs passed to ``model.fit``.
    batch_size : int, default 32
        Batch size passed to ``model.fit``.

    Returns
    -------
    float
        Accuracy of the predicted classes against `y_test`.
    """
    y_train, y_test = _encode_labels(y_train, y_test)

    # Honour the caller's settings (they used to be overridden by fixed 10 / 32).
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    # Predicted class = column with the highest score, for any number of classes.
    y_hat = np.argmax(model.predict(X_test), axis=1)

    accuracy = accuracy_score(y_test, y_hat)
    print(f"\nAccuracy:{accuracy}")

    model.save(model_name)
    return accuracy

In [None]:
# Epoch count passed as the `epochs` argument when the second model is trained further down.
epochs = 10

# Sequential Model

In [None]:
model = Sequential()
model.add(Dense(units=32, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=64, activation='relu'))
# Softmax makes the four outputs a probability distribution over the classes,
# which is what sparse categorical cross-entropy expects for single-label data.
model.add(Dense(units=4, activation='softmax'))

# The loss is given by name so only tensorflow.keras is involved; metrics must be a list.
# NOTE(review): this loss expects integer class indices, while y comes from the
# string 'class' column — confirm the labels are encoded before fitting.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

train_predict_and_save_model(model, "SequentialModel", X_train, y_train, X_test, y_test, epochs=2, batch_size=32)

# Sequential Model (mean squared error loss, SGD optimizer)

In [None]:
model = Sequential()
model.add(Dense(units=32, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=4, activation='sigmoid'))

# metrics must be a list; a bare string is rejected by newer Keras releases.
# NOTE(review): mean squared error compares the four outputs with the raw label
# values; one-hot encoded labels are probably intended here — confirm.
model.compile(loss="mean_squared_error", optimizer="sgd", metrics=['accuracy'])

train_predict_and_save_model(model, "SequentialActLinearModel", X_train, y_train, X_test, y_test, epochs, batch_size=128)

In [None]:
#Save model on file
model.save(model_name)

In [None]:
del model

In [None]:
#load model from file
model = load_model('SequentialTFModel')

In [None]:
# Correlation Matrix
f, ax1 = plt.subplots(figsize=(24, 10))

# awid holds the class-balanced sample by now, so that is what gets plotted.
# Only numeric columns are correlated: DataFrame.corr() raises on string
# columns such as 'class' in pandas >= 2.0.
corr = awid.select_dtypes(['number']).corr()
sns.heatmap(corr, cmap='coolwarm_r', ax=ax1)
ax1.set_title("Balanced Correlation Matrix", fontsize=14)
plt.show()