# Network Traffic Cop (Case Study 5: DS7331)

SMU Student collaboration:

 - Allen Miller
 - Vijay Kaniti
 - Venkata 'MG' Vanga

In [1]:
#pip install vowpalwabbit

In [2]:
import pandas as pd
import numpy as np

from vowpalwabbit.sklearn_vw import VWClassifier
from vowpalwabbit import pyvw
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
pwd

'Z:\\MG_MASters_SMU\\ds7333\\CS#5'

In [34]:
data = pd.read_csv('log2.csv')

In [35]:
data

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18
...,...,...,...,...,...,...,...,...,...,...,...,...
65527,63691,80,13237,80,allow,314,192,122,6,15,4,2
65528,50964,80,13485,80,allow,4680740,67312,4613428,4675,77,985,3690
65529,54871,445,0,0,drop,70,70,0,1,0,1,0
65530,54870,445,0,0,drop,70,70,0,1,0,1,0


In [36]:
for i in data.iterrows():
    print(i)
    break

(0, Source Port             57222
Destination Port           53
NAT Source Port         54587
NAT Destination Port       53
Action                  allow
Bytes                     177
Bytes Sent                 94
Bytes Received             83
Packets                     2
Elapsed Time (sec)         30
pkts_sent                   1
pkts_received               1
Name: 0, dtype: object)


In [37]:
data.dtypes

Source Port              int64
Destination Port         int64
NAT Source Port          int64
NAT Destination Port     int64
Action                  object
Bytes                    int64
Bytes Sent               int64
Bytes Received           int64
Packets                  int64
Elapsed Time (sec)       int64
pkts_sent                int64
pkts_received            int64
dtype: object

In [38]:
data['Action'] = data['Action'].astype('category')
data.dtypes

Source Port                int64
Destination Port           int64
NAT Source Port            int64
NAT Destination Port       int64
Action                  category
Bytes                      int64
Bytes Sent                 int64
Bytes Received             int64
Packets                    int64
Elapsed Time (sec)         int64
pkts_sent                  int64
pkts_received              int64
dtype: object

In [39]:
data['Action'] = data['Action'].cat.codes
data['Action']

0        0
1        0
2        0
3        0
4        0
        ..
65527    0
65528    0
65529    2
65530    2
65531    2
Name: Action, Length: 65532, dtype: int8

In [40]:
data.Action.unique()

array([0, 2, 1, 3], dtype=int8)

In [41]:
data['Action'] = data['Action']+1

data.Action.unique()    

array([1, 3, 2, 4], dtype=int8)

## Vowpal Wabbit

In [42]:

with open('test.vw', 'w') as f:
          for i in data.iterrows():
              label = i[1]['Action']
              index = i[0]
              features = ''
              for col in i[1].index:
                  if 'Port' in col:
                      features = features + col.replace(' ', '_')+'_'+str(i[1][col])+' '
                  elif 'label' not in col:
                      features = features + col.replace(' ', '_')+':'+str(i[1][col])+' '
              line = f'{label} {index}| {features}\n'
              f.write(line)
              
              

In [43]:
vw_data = []
for i in data.iterrows():
    label = i[1]['Action']
    index = i[0]
    features = ''
    for col in i[1].index:
        if 'Port' in col:
            features = features + col.replace(' ', '_')+'_'+str(i[1][col])+' '
        elif 'label' not in col:
            features = features + col.replace(' ', '_')+':'+str(i[1][col])+' '
    line = f'{label} {index}| {features}\n'
    vw_data.append(line)

In [44]:
vw_data[0]

'1 0| Source_Port_57222 Destination_Port_53 NAT_Source_Port_54587 NAT_Destination_Port_53 Action:1 Bytes:177 Bytes_Sent:94 Bytes_Received:83 Packets:2 Elapsed_Time_(sec):30 pkts_sent:1 pkts_received:1 \n'

In [45]:
preds = pd.read_csv('port.preds', header = None, sep = ' ')
preds.head()

Unnamed: 0,0,1
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4


In [46]:
accuracy_score(data['Action'],preds.loc[:,0])

0.9993896111823232

## SVM

In [111]:
from sklearn.svm import LinearSVC
model = LinearSVC(C=1.0, loss = 'squared_hinge', dual = False, random_state = 42)

In [93]:
y = data['Action']

In [94]:
y.shape

(65532,)

In [95]:
X = data.copy()
X = X.drop(['Action'], axis = 1)

In [96]:
X.shape

(65532, 11)

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = .33, shuffle = True)

In [112]:
model.fit(X_train, y_train)

LinearSVC(dual=False, random_state=42)

In [113]:
svm_preds = model.predict(X_test)

In [114]:
accuracy_score(svm_preds, y_test)

0.9766947193193378

## SGD

In [26]:
#Create Cross Validation Procedure for Classification task
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, random_state=1234, shuffle=True)


In [27]:
from sklearn.preprocessing import StandardScaler
df_1 = data.copy()
# separate independent and dependent features
if 'Action' in data:
    y = data['Action'].values # get the target variable
    del data['Action'] # get rid of the class label
    X = data.values # use everything else to predict
    
# Normalize the data
scl = StandardScaler()
scaled_X = scl.fit_transform(X)

In [28]:
#SVM_SGD 
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV


SVM = SGDClassifier()
# define parameters
loss_SVM = ['modified_huber', 'hinge']
penalty_SVM = ['l2']
alpha_SVM = [0.01, 0.1, 1, 5, 10]
class_weight_SVM = ['balanced', None]
max_iter_SVM = [500, 1000, 1500]
random_state_SVM = [1234]

# define grid search
param_grid_SVM = dict(loss=loss_SVM, penalty=penalty_SVM, alpha=alpha_SVM,
                     class_weight=class_weight_SVM, max_iter=max_iter_SVM, random_state=random_state_SVM)

grid_search_SVM = GridSearchCV(estimator=SVM, param_grid=param_grid_SVM, n_jobs=-1, cv=cv, 
                               scoring='accuracy',error_score=0, verbose=1)

In [29]:
%%time
grid_result_SVM = grid_search_SVM.fit(scaled_X, y)
# summarize results
print("Best: %f using %s" % (grid_result_SVM.best_score_, grid_result_SVM.best_params_))
means = grid_result_SVM.cv_results_['mean_test_score']
stds = grid_result_SVM.cv_results_['std_test_score']
params = grid_result_SVM.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   43.0s finished


Best: 0.947598 using {'alpha': 0.01, 'class_weight': None, 'loss': 'modified_huber', 'max_iter': 500, 'penalty': 'l2', 'random_state': 1234}
0.929668 (0.004740) with: {'alpha': 0.01, 'class_weight': 'balanced', 'loss': 'modified_huber', 'max_iter': 500, 'penalty': 'l2', 'random_state': 1234}
0.929668 (0.004740) with: {'alpha': 0.01, 'class_weight': 'balanced', 'loss': 'modified_huber', 'max_iter': 1000, 'penalty': 'l2', 'random_state': 1234}
0.929668 (0.004740) with: {'alpha': 0.01, 'class_weight': 'balanced', 'loss': 'modified_huber', 'max_iter': 1500, 'penalty': 'l2', 'random_state': 1234}
0.903223 (0.005859) with: {'alpha': 0.01, 'class_weight': 'balanced', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l2', 'random_state': 1234}
0.903223 (0.005859) with: {'alpha': 0.01, 'class_weight': 'balanced', 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'random_state': 1234}
0.903223 (0.005859) with: {'alpha': 0.01, 'class_weight': 'balanced', 'loss': 'hinge', 'max_iter': 1500, 'penalty'

In [30]:
# The GridSearch algorithm determined the following optimal parameters
best_Estimator_SVM =grid_result_SVM.best_estimator_
best_Estimator_SVM

SGDClassifier(alpha=0.01, loss='modified_huber', max_iter=500,
              random_state=1234)