## Machine Learning in Cybersecurity and Privacy

### Challenge 8 - Pseudo Labelling using multiple supervised ML models

<div style="text-align: right"> By Smit Doshi (001475186) </div>

### Importing Required Libraries

In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# machine learning imports
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
data = pd.read_csv('android_traffic.csv')
data.head()

Unnamed: 0,tcp_packets,dist_port_tcp,external_ips,vulume_bytes,udp_packets,tcp_urg_packet,source_app_packets,remote_app_packets,source_app_bytes,remote_app_bytes,source_app_packets.1,dns_query_times,type
0,36,6,3,3911,0,0,39,33,5100,4140,39,3,benign
1,117,0,9,23514,0,0,128,107,26248,24358,128,11,benign
2,196,0,6,24151,0,0,205,214,163887,24867,205,9,benign
3,6,0,1,889,0,0,7,6,819,975,7,1,benign
4,6,0,1,882,0,0,7,6,819,968,7,1,benign


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7845 entries, 0 to 7844
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tcp_packets           7845 non-null   int64 
 1   dist_port_tcp         7845 non-null   int64 
 2   external_ips          7845 non-null   int64 
 3   vulume_bytes          7845 non-null   int64 
 4   udp_packets           7845 non-null   int64 
 5   tcp_urg_packet        7845 non-null   int64 
 6   source_app_packets    7845 non-null   int64 
 7   remote_app_packets    7845 non-null   int64 
 8   source_app_bytes      7845 non-null   int64 
 9   remote_app_bytes      7845 non-null   int64 
 10  source_app_packets.1  7845 non-null   int64 
 11  dns_query_times       7845 non-null   int64 
 12  type                  7845 non-null   object
dtypes: int64(12), object(1)
memory usage: 796.9+ KB


In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
scaler = StandardScaler()
scaler.fit(data.drop('type',axis=1))

StandardScaler()

In [6]:
scaled_features = scaler.transform(data.drop('type',axis=1))

In [7]:
data.columns[:-1]

Index(['tcp_packets', 'dist_port_tcp', 'external_ips', 'vulume_bytes',
       'udp_packets', 'tcp_urg_packet', 'source_app_packets',
       'remote_app_packets', 'source_app_bytes', 'remote_app_bytes',
       'source_app_packets.1', 'dns_query_times'],
      dtype='object')

In [8]:
df_feat = pd.DataFrame(scaled_features,columns=data.columns[:-1])

In [9]:
df_feat

Unnamed: 0,tcp_packets,dist_port_tcp,external_ips,vulume_bytes,udp_packets,tcp_urg_packet,source_app_packets,remote_app_packets,source_app_bytes,remote_app_bytes,source_app_packets.1,dns_query_times
0,-0.143441,-0.033652,0.086046,-0.153587,-0.040693,-0.015969,-0.146231,-0.151404,-0.140898,-0.155173,-0.146231,-0.100476
1,-0.039311,-0.149817,2.138859,0.084743,-0.040693,-0.015969,-0.031980,-0.082119,-0.125803,0.090261,-0.031980,0.322821
2,0.062249,-0.149817,1.112453,0.092488,-0.040693,-0.015969,0.066867,0.018064,-0.027559,0.096440,0.066867,0.216997
3,-0.182008,-0.149817,-0.598225,-0.190328,-0.040693,-0.015969,-0.187310,-0.176684,-0.143954,-0.193594,-0.187310,-0.206300
4,-0.182008,-0.149817,-0.598225,-0.190414,-0.040693,-0.015969,-0.187310,-0.176684,-0.143954,-0.193679,-0.187310,-0.206300
...,...,...,...,...,...,...,...,...,...,...,...,...
7840,-0.189721,-0.149817,-0.940360,-0.201137,-0.040693,-0.015969,-0.193729,-0.180429,-0.144355,-0.203694,-0.193729,-0.153388
7841,-0.184579,-0.072374,-0.598225,-0.197538,-0.040693,-0.015969,-0.189878,-0.181365,-0.144477,-0.200793,-0.189878,-0.206300
7842,-0.189721,-0.149817,-0.940360,-0.201137,-0.040693,-0.015969,-0.193729,-0.180429,-0.144355,-0.203694,-0.193729,-0.153388
7843,-0.189721,-0.149817,-0.940360,-0.201137,-0.040693,-0.015969,-0.193729,-0.180429,-0.144355,-0.203694,-0.193729,-0.153388


In [10]:
for column in data.columns:
    print('unique values in {} column = {}'.format(column,data[column].nunique()))

unique values in tcp_packets column = 760
unique values in dist_port_tcp column = 169
unique values in external_ips column = 31
unique values in vulume_bytes column = 4639
unique values in udp_packets column = 14
unique values in tcp_urg_packet column = 2
unique values in source_app_packets column = 776
unique values in remote_app_packets column = 851
unique values in source_app_bytes column = 4815
unique values in remote_app_bytes column = 4874
unique values in source_app_packets.1 column = 776
unique values in dns_query_times column = 63
unique values in type column = 2


In [11]:
X = df_feat
#X = data.drop('type',axis=1)
y = data['type']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, test_size=0.20, random_state=42)

In [12]:
from sklearn.model_selection import GridSearchCV

In [25]:
param_grid ={'C':[10,100,1000,10000,100000],'gamma':[10,1,0.1,0.01,0.001,0.0001]}

In [26]:
grid = GridSearchCV(SVC(),param_grid,verbose=1,cv=5,n_jobs=-1)

In [27]:
%%time
grid.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.4min finished


CPU times: user 44.8 s, sys: 723 ms, total: 45.5 s
Wall time: 5min 7s


GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [10, 100, 1000, 10000, 100000],
                         'gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001]},
             verbose=1)

In [28]:
# GridSearchCV will tell us which are the best parameters
grid.best_params_

{'C': 100000, 'gamma': 10}

In [29]:
# Predicting Using Best Parameters
g_pred = grid.predict(X_test)

In [30]:
print(confusion_matrix(y_test,g_pred))
print()
print(classification_report(y_test,g_pred))

[[822 116]
 [111 520]]

              precision    recall  f1-score   support

      benign       0.88      0.88      0.88       938
   malicious       0.82      0.82      0.82       631

    accuracy                           0.86      1569
   macro avg       0.85      0.85      0.85      1569
weighted avg       0.86      0.86      0.86      1569

