In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [42]:
columns = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
        "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", 
        "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
        "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
        "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
        "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
        "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]


# if the name of the data file is "kddcup.data" then replace the code below with "kddcup.data"
df = pd.read_csv("kddcup.data.corrected", sep=",", names=columns, index_col=None)


In [43]:
df.shape

(4898431, 42)

In [44]:
df = df[df["service"] == "http"]
df = df.drop("service", axis=1)
columns.remove("service")

In [45]:
df.shape

(623091, 41)

In [46]:
df["label"].value_counts()

label
normal.       619046
back.           2203
neptune.        1801
portsweep.        16
ipsweep.          13
satan.             7
phf.               4
nmap.              1
Name: count, dtype: int64

In [47]:
df.head(5)

Unnamed: 0,duration,protocol_type,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,SF,215,45076,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,SF,162,4528,0,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,SF,236,1228,0,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,SF,233,2032,0,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,SF,239,486,0,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [48]:
for col in df.columns:
    if df[col].dtype == "object":
        encoded = LabelEncoder()
        encoded.fit(df[col])
        df[col] = encoded.transform(df[col])

In [49]:
df.head(5)

Unnamed: 0,duration,protocol_type,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,0,9,215,45076,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,0,0,9,162,4528,0,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4
2,0,0,9,236,1228,0,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,4
3,0,0,9,233,2032,0,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,4
4,0,0,9,239,486,0,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623091 entries, 0 to 4898430
Data columns (total 41 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     623091 non-null  int64  
 1   protocol_type                623091 non-null  int32  
 2   flag                         623091 non-null  int32  
 3   src_bytes                    623091 non-null  int64  
 4   dst_bytes                    623091 non-null  int64  
 5   land                         623091 non-null  int64  
 6   wrong_fragment               623091 non-null  int64  
 7   urgent                       623091 non-null  int64  
 8   hot                          623091 non-null  int64  
 9   num_failed_logins            623091 non-null  int64  
 10  logged_in                    623091 non-null  int64  
 11  num_compromised              623091 non-null  int64  
 12  root_shell                   623091 non-null  int64  
 13  su_

In [51]:
for f in range(0, 3):
    df = df.iloc[np.random.permutation(len(df))]
    
df2 = df[:500000]
labels = df2["label"]
df_validate = df[500000:]

In [52]:
df2 = df2.drop("label", axis=1)

In [53]:
x_train, x_test, y_train, y_test = train_test_split(df2, labels, test_size = 0.2, random_state = 42)

df_val_label = df_validate["label"]
df_val_train = df_validate.drop("label", axis=1)

x_val, y_val = df_val_train, df_val_label

In [54]:
print("Shapes:\nx_train:%s\ny_train:%s\n" % (x_train.shape, y_train.shape))
print("x_test:%s\ny_test:%s\n" % (x_test.shape, y_test.shape))
print("x_val:%s\ny_val:%s\n" % (x_val.shape, y_val.shape))

Shapes:
x_train:(400000, 40)
y_train:(400000,)

x_test:(100000, 40)
y_test:(100000,)

x_val:(123091, 40)
y_val:(123091,)



# Random Forest Classifier
Q2. (50 points) In this question, you will perform supervised anomaly detection for IoT network intrusion detection.
Unlike the unsupervised anomaly detection in Q1 which was only able to differentiate between normal and anomalous
data, a supervised anomaly detector can also predict the class to which an anomaly belongs to. The dataset we will
use is the same as in Q1. Starting from the notebook iot-intrusion-rf.ipynb, fill out the missing code to create and use
a Random Forest Classifier on the dataset, for anomaly detection. You should explore different hyperparameters for
the classifier to achieve the best performance. Your score will depend on the highest recall value achieved by your
model on the test set. 

In [55]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier() # Didn't need to adjust hyperparameters. This seemed to do the trick.


In [56]:
model.fit(x_train, y_train)

In [57]:
y_pred = model.predict(x_test)

In [58]:
from sklearn import metrics

print("Accuracy =", metrics.accuracy_score(y_test, y_pred))
print("Confusion Matrix =\n", metrics.confusion_matrix(y_test, y_pred, labels=None, 
                                              sample_weight=None))
print("Recall =", metrics.recall_score(y_test, y_pred, labels=None, 
                                             pos_label=1, average='weighted', 
                                             sample_weight=None))
print("Classification Report =\n", metrics.classification_report(y_test, y_pred, 
                                                                 labels=None, 
                                                                 target_names=None, 
                                                                 sample_weight=None, 
                                                                 digits=2, 
                                                                 output_dict=False))

Accuracy = 0.99999
Confusion Matrix =
 [[  364     0     0     0     0]
 [    0     2     0     0     0]
 [    0     0   319     0     0]
 [    1     0     0 99312     0]
 [    0     0     0     0     2]]
Recall = 0.99999
Classification Report =
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       364
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00       319
           4       1.00      1.00      1.00     99313
           7       1.00      1.00      1.00         2

    accuracy                           1.00    100000
   macro avg       1.00      1.00      1.00    100000
weighted avg       1.00      1.00      1.00    100000

