In [1]:
import math
import timeit
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, fbeta_score

### Importing Data 

In [2]:
df= pd.read_csv('data10.csv')

In [3]:
df.head()

Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,category
0,119646801,17,16,597,768,293,0,35.12,97.06,384,...,20,45580.5,32287.848,131880,32165,9922286.0,281712.56,10014561,9030908,0
1,375,2,1,38,0,38,0,19.0,26.88,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,281,3,0,46,0,46,0,15.336,26.56,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,99999555,2,2,16,0,8,8,8.0,0.0,0,...,32,1.0,0.0,1,1,99999550.0,0.0,99999553,99999553,4
4,10070086,4,4,379,408,379,0,94.75,189.5,408,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


In [4]:
# memory usage before changing Data Types
df.memory_usage(deep=True).sum() / (1024**2) 

544.290584564209

### Import Data Types Dataset 

In [5]:
df2=pd.read_csv('2018_best_dtypes.csv')

In [6]:
df2.iloc[-1, 0] = "category"  #name of last column changed to category corresponding to main dataset inplace of Label

### Changing Data Types  

In [7]:
for index, row in df2.iterrows():
    feature_name = row['Column']  
    data_type = row['Dtype']   
    df[feature_name] = df[feature_name].astype(data_type)

In [8]:
# memory usage after changing datatypes
df.memory_usage(deep=True).sum() / (1024**2)

245.52245235443115

We can see a significant reduction in memory usage

### Splitting in Features and Category

In [9]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]

### Case 1: Equal no. of benign and attack records after over-sampling, proportion of different attack class records remain same

In [10]:
Counter(y)

Counter({0: 901347, 4: 19657, 3: 77595, 1: 14453, 5: 11382, 2: 9410, 6: 87})

In [11]:
sampling_strategy = round((y.value_counts()[0]/y.value_counts()[1:].sum())*y.value_counts()[1:]).to_dict()
sampling_strategy = {key: int(value) for key, value in sampling_strategy.items()}
sampling_strategy

{3: 527515, 4: 133634, 1: 98256, 5: 77378, 2: 63972, 6: 591}

#### Oversampling Attack Records

In [12]:
rus = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

X_undersampled, y_undersampled = rus.fit_resample(X, list(y))

In [13]:
y_undersampled = pd.Series(y_undersampled)

In [14]:
Counter(y_undersampled)

Counter({0: 901347,
         4: 133634,
         3: 527515,
         1: 98256,
         5: 77378,
         2: 63972,
         6: 591})

In [15]:
y_undersampled.value_counts()[1:].sum()

901346

In [16]:
sampling_strategy = {0: 901347, 1: 150224, 2: 150224, 3: 150224, 4: 150224, 5: 150224, 6: 150224}

rus = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

X_resampled, y_resampled = rus.fit_resample(X, list(y))

In [17]:
print("Class distribution after oversampling:")
y_resampled = pd.Series(y_resampled)
y_resampled.value_counts()[1:].sum()

Class distribution after oversampling:


901344

### Splitting Data in Training and Testing

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, stratify=y_resampled)

### Value Counts of Category after Splitting

In [19]:
y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

We can see that stratification has led to maintained proportion between categories

### Normalization 

In [20]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Applying PCA 

In [21]:
pca = PCA(n_components=0.95, random_state=0)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [22]:
variance_ratios = pca.explained_variance_ratio_

In [23]:
variance_ratios

array([0.41705121, 0.17588499, 0.11867451, 0.06434618, 0.05190822,
       0.04990597, 0.03180781, 0.02284729, 0.01898347])

variance %s of 1st and 2nd principle components are - 38.6% and 20.8%

In [24]:
# converting the arrays given by PCA to DataFrame
X_train_pca = pd.DataFrame(X_train_pca)
X_test_pca=pd.DataFrame(X_test_pca)

### Applying Random Forest Classifier 

In [25]:
model = RandomForestClassifier()

In [26]:
# removing non-integer values from labels 
y_train=y_train.astype('int')
y_test=y_test.astype('int')

##### Calculating Time 

In [27]:
# training time for running model
training_time = timeit.timeit(lambda: model.fit(X_train_pca, y_train), number=1) 

In [28]:
start_time = timeit.default_timer()
y_pred = model.predict(X_test_pca)
testing_time = timeit.default_timer() - start_time

In [29]:
print("Average Training Time:", training_time) 
print("Testing Time:", testing_time)

Average Training Time: 1540.7864620000037
Testing Time: 11.156162100000074


### Scoring Metrics

##### Confusion Matrix 

In [30]:
confusion_matrix(y_test, y_pred)

array([[266152,     21,      1,     15,     11,   4063,    142],
       [     0,  45067,      0,      0,      0,      0,      0],
       [     0,      0,  45067,      0,      0,      0,      0],
       [    14,      0,      0,  45053,      0,      0,      0],
       [     0,      0,      0,      0,  45067,      0,      0],
       [  1634,     15,      0,      0,      0,  43418,      0],
       [     0,      0,      0,      0,      0,      0,  45068]],
      dtype=int64)

##### Accuracy

In [31]:
accuracy_score(y_test, y_pred)

0.9890608127098711

##### Precision

In [32]:
precision_score(y_test, y_pred, average='weighted')

0.9894140167601874

##### Recall

In [33]:
recall_score(y_test, y_pred, average='weighted')

0.9890608127098711

##### F-2 Score 

In [34]:
fbeta_score(y_test, y_pred, beta=2, average='weighted')

0.9890893080516204

### Case 2: Equal no. of benign and attacks records, no. of records of different attack classes must be equal

In [35]:
# samples of each to make the sum of attack records equal to Bengin
count = math.ceil(y.value_counts()[0]/6)
count

150225

In [36]:
count_of_benign = count*6

In [37]:
sampling_strategy = {0: count_of_benign}
for i in range(1, 7): sampling_strategy[i] = count

#### Oversampling Every Label to Match the Sum of Bengin and Attack Records

In [38]:
rus = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

X_resampled, y_resampled = rus.fit_resample(X, list(y))

In [39]:
y_resampled = pd.Series(y_resampled)

In [40]:
y_resampled.value_counts()

0    901350
4    150225
3    150225
1    150225
5    150225
2    150225
6    150225
Name: count, dtype: int64

### Splitting Data in Training and Testing

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, stratify=y_resampled)

### Value Counts of Category after Splitting

In [42]:
y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

We can see that stratification has led to maintained proportion between categories

### Normalization 

In [43]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Applying PCA 

In [44]:
pca = PCA(n_components=0.95, random_state=0)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [45]:
variance_ratios = pca.explained_variance_ratio_

In [46]:
variance_ratios

array([0.41870387, 0.17631678, 0.11914515, 0.06333655, 0.05120626,
       0.04995581, 0.03170566, 0.0226773 , 0.01892634])

variance %s of 1st and 2nd principle components are - 38.6% and 20.8%

In [47]:
# converting the arrays given by PCA to DataFrame
X_train_pca = pd.DataFrame(X_train_pca)
X_test_pca=pd.DataFrame(X_test_pca)

### Applying Random Forest Classifier 

In [48]:
model = RandomForestClassifier()

In [49]:
# removing non-integer values from labels 
y_train=y_train.astype('int')
y_test=y_test.astype('int')

##### Calculating Time 

In [50]:
# training time for running model
training_time = timeit.timeit(lambda: model.fit(X_train_pca, y_train), number=1) 

In [51]:
start_time = timeit.default_timer()
y_pred = model.predict(X_test_pca)
testing_time = timeit.default_timer() - start_time

In [52]:
print("Average Training Time:", training_time) 
print("Testing Time:", testing_time)

Average Training Time: 662.6514717999962
Testing Time: 9.570823800007929


### Scoring Metrics

##### Confusion Matrix 

In [53]:
confusion_matrix(y_test, y_pred)

array([[265988,     21,      0,     19,     19,   4203,    155],
       [     0,  45061,      0,      0,      0,      6,      0],
       [     0,      0,  45067,      0,      0,      0,      0],
       [     7,      0,      0,  45061,      0,      0,      0],
       [     0,      0,      0,      0,  45067,      0,      0],
       [  1669,      9,      0,      0,      0,  43390,      0],
       [     0,      0,      0,      0,      0,      0,  45068]],
      dtype=int64)

##### Accuracy

In [54]:
accuracy_score(y_test, y_pred)

0.9887058301436734

##### Precision

In [55]:
precision_score(y_test, y_pred, average='weighted')

0.9890889666884527

##### Recall

In [56]:
recall_score(y_test, y_pred, average='weighted')

0.9887058301436734

##### F-2 Score 

In [57]:
fbeta_score(y_test, y_pred, beta=2, average='weighted')

0.988736199916296