In [1]:
import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt  
import seaborn as sns 
from scipy import stats
import pickle  
from prettytable import PrettyTable  
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

%matplotlib inline

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv
/kaggle/input/unsw-nb15/UNSW-NB15_1.csv
/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv
/kaggle/input/unsw-nb15/UNSW-NB15_LIST_EVENTS.csv
/kaggle/input/unsw-nb15/UNSW-NB15_4.csv
/kaggle/input/unsw-nb15/UNSW-NB15_3.csv
/kaggle/input/unsw-nb15/UNSW-NB15_2.csv
/kaggle/input/unsw-nb15/NUSW-NB15_features.csv


# Load Dataset UNSW-NB15

In [3]:
saved_dict = {}

In [4]:
dfs = []
for i in range(1,5):
 path = '../input/unsw-nb15/UNSW-NB15_1.csv'
 dfs.append(pd.read_csv(path.format(i), header = None))
all_data = pd.concat(dfs).reset_index(drop=True) 

In [5]:
all_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


In [6]:
df_col = pd.read_csv('https://cloudstor.aarnet.edu.au/plus/s/2DhnLGDdEECo4ys/download?path=%2FUNSW-NB15%20-%20CSV%20Files&files=NUSW-NB15_features.csv', encoding='ISO-8859-1')

URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [None]:
df_col.head()

In [None]:
df_col['Name'] = df_col['Name'].apply(lambda x: x.strip().replace(' ', '').lower())

In [None]:
# Renaming our dataframe with proper column names
all_data.columns = df_col['Name']

In [None]:
# Saving useful info, later this will be used to transform raw test data
saved_dict['columns'] = df_col['Name'][df_col['Name']!='label'].tolist()

In [None]:
del df_col

In [None]:
all_data.head()

In [None]:
all_data.shape

# Data cleaning and pre-processing

In [None]:
# Splitting data into train and test
# All the operation like cleaning, EDA and FE will be done on train data only

train, test = train_test_split(all_data, test_size=0.2, random_state=16)

In [None]:
# Deleting the concatenated dataframe, as we don't need that anymore
del all_data

In [None]:
print(train.shape,'\n',test.shape)

In [None]:
# checking for null values
train.isnull().sum()

In [None]:
plt.figure(figsize = (8,10))
sns.heatmap(train.corr().round(2).sort_values('label', ascending=False)[['label']],annot=True)
plt.title('Koefisien Korelasi variabel label dengan variabel lainnya \n', fontsize = 15)
plt.show()

# Filling Null values

In [None]:
train['attack_cat'].value_counts()

In [None]:
# We don't have "normal" values for "attack_cat", so we must fill Null values with "normal"
train['attack_cat'] = train.attack_cat.fillna(value='normal').apply(lambda x: x.strip().lower())

In [None]:
train['attack_cat'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(7,7))

plt.pie(x=train["label"].value_counts(), 
        colors=["seagreen","firebrick"], 
        labels=["Not Attack","attack"], 
        shadow = True,
        autopct="%1.2f%%",
        explode = (0, 0.1)
        )

plt.show()
print ('Total Attack : {} '.format(train.label.value_counts()[1]))
print ('Total not Attack : {} '.format(train.label.value_counts()[0]))

In [None]:
# There are no Null values in the train data anymore
train.isnull().sum().sum()

# Information about dataset

In [None]:
# Getting name of all the columns
train.columns

# Class distribution

In [None]:
train_0, train_1 = train['label'].value_counts()[0] / len(train.index), train['label'].value_counts()[1] / len(train.index)
test_0, test_1 = test['label'].value_counts()[0] / len(test.index), test['label'].value_counts()[1] / len(test.index)

print("In Train: there are {} % of class 0 and {} % of class 1".format(train_0, train_1))
print("In Test: there are {} % of class 0 and {} % of class 1".format(test_0, test_1))

In [None]:
# Plotting target label distribution
plt.figure()
plt.title("class distribution of train and test dataset")
train['label'].value_counts().plot(kind="bar", color='b',label="train")
test['label'].value_counts().plot(kind="bar", color='orange', label="test")
plt.xlabel("Class")
plt.ylabel("Count")
plt.legend()
plt.show()

data train dan latih terlihat tidak seimbang, Persentase poin milik kelas 1 (serangan) sangat sedikit di keduanya.

# Datatype information of the Features

In [None]:
# Check if any null value present in datasets
train.isnull().sum().sum(), test.isnull().sum().sum()  # No Null in train and test data is still raw

In [None]:
train.info()

In [None]:
# All the datatypes in our dataset
train.dtypes.value_counts()

In [None]:
# Categorical feature names
train.select_dtypes(exclude=np.number).columns

In [None]:
# In the research paper it was mentioned that, this is a numerical feature not a categorical
train['ct_ftp_cmd'].unique()

In [None]:
# Removing empty space and converting it to numerical
train['ct_ftp_cmd'] = train['ct_ftp_cmd'].replace(to_replace=' ', value=0).astype(int)

In [None]:
train['ct_ftp_cmd'].unique()

In [None]:
# Categorical feature names
train.select_dtypes(exclude=np.number).columns

In [None]:
# Numeric features names
train.select_dtypes(include=np.number).columns

# Pengamatan:

1. Dalam kumpulan data ini terutama ada 2 jenis tipe data yang ada
numerik
kategoris
2. Kolom kategoris: 'proto', 'service', 'state'

3.Dari makalah penelitian kami menemukan bahwa ada kolom biner juga- numerik tetapi biner: 'is_sm_ips_ports', 'is_ftp_login'

4. Kolom numerik : 'id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', ' sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat' 'pdm', 'dmean', 'trans_depth', 'respons_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_ltm_ ct_src_ltm', 'ct_srv_dst'
5. Kolom target: 'attack_cat', 'label'

In [None]:
# Info for test data transformation
saved_dict['binary_col'] = ['is_sm_ips_ports', 'is_ftp_login']

# Fixing values of the columns

In [None]:
# see if binary columns are really binary
for col in 'is_sm_ips_ports', 'is_ftp_login':
    print(train[col].value_counts())
    print()

In [None]:
train['is_ftp_login'] = np.where(train['is_ftp_login']>1, 1, train['is_ftp_login'])

In [None]:
train['is_ftp_login'].value_counts()

In [None]:
train['service'].value_counts()

In [None]:
# removing all the "-" and replacing those with "None"
train['service'] = train['service'].apply(lambda x:"None" if x=="-" else x)

In [None]:
train['service'].value_counts()

In [None]:
# In the research paper there are not 10 unique values
train['attack_cat'].nunique()

In [None]:
train['attack_cat'].value_counts()

# save new dataset

In [None]:
train.shape, test.shape

In [None]:
train.to_csv('/kaggle/input/train.csv', index=False)
test.to_csv('/kaggle/input/test.csv', index=False)

In [None]:
pickle.dump(saved_dict, open('/kaggle/input/final_ipynb', 'wb'))