In [222]:
from scipy.io import arff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler    

In [223]:
data = arff.loadarff('KDDTrain+.arff')
data_test = arff.loadarff("KDDTest+.arff")

train_df = pd.DataFrame(data[0])
test_df = pd.DataFrame(data_test[0])
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'normal'


In [224]:
train_df['class'].value_counts(sort=True)


b'normal'     67343
b'anomaly'    58630
Name: class, dtype: int64

In [225]:
train_df['protocol_type'].groupby(train_df['class']).value_counts(sort=True)

class       protocol_type
b'anomaly'  b'tcp'           49089
            b'icmp'           6982
            b'udp'            2559
b'normal'   b'tcp'           53600
            b'udp'           12434
            b'icmp'           1309
Name: protocol_type, dtype: int64

In [226]:
train_df['su_attempted'].groupby(train_df['class']).value_counts(sort=True)

class       su_attempted
b'anomaly'  0.0             58629
            1.0                 1
b'normal'   0.0             67264
            2.0                59
            1.0                20
Name: su_attempted, dtype: int64

In [227]:
train_df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,...,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,45566.74,19779.11,0.022687,0.000111,0.204409,0.001222,0.27925,0.001342,0.001103,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
std,2604.51531,5870331.0,4021269.0,0.25353,0.014366,2.149968,0.045239,23.942042,0.036603,0.045154,...,99.206213,110.702741,0.448949,0.188922,0.308997,0.112564,0.444784,0.445669,0.306557,0.319459
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.0,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0
max,42908.0,1379964000.0,1309937000.0,3.0,3.0,77.0,5.0,7479.0,1.0,2.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [228]:
# colums that are categorical and not binary yet: protocol_type (column 2), service (column 3), flag (column 4).
# explore categorical features
print('Training set:')
for col_name in train_df.columns:
    if train_df[col_name].dtypes == 'object' :
        unique_cat = len(train_df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

#see how distributed the feature service is, it is evenly distributed and therefore we need to make dummies for all.
print()
print('Distribution of categories in service:')
print(train_df['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'land' has 2 categories
Feature 'logged_in' has 2 categories
Feature 'is_host_login' has 2 categories
Feature 'is_guest_login' has 2 categories
Feature 'class' has 2 categories

Distribution of categories in service:
b'http'        40338
b'private'     21853
b'domain_u'     9043
b'smtp'         7313
b'ftp_data'     6860
Name: service, dtype: int64


In [229]:
# Test set
print('Test set:')
for col_name in test_df.columns:
    if test_df[col_name].dtypes == 'object' :
        unique_cat = len(test_df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'land' has 2 categories
Feature 'logged_in' has 2 categories
Feature 'is_host_login' has 2 categories
Feature 'is_guest_login' has 2 categories
Feature 'class' has 2 categories


In [230]:
print(test_df.shape)
trainservice=train_df['service'].tolist()
testservice= test_df['service'].tolist()
difference=list(set(trainservice) - set(testservice))
# string = 'service_'
difference

print(test_df['service'])
# for col in difference:
#     print(test_df[col])
#     test_df[col] = 0

test_df.shape
# difference=[string + x for x in difference]
# difference

(22544, 42)
0         b'private'
1         b'private'
2        b'ftp_data'
3           b'eco_i'
4          b'telnet'
            ...     
22539        b'smtp'
22540        b'http'
22541        b'http'
22542    b'domain_u'
22543      b'sunrpc'
Name: service, Length: 22544, dtype: object


(22544, 42)

In [231]:
#Preprocessing
#Remove duplicates
# print(train_df.shape)
# print(test_df.shape)
# train_df = train_df.drop_duplicates()
# test_df = test_df.drop_duplicates()
# print(train_df.shape)
# print(test_df.shape)

# Get output (Normal/Anomaly class)
y = train_df.iloc[:, [41]].values
# Encode string values into binary
oe = OrdinalEncoder()
y_train = oe.fit_transform(y).flatten()

# Get input (diff. params.)
x = train_df.iloc[:, 30:40].values
# feature scalling to achive more accurate predictions
st_x = StandardScaler()
x_train = st_x.fit_transform(x)

# Prepare test setup
y = test_df.iloc[:, [41]].values
y_test = oe.fit_transform(y).flatten()
x = test_df.iloc[:, 30:40].values
x_test = st_x.fit_transform(x)

In [232]:
#Train data using logistic regression

logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

#Predict result for test data
predictions = logistic_regression.predict(x_test)
print(predictions)

# Get accuracy of the method
score = logistic_regression.score(x_test, y_test)
print("Accuracy: " + str(score))

# df['protocol_type'] = df['protocol_type'].replace('udp', '0')


[0. 0. 1. ... 1. 1. 0.]
Accuracy: 0.7955110007097232
