<a href="https://colab.research.google.com/github/Soumajith/NetworkTrafficAnalysis/blob/main/Intrusion_Detection_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each archive to disk.
CHUNK_SIZE = 40960
# Comma-separated entries of the form `<directory>:<percent-encoded download URL>`; the loop
# further down in this cell splits each entry on ':' and URL-decodes the second part.
# NOTE(review): this is a pre-signed URL (X-Goog-Date=20240804T154410Z, X-Goog-Expires=259200),
# so it has presumably expired — regenerate it before relying on this cell.
DATA_SOURCE_MAPPING = 'kdd-cup-1999-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F90131%2F208170%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240804%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240804T154410Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0aabbb8d90d96359465dc6bd630b884183b1868f774c1c39feb87bb6004d81c8f74325b2d5b12850102e388973e9a0cdadd5f2264335c01e6265c736fac744fe90c2f92bb91927653d20819c6463964acb954bf41058d0118733e824cc2045ccd67f3006a69ba7217e6485d4d48778e7502a17b01f34b4997d65c37a21655ec7e558e85bb8c2308722bfb70438a33c8fd76b54de4faa252f1d4935e9960cd8fa90d97e10732f7f90b73cd14567b5d443f0f3916b0f0f36afc3f7503527dd2306550912ae3cc768f81c03a5f9708405fbe1356a3314d74820da1651117e6e548d2d0667ec42d1a7ac2459bbbd039431c4b25ba74039e933c868043476d078bb1b'

# Root directories created by this cell: extracted datasets go under the first one.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured archive and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal ':' inside the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the size is unknown and the bar stays empty.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile reopens the file by name, so buffered bytes must be on disk first.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bound to a name other than `tarfile` so the module is not shadowed.
                # NOTE(review): extractall() trusts member paths; only use with trusted archives.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is listed first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


This is the first version of my public kernel (Intrusion Detection System).

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
print(os.listdir('../input/kdd-cup-1999-data'))

In [None]:
with open("../input/kdd-cup-1999-data/kddcup.names",'r') as f:
    print(f.read())

In [None]:
# Feature names in file order, one per line, separated by commas.
cols="""duration,
protocol_type,
service,
flag,
src_bytes,
dst_bytes,
land,
wrong_fragment,
urgent,
hot,
num_failed_logins,
logged_in,
num_compromised,
root_shell,
su_attempted,
num_root,
num_file_creations,
num_shells,
num_access_files,
num_outbound_cmds,
is_host_login,
is_guest_login,
count,
srv_count,
serror_rate,
srv_serror_rate,
rerror_rate,
srv_rerror_rate,
same_srv_rate,
diff_srv_rate,
srv_diff_host_rate,
dst_host_count,
dst_host_srv_count,
dst_host_same_srv_rate,
dst_host_diff_srv_rate,
dst_host_same_src_port_rate,
dst_host_srv_diff_host_rate,
dst_host_serror_rate,
dst_host_srv_serror_rate,
dst_host_rerror_rate,
dst_host_srv_rerror_rate"""

# Strip surrounding whitespace/newlines from each name and skip empty pieces.
columns = [name.strip() for name in cols.split(',') if name.strip()]

# The label column comes last.
columns += ['target']
print(len(columns))

In [None]:
# Print the attack-types listing file in full.
with open("../input/kdd-cup-1999-data/training_attack_types",'r') as attack_types_file:
    attack_types_text = attack_types_file.read()
print(attack_types_text)

In [None]:
# Individual attack labels grouped by the category they are mapped to.
_attacks_by_category = {
    'dos': ('back', 'land', 'neptune', 'pod', 'smurf', 'teardrop'),
    'u2r': ('buffer_overflow', 'loadmodule', 'perl', 'rootkit'),
    'r2l': ('ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
            'warezclient', 'warezmaster'),
    'probe': ('ipsweep', 'nmap', 'portsweep', 'satan'),
}

# Lookup from label to category: 'normal' first, then the attacks in alphabetical order.
attacks_types = {'normal': 'normal'}
attacks_types.update(sorted(
    (attack, category)
    for category, attacks in _attacks_by_category.items()
    for attack in attacks
))


READING DATASET

In [None]:
# Read the gzip-compressed 10% file; column names are supplied explicitly through `columns`.
path = "../input/kdd-cup-1999-data/kddcup.data_10_percent.gz"
df = pd.read_csv(path, names=columns)


def _category_of(raw_label):
    """Look up the attack category after dropping the label's final character."""
    return attacks_types[raw_label[:-1]]


#Adding Attack Type column
df['Attack Type'] = df['target'].map(_category_of)

df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of rows per raw `target` label.
df['target'].value_counts()

In [None]:
# Number of rows per derived `Attack Type` category.
df['Attack Type'].value_counts()

In [None]:
# Column dtypes; non-numeric columns are treated as categorical features later on.
df.dtypes

DATA PREPROCESSING

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
#Finding categorical features
# Public API instead of the private DataFrame._get_numeric_data(); bool is included because
# that helper also treats boolean columns as numeric.
num_cols = df.select_dtypes(include=['number', 'bool']).columns

# Non-numeric columns other than the two label columns, kept in frame order so the
# result is the same on every run (a set difference has no defined order).
_label_cols = ('target', 'Attack Type')
cate_cols = [col for col in df.columns if col not in num_cols and col not in _label_cols]

cate_cols

CATEGORICAL FEATURES DISTRIBUTION

In [None]:
#Visualization
def bar_graph(feature, data=None):
    """Draw a bar chart of how often each value of `feature` occurs.

    Parameters
    ----------
    feature : str
        Name of the column to summarise.
    data : DataFrame-like, optional
        Object to read the column from. When omitted, the notebook-level `df`
        is looked up at call time, which is what existing calls rely on.

    Returns
    -------
    None
        The chart is a side effect only; nothing is returned, so a cell ending
        in a call shows just the figure.
    """
    frame = df if data is None else data
    ax = frame[feature].value_counts().plot(kind="bar")
    # Label the chart so it can be read without the surrounding cell.
    ax.set_xlabel(feature)
    ax.set_ylabel("count")
    ax.set_title(f"Distribution of {feature}")

In [None]:
# Value counts of `protocol_type` as a bar chart.
bar_graph('protocol_type')

Protocol type: ICMP is the most frequent protocol in this dataset, followed by TCP; UDP accounts for roughly 20,000 records.

In [None]:
# A wide, short figure is opened first; bar_graph then draws into it.
# NOTE(review): presumably because `service` has many distinct values — confirm.
plt.figure(figsize=(15,3))
bar_graph('service')

In [None]:
# Value counts of `flag`.
bar_graph('flag')

In [None]:
# Value counts of `logged_in`.
bar_graph('logged_in')

logged_in (1 if successfully logged in; 0 otherwise): only about 70,000 records correspond to a successful login.

TARGET FEATURE DISTRIBUTION

In [None]:
# Value counts of the raw `target` label.
bar_graph('target')

Attack Type (the individual attacks grouped into categories; this is the label we will predict)

In [None]:
# Value counts of the derived `Attack Type` category.
bar_graph('Attack Type')

In [None]:
# Column names currently in the frame.
df.columns

DATA CORRELATION

In [None]:
# Drop columns with NaN. `axis` is passed by keyword: recent pandas releases reject the
# positional form `dropna('columns')`.
df = df.dropna(axis='columns')

# Keep columns where there are more than 1 unique values.
df = df[[col for col in df if df[col].nunique() > 1]]

# Correlate numeric columns only. The frame still holds string columns at this point,
# and recent pandas releases raise instead of silently skipping them.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(15,12))

sns.heatmap(corr)
plt.title('Feature correlation matrix')

plt.show()

In [None]:
# Feature pairs whose correlation is inspected before redundant features are dropped.
# One table replaces eighteen near-identical single-expression cells.
correlated_pairs = [
    ('num_root', 'num_compromised'),
    ('srv_serror_rate', 'serror_rate'),
    ('srv_count', 'count'),
    ('srv_rerror_rate', 'rerror_rate'),
    ('dst_host_same_srv_rate', 'dst_host_srv_count'),
    ('dst_host_srv_serror_rate', 'dst_host_serror_rate'),
    ('dst_host_srv_rerror_rate', 'dst_host_rerror_rate'),
    ('dst_host_same_srv_rate', 'same_srv_rate'),
    ('dst_host_srv_count', 'same_srv_rate'),
    ('dst_host_same_src_port_rate', 'srv_count'),
    ('dst_host_serror_rate', 'serror_rate'),
    ('dst_host_serror_rate', 'srv_serror_rate'),
    ('dst_host_srv_serror_rate', 'serror_rate'),
    ('dst_host_srv_serror_rate', 'srv_serror_rate'),
    ('dst_host_rerror_rate', 'rerror_rate'),
    ('dst_host_rerror_rate', 'srv_rerror_rate'),
    ('dst_host_srv_rerror_rate', 'rerror_rate'),
    ('dst_host_srv_rerror_rate', 'srv_rerror_rate'),
]

# One row per pair, in the order listed above.
pair_correlations = pd.DataFrame(
    [(left, right, df[left].corr(df[right])) for left, right in correlated_pairs],
    columns=['feature', 'compared_with', 'correlation'],
)
pair_correlations

In [None]:
# Features ignored for analysis because each is highly correlated with a feature that is kept.
# The partner feature and correlation value are the ones recorded by the original author.
redundant_features = [
    'num_root',                  # num_compromised     (0.9938277978738366)
    'srv_serror_rate',           # serror_rate         (0.9983615072725952)
    'srv_rerror_rate',           # rerror_rate         (0.9947309539817937)
    'dst_host_srv_serror_rate',  # srv_serror_rate     (0.9993041091850098)
    # NOTE(review): originally recorded as "rerror_rate"; presumably serror_rate was meant — confirm.
    'dst_host_serror_rate',      # rerror_rate (?)     (0.9869947924956001)
    'dst_host_rerror_rate',      # srv_rerror_rate     (0.9821663427308375)
    'dst_host_srv_rerror_rate',  # rerror_rate         (0.9851995540751249)
    'dst_host_same_srv_rate',    # dst_host_srv_count  (0.9736854572953938)
]

# One non-mutating drop instead of eight in-place ones: `df` was produced by column
# selection, and rebinding keeps the step a single, atomic operation.
df = df.drop(columns=redundant_features)

In [None]:
# First rows after the redundant features were removed.
df.head()

In [None]:
# (rows, columns) after the drop.
df.shape

In [None]:
# Remaining column names.
df.columns

In [None]:
# Standard deviation per numeric column, smallest first. `numeric_only=True` is required
# because the frame still contains string columns, which recent pandas releases refuse to reduce.
df_std = df.std(numeric_only=True).sort_values(ascending=True)
df_std

FEATURE MAPPING

In [None]:
# Distinct `protocol_type` values and their frequencies (the keys the mapping must cover).
df['protocol_type'].value_counts()

In [None]:
#protocol_type feature mapping
pmap = {'icmp':0,'tcp':1,'udp':2}
# Map only while the column still holds the original strings: re-running the cell on
# already-encoded integers would otherwise turn every value into NaN without any error.
if not pd.api.types.is_numeric_dtype(df['protocol_type']):
    df['protocol_type'] = df['protocol_type'].map(pmap)
# A value missing from pmap becomes NaN; fail here rather than inside a model later.
assert df['protocol_type'].notna().all(), "protocol_type contains values missing from pmap"

In [None]:
# Distinct `flag` values and their frequencies (the keys the mapping must cover).
df['flag'].value_counts()

In [None]:
#flag feature mapping
fmap = {'SF':0,'S0':1,'REJ':2,'RSTR':3,'RSTO':4,'SH':5 ,'S1':6 ,'S2':7,'RSTOS0':8,'S3':9 ,'OTH':10}
# Map only while the column still holds the original strings: re-running the cell on
# already-encoded integers would otherwise turn every value into NaN without any error.
if not pd.api.types.is_numeric_dtype(df['flag']):
    df['flag'] = df['flag'].map(fmap)
# A value missing from fmap becomes NaN; fail here rather than inside a model later.
assert df['flag'].notna().all(), "flag contains values missing from fmap"

In [None]:
# First rows after `protocol_type` and `flag` were mapped to integers.
df.head()

In [None]:
# Remove `service`, the one string feature that is not given an integer mapping.
# Rebinding instead of inplace=True avoids mutating a frame derived by column selection.
df = df.drop(columns='service')

In [None]:
# (rows, columns) after `service` was removed.
df.shape

In [None]:
# First rows of the frame that goes into modelling.
df.head()

In [None]:
# Final dtypes of the frame that goes into modelling.
df.dtypes

MODELLING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [None]:
# Drop the raw label; the grouped `Attack Type` is what gets predicted.
# errors='ignore' lets the cell be re-run after `target` is already gone.
df = df.drop(columns=['target'], errors='ignore')
print(df.shape)

# Target variable and train set
Y = df[['Attack Type']]
X = df.drop(columns=['Attack Type'])

# Split test and train data BEFORE scaling, so no statistic of the test rows
# (their min/max) leaks into the transformation applied to the training rows.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Fit the scaler on the training rows only, then apply the same transformation to the
# test rows. Both results are NumPy arrays; `X` itself stays an unscaled DataFrame.
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

GAUSSIAN NAIVE BAYES

In [None]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

In [None]:
model1 = GaussianNB()

In [None]:
start_time = time.time()
model1.fit(X_train, Y_train.values.ravel())
end_time = time.time()

In [None]:
print("Training time: ",end_time-start_time)

In [None]:
start_time = time.time()
Y_test_pred1 = model1.predict(X_test)
end_time = time.time()

In [None]:
print("Testing time: ",end_time-start_time)

In [None]:
print("Train score is:", model1.score(X_train, Y_train))
print("Test score is:",model1.score(X_test,Y_test))

DECISION TREE

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
model2 = DecisionTreeClassifier(criterion="entropy", max_depth = 4)

In [None]:
start_time = time.time()
model2.fit(X_train, Y_train.values.ravel())
end_time = time.time()

In [None]:
print("Training time: ",end_time-start_time)

In [None]:
start_time = time.time()
Y_test_pred2 = model2.predict(X_test)
end_time = time.time()

In [None]:
print("Testing time: ",end_time-start_time)

In [None]:
print("Train score is:", model2.score(X_train, Y_train))
print("Test score is:",model2.score(X_test,Y_test))

RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model3 = RandomForestClassifier(n_estimators=30)

In [None]:
start_time = time.time()
model3.fit(X_train, Y_train.values.ravel())
end_time = time.time()

In [None]:
print("Training time: ",end_time-start_time)

In [None]:
start_time = time.time()
Y_test_pred3 = model3.predict(X_test)
end_time = time.time()

In [None]:
print("Testing time: ",end_time-start_time)

In [None]:
print("Train score is:", model3.score(X_train, Y_train))
print("Test score is:",model3.score(X_test,Y_test))

SUPPORT VECTOR MACHINE

In [None]:
from sklearn.svm import SVC

In [None]:
model4 = SVC(gamma = 'scale')

In [None]:
start_time = time.time()
model4.fit(X_train, Y_train.values.ravel())
end_time = time.time()

In [None]:
print("Training time: ",end_time-start_time)

In [None]:
start_time = time.time()
Y_test_pred4 = model4.predict(X_test)
end_time = time.time()

In [None]:
print("Testing time: ",end_time-start_time)

In [None]:
print("Train score is:", model4.score(X_train, Y_train))
print("Test score is:", model4.score(X_test,Y_test))

LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model5 = LogisticRegression(max_iter=1200000)

In [None]:
start_time = time.time()
model5.fit(X_train, Y_train.values.ravel())
end_time = time.time()

In [None]:
print("Training time: ",end_time-start_time)

In [None]:
start_time = time.time()
Y_test_pred5 = model5.predict(X_test)
end_time = time.time()

In [None]:
print("Testing time: ",end_time-start_time)

In [None]:
print("Train score is:", model5.score(X_train, Y_train))
print("Test score is:",model5.score(X_test,Y_test))

GRADIENT BOOSTING CLASSIFIER

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
model6 = GradientBoostingClassifier(random_state=0)

In [None]:
start_time = time.time()
model6.fit(X_train, Y_train.values.ravel())
end_time = time.time()

In [None]:
print("Training time: ",end_time-start_time)

In [None]:
start_time = time.time()
Y_test_pred6 = model6.predict(X_test)
end_time = time.time()

In [None]:
print("Testing time: ",end_time-start_time)

In [None]:
print("Train score is:", model6.score(X_train, Y_train))
print("Test score is:", model6.score(X_test,Y_test))

Artificial Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
# NOTE(review): `keras.wrappers.scikit_learn` is, as far as I know, no longer shipped with recent
# Keras/TensorFlow releases (SciKeras is the usual replacement) — confirm against the installed version.
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
def fun(input_dim=30, n_classes=5):
    """Build and compile the small feed-forward classifier passed to KerasClassifier.

    Parameters
    ----------
    input_dim : int, default 30
        Number of input features per sample. The default is the value that was
        previously hard-coded.
    n_classes : int, default 5
        Width of the softmax output layer. The default covers the five classes
        normal, dos, probe, r2l and u2r.

    Returns
    -------
    The compiled Sequential model (adam optimizer, categorical cross-entropy
    loss, accuracy metric).
    """
    model = Sequential()

    # First layer: 30 output units; only this layer needs the input width.
    model.add(Dense(30,input_dim =input_dim,activation = 'relu',kernel_initializer='random_uniform'))

    # No input_dim here: in a Sequential model the previous layer's output is the input.
    # NOTE(review): this layer has a single sigmoid unit, so everything the softmax layer
    # sees is one scalar. Kept unchanged to preserve the reported results, but it looks
    # like an unintended bottleneck — confirm.
    model.add(Dense(1,activation='sigmoid',kernel_initializer='random_uniform'))

    # One softmax output per class.
    model.add(Dense(n_classes,activation='softmax'))

    # categorical_crossentropy is the loss for multi-class targets.
    model.compile(loss ='categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

    return model

In [None]:
#Since,the dataset is very big and we cannot fit complete data at once so we use batch size.
#This divides our data into batches each of size equal to batch_size.
#Now only this number of samples will be loaded into memory and processed.
#Once we are done with one batch it is flushed from memory and the next batch will be processed.
model7 = KerasClassifier(build_fn=fun,epochs=100,batch_size=64)

In [None]:
start = time.time()
model7.fit(X_train, Y_train.values.ravel())
end = time.time()

In [None]:
print('Training time')
print((end-start))

In [None]:
start_time = time.time()
Y_test_pred7 = model7.predict(X_test)
end_time = time.time()

In [None]:
print("Testing time: ",end_time-start_time)

In [None]:
start_time = time.time()
Y_train_pred7 = model7.predict(X_train)
end_time = time.time()

In [None]:
accuracy_score(Y_train,Y_train_pred7)

In [None]:
accuracy_score(Y_test,Y_test_pred7)

TRAINING ACCURACY

In [None]:
names = ['NB','DT','RF','SVM','LR','GB','ANN']
values = [87.951,99.058,99.997,99.875,99.352,99.793,99.914]
f = plt.figure(figsize=(15,3),num=10)
plt.subplot(131)
plt.ylim(80,102)
plt.bar(names,values)

In [None]:
f.savefig('training_accuracy_figure.png',bbox_inches='tight')

TESTING ACCURACY

In [None]:
names = ['NB','DT','RF','SVM','LR','GB','ANN']
values = [87.903,99.052,99.969,99.879,99.352,99.771,99.886]
f = plt.figure(figsize=(15,3),num=10)
plt.subplot(131)
plt.ylim(80,102)
plt.bar(names,values)

In [None]:
f.savefig('test_accuracy_figure.png',bbox_inches='tight')

TRAINING TIME

In [None]:
names = ['NB','DT','RF','SVM','LR','GB','ANN']
values = [1.04721,1.50483,11.45332,126.96016,56.67286,446.69099,1211.54094]
f = plt.figure(figsize=(15,3),num=10)
plt.subplot(131)
plt.bar(names,values)

In [None]:
f.savefig('train_time_figure.png',bbox_inches='tight')

TESTING TIME

In [None]:
names = ['NB','DT','RF','SVM','LR','GB','ANN']
values = [0.79089,0.10471,0.60961,32.72654,0.02198,1.41416,1.72521]
f = plt.figure(figsize=(15,3),num=10)
plt.subplot(131)
plt.bar(names,values)

In [None]:
f.savefig('test_time_figure.png',bbox_inches='tight')

DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Standardize the data, fitting the scaler on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply DBSCAN for outlier detection; label -1 marks a point that belongs to no cluster.
# NOTE(review): DBSCAN over the whole training split may need a lot of memory and time —
# consider sub-sampling if this cell does not finish.
dbscan = DBSCAN(eps=0.5, min_samples=5)
train_labels = dbscan.fit_predict(X_train_scaled)

# DBSCAN has no predict(): the test split is clustered independently with the same
# settings, using a second estimator so the training fit above is not overwritten.
test_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X_test_scaled)

# Boolean outlier flag per row, plus the row positions of the outliers.
train_is_outlier = train_labels == -1
test_is_outlier = test_labels == -1
train_outliers = np.flatnonzero(train_is_outlier)
test_outliers = np.flatnonzero(test_is_outlier)

# Score "outlier" as a prediction of "not normal traffic". Both sides are boolean arrays
# with one entry per row, which is what accuracy_score needs.
train_is_attack = (Y_train['Attack Type'] != 'normal').to_numpy()
test_is_attack = (Y_test['Attack Type'] != 'normal').to_numpy()
train_accuracy = accuracy_score(train_is_attack, train_is_outlier)
test_accuracy = accuracy_score(test_is_attack, test_is_outlier)

print("Outliers in training data:", train_outliers)
print("Outliers in test data:", test_outliers)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# NOTE(review): redundant re-import; pyplot is already imported as `plt` near the top of the notebook.
import matplotlib.pyplot as plt
# Hard-coded accuracies for four models.
# NOTE(review): no KNN model is trained anywhere in the visible notebook, so the source of the
# 'KNN' value (99.89) cannot be traced — confirm or remove. The other three values equal the
# hard-coded training accuracies plotted for DT, RF and SVM in the TRAINING ACCURACY chart.
names = ['DT','RF','SVM','KNN']
values = [99.058,99.997,99.875,99.89]
# num=10 reuses figure number 10, as the earlier summary charts do.
f = plt.figure(figsize=(15,3),num=10)
plt.subplot(131)
# The y-axis starts at 90 so the small differences between models stay visible.
plt.ylim(90,102)
plt.bar(names,values)