In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Read the dataset from a CSV file located in the notebook's working directory.
df=pd.read_csv("dataset_sdn.csv")
# Bare expression: the notebook renders the frame as the cell output.
df

1. DATA PREPROCESSING


In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()


In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [None]:
# Keep the frame's column labels in `column_names` and display them.
column_names= df.columns
column_names

In [None]:
# Count the missing values in every column and draw the counts as a bar chart.
null_counts = df.isna().sum()
ax = null_counts.plot(kind="bar")
ax.set_title("NULL Values for each column ")
ax.set_xlabel("Column names")
ax.set_ylabel("Count")

In [None]:
# Drop every row that contains at least one null value.
# `df` is rebound to the filtered frame, so the unfiltered data is no longer
# reachable under this name after the cell runs.
df=df.dropna()


In [None]:
# Print the frame summary again to inspect row and non-null counts at this point.
df.info()

In [None]:
# Distinct values of the 'dst' column and how many of them there are.
uniq_dest = pd.unique(df['dst'])
total_dst = uniq_dest.size
print(f"Total destination :  {total_dst}")
print(f"Different destination :  {uniq_dest}")

In [None]:
# Number of rows per traffic label, shown as one bar per label value.
gp = df.groupby('label')['label'].count()
label_values = gp.index.tolist()
label_counts = gp.values.tolist()

plt.bar(label_values, label_counts, color=['g', 'r'])
plt.xticks(label_values)
plt.xlabel("Traffic label")
plt.ylabel("Count")
plt.title("Traffic for normal and Malicious traffic")

In [None]:
# Row counts per destination, split by label: column 0 and column 1 hold the
# counts for the two label values. The table is computed once so that the bar
# positions and bar widths always come from the same index; previously the
# positions were taken from the label==0 rows only, which breaks (length
# mismatch / misaligned bars) when a destination has no label-0 traffic.
# reindex guarantees both label columns exist even if one class is absent.
traffic_by_dst = (
    df.groupby(['dst', 'label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)
ip_addr = traffic_by_dst.index
normal_traffic = traffic_by_dst[0]
attack_traffic = traffic_by_dst[1]

# Both series are drawn from x=0, so the second call is painted over the first.
plt.barh(ip_addr, normal_traffic, color='g', label='Normal Traffic')
plt.barh(ip_addr, attack_traffic, color='r', label='Attack Traffic')
plt.legend()
plt.xlabel("Count")
plt.ylabel("Destination IP Adresses")
plt.title("Attack and Normal traffic ")


In [None]:
# Columns to exclude from modelling: every object (string) typed column,
# plus 'port_no', which is deliberately ignored as well.
object_col = df.select_dtypes(include=['object']).columns.tolist() + ['port_no']
print(object_col)
data = df.drop(columns=object_col)

# Separate the rows by protocol, then remove the excluded columns from each subset.
udp_df, tcp_df, icmp_df = (
    df.loc[df['Protocol'] == protocol].drop(columns=object_col)
    for protocol in ('UDP', 'TCP', 'ICMP')
)
icmp_df

2. DATA MODEL BUILDING

KNN


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

UDP

In [None]:
# Hold out 30% of the UDP rows for testing (fixed seed for reproducibility).
# The target is removed from the features by name, so the split no longer
# depends on 'label' being the last column of the frame.
udp_train, udp_test, udp_train_label, udp_test_label = train_test_split(
    udp_df.drop(columns='label'),
    udp_df['label'],
    test_size=0.3,
    random_state=42,
)

X = udp_train
y = udp_train_label

# 7-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X, y)

# Mean accuracy on the held-out test split.
print(knn.score(udp_test, udp_test_label))

TCP

In [None]:
# Hold out 30% of the TCP rows for testing (fixed seed for reproducibility).
# The target is removed from the features by name, so the split no longer
# depends on 'label' being the last column of the frame.
tcp_train, tcp_test, tcp_train_label, tcp_test_label = train_test_split(
    tcp_df.drop(columns='label'),
    tcp_df['label'],
    test_size=0.3,
    random_state=42,
)

X = tcp_train
y = tcp_train_label

# 7-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X, y)

# Mean accuracy on the held-out test split.
print(knn.score(tcp_test, tcp_test_label))

ICMP

In [None]:
# Hold out 30% of the ICMP rows for testing (fixed seed for reproducibility).
# The target is removed from the features by name, so the split no longer
# depends on 'label' being the last column of the frame.
icmp_train, icmp_test, icmp_train_label, icmp_test_label = train_test_split(
    icmp_df.drop(columns='label'),
    icmp_df['label'],
    test_size=0.3,
    random_state=42,
)

X = icmp_train
y = icmp_train_label

# 7-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X, y)

# Mean accuracy on the held-out test split.
print(knn.score(icmp_test, icmp_test_label))

SVM

In [None]:
from sklearn import svm

UDP

In [None]:
# Support vector classifier with a polynomial kernel, fitted on the UDP
# training split; fit() returns the estimator itself, so it can be chained.
clf = svm.SVC(kernel='poly').fit(udp_train, udp_train_label)

# Mean accuracy on the UDP test split (shown as the cell output).
clf.score(udp_test, udp_test_label)

TCP

In [None]:
# Support vector classifier with default settings, fitted on the TCP training split.
clf = svm.SVC()
clf.fit(tcp_train, tcp_train_label)

# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
metrics.accuracy_score(tcp_test_label, clf.predict(tcp_test))

ICMP

In [None]:
# Support vector classifier with default settings, fitted on the ICMP training split.
clf = svm.SVC()
clf.fit(icmp_train, icmp_train_label)

# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
metrics.accuracy_score(icmp_test_label, clf.predict(icmp_test))

NAIVE BAYES

In [None]:
from sklearn.naive_bayes import GaussianNB

UDP

In [None]:
# Gaussian Naive Bayes classifier fitted on the UDP training split.
nb = GaussianNB()
nb.fit(udp_train, udp_train_label)

# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
metrics.accuracy_score(udp_test_label, nb.predict(udp_test))

TCP

In [None]:
# Gaussian Naive Bayes classifier fitted on the TCP training split.
nb = GaussianNB()
nb.fit(tcp_train, tcp_train_label)

# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
metrics.accuracy_score(tcp_test_label, nb.predict(tcp_test))

ICMP

In [None]:
# Gaussian Naive Bayes classifier fitted on the ICMP training split.
nb = GaussianNB()
nb.fit(icmp_train, icmp_train_label)

# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
metrics.accuracy_score(icmp_test_label, nb.predict(icmp_test))

HYBRID MODEL TESTING

In [None]:
def _stratified_holdout(features, labels):
    """Split off a 20% validation fold, stratified on `labels`, with a fixed seed.

    Returns the tuple produced by train_test_split:
    (train features, validation features, train labels, validation labels).
    """
    return train_test_split(
        features, labels, stratify=labels, test_size=0.2, random_state=0
    )


# Carve a validation fold out of each protocol's training split.
train_x_udp, val_x_udp, train_y_udp, val_y_udp = _stratified_holdout(udp_train, udp_train_label)
train_x_tcp, val_x_tcp, train_y_tcp, val_y_tcp = _stratified_holdout(tcp_train, tcp_train_label)
train_x_icmp, val_x_icmp, train_y_icmp, val_y_icmp = _stratified_holdout(icmp_train, icmp_train_label)

In [None]:
# Base model 1: 5-nearest-neighbours fitted on the reduced UDP training fold.
model1 = KNeighborsClassifier(n_neighbors=5)
model1.fit(train_x_udp, train_y_udp)

# Predictions are stored as single-column frames that keep the row index of the
# data they were computed from. Without it the frames get a fresh 0..n-1 index
# and no longer line up with val_x_udp / udp_test when combined column-wise.
knn_udp_pred_val = pd.DataFrame(model1.predict(val_x_udp), index=val_x_udp.index)
knn_udp_test_val = pd.DataFrame(model1.predict(udp_test), index=udp_test.index)



In [None]:
# Base model 2: support vector classifier (default settings) fitted on the
# reduced UDP training fold.
model1 = svm.SVC()
model1.fit(train_x_udp, train_y_udp)

# Predictions are stored as single-column frames that keep the row index of the
# data they were computed from. Without it the frames get a fresh 0..n-1 index
# and no longer line up with val_x_udp / udp_test when combined column-wise.
svm_udp_pred_val = pd.DataFrame(model1.predict(val_x_udp), index=val_x_udp.index)
svm_udp_test_val = pd.DataFrame(model1.predict(udp_test), index=udp_test.index)

In [None]:
# Base model 3: Gaussian Naive Bayes fitted on the reduced UDP training fold.
model1 = GaussianNB()
model1.fit(train_x_udp, train_y_udp)

# Predictions are stored as single-column frames that keep the row index of the
# data they were computed from. Without it the frames get a fresh 0..n-1 index
# and no longer line up with val_x_udp / udp_test when combined column-wise.
nb_udp_pred_val = pd.DataFrame(model1.predict(val_x_udp), index=val_x_udp.index)
nb_udp_test_val = pd.DataFrame(model1.predict(udp_test), index=udp_test.index)

In [None]:
# Meta-learner inputs: the original features plus one column per base model
# holding that model's prediction for the same row.
#
# The prediction values are attached positionally (as flat arrays), which
# avoids three problems of concatenating the prediction frames directly:
#   * column-wise concat aligns on the index, and the prediction frames do not
#     necessarily share the (shuffled) index of the feature frames;
#   * the test frame was concatenated without axis=1, i.e. stacked row-wise;
#   * all prediction columns were named 0, making them indistinguishable and
#     mixing integer with string column labels.
rf_udp_input = val_x_udp.assign(
    nb_pred=nb_udp_pred_val.to_numpy().ravel(),
    knn_pred=knn_udp_pred_val.to_numpy().ravel(),
    svm_pred=svm_udp_pred_val.to_numpy().ravel(),
)
rf_udp_test = udp_test.assign(
    nb_pred=nb_udp_test_val.to_numpy().ravel(),
    knn_pred=knn_udp_test_val.to_numpy().ravel(),
    svm_pred=svm_udp_test_val.to_numpy().ravel(),
)

# Preview the assembled validation-fold inputs. The previous drop([0], axis=1)
# call discarded its result and would have removed the prediction columns.
rf_udp_input.head()


In [None]:
# Meta-learner: a 100-tree random forest with a fixed seed for reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit against the true validation labels. The target was previously the test
# feature frame (rf_udp_test), which is not a label vector.
rf.fit(rf_udp_input, val_y_udp)

# Mean accuracy of the hybrid model on the held-out UDP test split.
rf.score(rf_udp_test, udp_test_label)