In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for input_path in (os.path.join(root, name) for name in names):
        print(input_path)
import warnings
warnings.filterwarnings("ignore")



/kaggle/input/unridd-intrusion-detection-dataset/dataset.txt


# Exploratory Data Analysis (EDA)

In [None]:
# Load the dataset into `df` and print its column / dtype / non-null summary.
# NOTE(review): the directory listing printed by the first cell shows only
# `dataset.txt` in this folder, not `UNR-IDD.csv` — confirm this path exists.
df = pd.read_csv("/kaggle/input/unridd-intrusion-detection-dataset/UNR-IDD.csv")
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.describe()

In [None]:
df.apply(pd.Series.value_counts)


In [None]:
sns.countplot(data=df, x="Binary Label")

In [None]:
sns.countplot(x=df["Label"])

In [None]:
df["Binary Label"].unique()

In [None]:
df["Label"].unique()

In [None]:
df["Port Number"].unique()

In [None]:
df["Switch ID"].unique()

In [None]:
df['Label'].value_counts()

In [None]:
df.info()

In [None]:
df.hist(bins = 50,figsize = (15,15))

# Data Pre-processing

In [None]:
# Split the rows on 'Binary Label': df_a keeps the 'Attack' rows, df_n the 'Normal' rows.
# No 1/0 encoding is applied here — the column keeps its string values.
df_a=df[df['Binary Label']=='Attack']
df_n=df[df['Binary Label']=='Normal']


In [None]:
df_a

In [None]:
# Number of distinct 'Label' values among the 'Normal' rows.
print(df_n['Label'].nunique())
# df_n is not used by any later cell shown here: features and target are built
# from the attack rows (df_a) only.
df_n.info()

In [None]:
df_a=df_a.drop('Binary Label', axis=1)

In [None]:
# Encode the string-valued columns as integers so that df_a.corr() and the
# estimators further down receive purely numeric data.
# Each mapping is an explicit value -> code dictionary, so a pairing can be read
# (and checked) on a single line instead of depending on two parallel lists
# staying in the same order.
port_codes = {'Port#:1': 1, 'Port#:2': 2, 'Port#:3': 3, 'Port#:4': 4}
switch_codes = {
    'of:000000000000000c': 12,
    'of:000000000000000a': 10,
    'of:000000000000000b': 11,
    'of:0000000000000003': 3,
    'of:0000000000000004': 4,
    'of:0000000000000001': 1,
    'of:0000000000000002': 2,
    'of:0000000000000007': 7,
    'of:0000000000000008': 8,
    'of:0000000000000005': 5,
    'of:0000000000000006': 6,
    'of:0000000000000009': 9,
}
label_codes = {'TCP-SYN': 0, 'Blackhole': 1, 'Diversion': 2, 'Overflow': 3, 'PortScan': 4}

# .replace() leaves values that are already integers untouched, so re-running
# this cell is harmless. The explicit .astype("int64") guarantees a numeric
# dtype (pandas 2.2 deprecated the implicit downcast that used to convert the
# replaced object column) and raises right here if a value missing from the
# mapping is still present as a string, instead of failing later in corr()/fit().
df_a["Port Number"] = df_a["Port Number"].replace(port_codes).astype("int64")
df_a["Switch ID"] = df_a["Switch ID"].replace(switch_codes).astype("int64")
df_a['Label'] = df_a['Label'].replace(label_codes).astype("int64")

In [None]:
df_a.info()

In [None]:

df['Label'].value_counts().plot(kind='pie',autopct='%1.2f%%')
plt.title("Hacking Count")
plt.show()

In [None]:
plt.figure(figsize=(15,15))
sns.heatmap(df_a.corr(), annot=True)
plt.show()

In [None]:
#pairplot for particular features

# plt_df = df_a[['Switch ID', 'Port Number', 'Received Packets', 'Received Bytes',
#        'Sent Bytes', 'Sent Packets', 'Port alive Duration (S)',
#        'Packets Rx Dropped', 'Packets Tx Dropped', 'Packets Rx Errors',
#        'Packets Tx Errors', 'Delta Received Packets', 'Delta Received Bytes',
#        'Delta Sent Bytes', 'Delta Sent Packets',
#        'Delta Port alive Duration (S)', 'Delta Packets Rx Dropped',
#        ' Delta Packets Tx Dropped', 'Delta Packets Rx Errors',
#        'Delta Packets Tx Errors', 'Connection Point', 'Total Load/Rate',
#        'Total Load/Latest', 'Unknown Load/Rate', 'Unknown Load/Latest',
#        'Latest bytes counter', 'is_valid', 'Table ID', 'Active Flow Entries',
#        'Packets Looked Up', 'Packets Matched', 'Max Size', 'Label']]
# fig =sns.pairplot(data = plt_df,hue="Label",corner=True);
# fig.savefig("out.png") 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib 
import warnings
warnings.filterwarnings('ignore')

In [None]:
df_a.info()

In [None]:
# Separate the target from the features.
# y is the encoded attack type; X is every remaining column except the packet
# drop/error counters (and their deltas), 'is_valid', 'Table ID' and 'Max Size'.
y = df_a["Label"]
X = df_a.drop(
    columns=[
        'Label',
        'Packets Rx Dropped',
        'Packets Tx Dropped',
        'Packets Rx Errors',
        'Packets Tx Errors',
        'Delta Packets Rx Dropped',
        ' Delta Packets Tx Dropped',
        'Delta Packets Rx Errors',
        'Delta Packets Tx Errors',
        'is_valid',
        'Table ID',
        'Max Size',
    ]
)

In [None]:
y.unique()

In [None]:
X.shape,y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Display the shapes of the four resulting pieces.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Model Building & Training:
   Supervised machine learning is one of the most commonly used and successful types of machine learning. Supervised learning is used whenever we want to predict a certain outcome/label from a given set of features, and we have examples of features-label pairs. We build a machine learning model from these features-label pairs, which comprise our training set. Our goal is to make accurate predictions for new, never-before-seen data.

   There are two major types of supervised machine learning problems, called classification and regression. Our data set comes under the classification problem, as the target — the attack type stored in `Label` — is a discrete category rather than a continuous number. The supervised machine learning models (classification) considered to train the dataset in this notebook are:

1. Logistic Regression
2. k-Nearest Neighbors 
3. Support Vector Classifier
4. Naive Bayes
5. Decision Tree
6. Random Forest
7. Gradient Boosting
8. Catboost
9. Xgboost
10. Multilayer Perceptrons

              
  The metrics considered to evaluate the model performance are Accuracy, F1 score, Recall & Precision.

In [None]:
# Creating holders to store the model performance results
from sklearn import metrics 
# Parallel result columns: position i of every list belongs to the i-th stored model.
ML_Model, accuracy, f1_score, recall, precision = [], [], [], [], []

def storeResults(model, a, b, c, d):
    """Append one model's scores to the module-level result lists.

    `model` is stored unchanged in ML_Model. `a`, `b`, `c` and `d` are rounded
    to three decimals and appended to accuracy, f1_score, recall and precision
    respectively, so the argument order matters.
    """
    ML_Model.append(model)
    rounded_scores = (round(score, 3) for score in (a, b, c, d))
    for column, score in zip((accuracy, f1_score, recall, precision), rounded_scores):
        column.append(score)

In [None]:
def model_report(modelname,y_train,y_test,p_train,p_test,average='micro'):
    """Print train/test metrics for one fitted model and record its test scores.

    Parameters
    ----------
    modelname : label printed in the header and stored with the results.
    y_train, y_test : true labels of the training and test samples.
    p_train, p_test : the model's predictions for those same samples.
    average : averaging mode forwarded to f1_score, recall_score and
        precision_score. Defaults to 'micro', the value that was previously
        hard-coded. For single-label multiclass targets, micro-averaged F1,
        recall and precision all coincide with accuracy; pass 'macro' or
        'weighted' to obtain scores that reflect per-class performance.

    Side effects
    ------------
    Prints accuracy, F1, recall and precision for both splits followed by the
    classification report of the test split, then calls
    storeResults(modelname, accuracy, f1, recall, precision) with the test-set
    values in exactly that order.
    """
    print("Model:{}\n".format(modelname))

    # Accuracy takes no averaging mode, so it is handled on its own.
    acc_train = metrics.accuracy_score(y_train,p_train)
    acc_test = metrics.accuracy_score(y_test,p_test)
    print("Accuracy on training Data: {:.3f}".format(acc_train))
    print("Accuracy on test Data: {:.3f}\n".format(acc_test))

    # The three averaged metrics share one code path; the printed labels are
    # kept exactly as they were.
    averaged_metrics = (
        ("f1_score", metrics.f1_score),
        ("Recall", metrics.recall_score),
        ("Precision", metrics.precision_score),
    )
    test_scores = []
    for label, scorer in averaged_metrics:
        score_train = scorer(y_train,p_train,average=average)
        score_test = scorer(y_test,p_test,average=average)
        print("{} on training Data: {:.3f}".format(label, score_train))
        print("{} on test Data: {:.3f}\n".format(label, score_test))
        test_scores.append(score_test)

    # Per-class breakdown for the test split.
    print("Classification Report")
    print(metrics.classification_report(y_test, p_test))

    # storeResults expects (model, accuracy, f1, recall, precision).
    f1_test, recall_test, precision_test = test_scores
    storeResults(modelname, acc_test, f1_test, recall_test, precision_test)

## Logistic Regression

Logistic regression predicts the output of a categorical dependent variable. Therefore the outcome must be a categorical or discrete value. Logistic Regression is very similar to Linear Regression; the difference lies in how they are used. Linear Regression is used for solving regression problems, whereas Logistic Regression is used for solving classification problems.

In [None]:
# Logistic regression classifier (a classification model, not linear regression)
from sklearn.linear_model import LogisticRegression

# instantiate the model with the library's default settings
log = LogisticRegression()

# fit the model on the training split
log.fit(X_train,y_train)

In [None]:
#predicting the target value from the model for the samples

p_train_log = log.predict(X_train)
p_test_log = log.predict(X_test)

In [None]:

model_report(str(log),y_train,y_test,p_train_log,p_test_log)

## K-Nearest Neighbors : Classifier

K-Nearest Neighbours is one of the simplest Machine Learning algorithms based on the Supervised Learning technique. The K-NN algorithm assumes similarity between the new case/data and the available cases and puts the new case into the category that is most similar to the available categories.

In [None]:
# K-Nearest Neighbors Classifier model
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model 
knn.fit(X_train,y_train)

In [None]:
# Accuracy of a KNN classifier on both splits for each candidate n_neighbors.
training_accuracy = []
test_accuracy = []
# try n_neighbors from 1 to 19 (range(1, 20) excludes 20);
# despite its name, `depth` holds neighbour counts in this cell
depth = range(1,20)
for n in depth:
    # a fresh classifier per candidate; the `knn` model fitted earlier is not touched
    knn1 = KNeighborsClassifier(n_neighbors=n)

    knn1.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(knn1.score(X_train, y_train))
    # record generalization accuracy
    test_accuracy.append(knn1.score(X_test, y_test))
    

# plotting the training & testing accuracy against n_neighbors (1 to 19)
plt.plot(depth, training_accuracy, label="training accuracy")
plt.plot(depth, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")  
plt.xlabel("n_neighbors")
plt.legend();

In [None]:
#predicting the target value from the model for the samples
p_train_knn = knn.predict(X_train)
p_test_knn = knn.predict(X_test)

In [None]:
model_report(str(knn),y_train,y_test,p_train_knn,p_test_knn)

## Naive Bayes : Classifier

The Naïve Bayes algorithm is a supervised learning algorithm, which is based on Bayes' theorem and used for solving classification problems. It is mainly used in text and image classification that involves a high-dimensional training dataset. The Naïve Bayes Classifier is one of the simplest and most effective classification algorithms, which helps in building fast machine learning models that can make quick predictions.

In [None]:
# Naive Bayes Classifier Model
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# instantiate the model
nb=  GaussianNB()

# fit the model 
nb.fit(X_train,y_train)

In [None]:
#predicting the target value from the model for the samples
p_train_nb = nb.predict(X_train)
p_test_nb = nb.predict(X_test)

In [None]:
model_report(str(nb),y_train,y_test,p_train_nb,p_test_nb)

##  Decision Trees : Classifier

Decision Tree is a Supervised learning technique that can be used for both classification and Regression problems, but mostly it is preferred for solving Classification problems. It is a tree-structured classifier, where internal nodes represent the features of a dataset, branches represent the decision rules and each leaf node represents the outcome.

In [None]:
# Decision Tree Classifier model 
from sklearn.tree import DecisionTreeClassifier

# instantiate the model 
tree = DecisionTreeClassifier(max_depth=30)

# fit the model 
tree.fit(X_train, y_train)

In [None]:
#predicting the target value from the model for the samples

p_train_tree = tree.predict(X_train)
p_test_tree = tree.predict(X_test)

In [None]:
# Report the decision tree using its OWN training predictions (p_train_tree).
# The Naive Bayes training predictions (p_train_nb) were passed here before, so
# the printed training metrics described the wrong model.
model_report(str(tree),y_train,y_test,p_train_tree,p_test_tree)

In [None]:
# Accuracy of a decision tree on both splits for each candidate max_depth.
# Note: this rebinds training_accuracy / test_accuracy / depth from any earlier cell.
training_accuracy = []
test_accuracy = []
# try max_depth from 1 to 29 (range(1, 30) excludes 30)
depth = range(1,30)
for n in depth:
    # a fresh tree per candidate depth; the `tree` model fitted earlier is not touched
    tree_test = DecisionTreeClassifier(max_depth=n)

    tree_test.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(tree_test.score(X_train, y_train))
    # record generalization accuracy
    test_accuracy.append(tree_test.score(X_test, y_test))
    

# plotting the training & testing accuracy for max_depth from 1 to 29
plt.plot(depth, training_accuracy, label="training accuracy")
plt.plot(depth, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")  
plt.xlabel("max_depth")
plt.legend();

##  Random Forest : Classifier

Random Forest is a popular machine learning algorithm that belongs to the supervised learning technique. It can be used for both Classification and Regression problems in ML. It is based on the concept of ensemble learning, which is a process of combining multiple classifiers to solve a complex problem and to improve the performance of the model.

In [None]:
# Random Forest Classifier Model
from sklearn.ensemble import RandomForestClassifier

# instantiate the model
# random_state pins the forest's random sampling so that a Restart & Run All
# reproduces the same model and scores; 42 matches the seed already used for
# the train/test split.
forest = RandomForestClassifier(n_estimators=10, random_state=42)

# fit the model 
forest.fit(X_train,y_train)

In [None]:
#predicting the target value from the model for the samples
p_train_forest = forest.predict(X_train)
p_test_forest = forest.predict(X_test)

In [None]:
# Report the random forest using its OWN training predictions (p_train_forest).
# The Naive Bayes training predictions (p_train_nb) were passed here before, so
# the printed training metrics described the wrong model.
model_report(str(forest),y_train,y_test,p_train_forest,p_test_forest)

## Gradient Boosting Classifier
Gradient boosting classifiers are a group of machine learning algorithms that combine many weak learning models together to create a strong predictive model. Decision trees are usually used when doing gradient boosting. Boosting algorithms play a crucial role in dealing with bias variance trade-off.  Unlike bagging algorithms, which only controls for high variance in a model, boosting controls both the aspects (bias & variance), and is considered to be more effective. 

In [None]:
# Gradient Boosting Classifier Model
from sklearn.ensemble import GradientBoostingClassifier

# instantiate the model
gbc = GradientBoostingClassifier(max_depth=4,learning_rate=0.7)

# fit the model 
gbc.fit(X_train,y_train)

In [None]:
#predicting the target value from the model for the samples
p_train_gbc = gbc.predict(X_train)
p_test_gbc = gbc.predict(X_test)

In [None]:
# Report gradient boosting using its OWN training predictions (p_train_gbc).
# The Naive Bayes training predictions (p_train_nb) were passed here before, so
# the printed training metrics described the wrong model.
model_report(str(gbc),y_train,y_test,p_train_gbc,p_test_gbc)

## CatBoost Classifier

CatBoost is a recently open-sourced machine learning algorithm from Yandex. It can easily integrate with deep learning frameworks like Google’s TensorFlow and Apple’s Core ML. It can work with diverse data types to help solve a wide range of problems that businesses face today.

In [None]:
#  catboost Classifier Model
from catboost import CatBoostClassifier

# instantiate the model
cat = CatBoostClassifier(learning_rate  = 0.1)

# fit the model 
cat.fit(X_train,y_train)

In [None]:
#predicting the target value from the model for the samples
p_train_cat = cat.predict(X_train)
p_test_cat = cat.predict(X_test)

In [None]:
model_report(str(cat),y_train,y_test,p_train_cat,p_test_cat)

##  XGBoost Classifier

XGBoost is an implementation of gradient boosted decision trees designed for speed and performance, and it is a dominant choice in competitive machine learning. In this section an XGBoost classifier is created and trained with its default settings.

In [None]:
#  XGBoost Classifier Model
from xgboost import XGBClassifier

# instantiate the model
xgb = XGBClassifier()

# fit the model 
xgb.fit(X_train,y_train)

In [None]:
#predicting the target value from the model for the samples
p_train_xgb = xgb.predict(X_train)
p_test_xgb = xgb.predict(X_test)

In [None]:
model_report(str(xgb),y_train,y_test,p_train_xgb,p_test_xgb)

##  Multi-layer Perceptron classifier

MLPClassifier stands for Multi-layer Perceptron classifier which in the name itself connects to a Neural Network. Unlike other classification algorithms such as Support Vectors or Naive Bayes Classifier, MLPClassifier relies on an underlying Neural Network to perform the task of classification.


In [None]:
# Multi-layer Perceptron Classifier Model
from sklearn.neural_network import MLPClassifier

# instantiate the model
# random_state pins the network's random initialisation so that a Restart &
# Run All reproduces the same model and scores; 42 matches the seed already
# used for the train/test split.
mlp = MLPClassifier(random_state=42)
#mlp = GridSearchCV(mlpc, parameter_space)

# fit the model 
mlp.fit(X_train,y_train)

In [None]:
#predicting the target value from the model for the samples
p_train_mlp = mlp.predict(X_train)
p_test_mlp = mlp.predict(X_test)

In [None]:
model_report(str(mlp),y_train,y_test,p_train_mlp,p_test_mlp)

##  Comparison of Models
To compare the models' performance, a dataframe is created. The columns of this dataframe are the lists created to store the results of the models.

In [None]:
#creating dataframe
result = pd.DataFrame({ 'ML Model' : ML_Model,
                        'Accuracy' : accuracy,
                        'f1_score' : f1_score,
                        'Recall'   : recall,
                        'Precision': precision,
                      })

In [None]:
#Sorting the datafram on accuracy
sorted_result=result.sort_values(by=['Accuracy', 'f1_score'],ascending=False).reset_index(drop=True)
sorted_result

## Storing High Score Model 

In [None]:
##  high_score_model ---> XGBoost Classifier Model
import pickle

high_score_model = XGBClassifier()

high_score_model.fit(X_train,y_train)

# dump information to that file
#pickle.dump(high_score_model, open('pickle/model.pkl', 'wb'))

In [None]:
# Persist the fitted model to model.pkl. The context manager closes (and
# flushes) the file handle as soon as the dump finishes instead of leaving an
# open handle behind for the garbage collector.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(high_score_model, model_file)

In [None]:
# Checking the feature importance in the gradient boosting model (gbc).
# The bars are gbc.feature_importances_, i.e. the impurity-based importances
# computed during training; no permutation importance is calculated here,
# so the title states what is actually plotted.
# NOTE(review): the model pickled above is the XGBoost one — confirm that gbc is
# the model whose importances are meant to be shown.
plt.figure(figsize=(9,7))
n_features = X_train.shape[1]
plt.barh(range(n_features), gbc.feature_importances_, align='center')
plt.yticks(np.arange(n_features), X_train.columns)
plt.title("Impurity-based feature importances (Gradient Boosting)")
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plt.show()