# This Notebook starts from Preprocessed Dataframework. "df_spark.csv" is the dataframe

In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools

In [2]:
df_spark = pd.read_csv('df_spark.csv')

In [3]:
df_spark.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,7,21,30,2,3,30,2,3,59,7,2,0
1,1,7,32,70,2,4,68,2,4,134,7,2,0
2,2,7,15,1,2,2,0,2,2,0,7,2,0
3,3,7,33,77,2,7,73,2,7,144,7,2,0
4,4,7,55,78,3,7,74,3,7,146,8,2,0


In [4]:
df_spark = df_spark.drop(columns="Unnamed: 0")

In [5]:
df_spark.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,7,21,30,2,3,30,2,3,59,7,2,0
1,7,32,70,2,4,68,2,4,134,7,2,0
2,7,15,1,2,2,0,2,2,0,7,2,0
3,7,33,77,2,7,73,2,7,144,7,2,0
4,7,55,78,3,7,74,3,7,146,8,2,0


# In the following code X contains features and y contains label

In [7]:
y = df_spark.iloc[:,0].values
X = df_spark.iloc[:,1:].values

# The whole dataset is split into 80:20 ratio. X_train contains 80% of the features, X_test contains 20% of the features and y_train contains 80% corresponding label of X_train and y_test contains 20% corresponding label of X_test

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state = 1)

# 5-Fold Cross validation Estimation for Logistic Regression

In [12]:
pipe_lr = Pipeline([('scl', StandardScaler()),('clf', LogisticRegression(penalty='l2', random_state=0))])
train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr,X=X, y=y, train_sizes=np.linspace(0.2,1.0,5), cv=5, n_jobs=-1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

In [30]:
for i in train_sizes:
    print(i)

57270
114540
171810
229080
286350


In [31]:
for i in train_mean:
    print(i)

0.9809708398812642
0.9807019381875328
0.9852767592107561
0.9823686048541994
0.9832561550550027


In [32]:
for i in test_mean:
    print(i)

0.9664216730632615
0.9730514079742971
0.977663927500444
0.979675511645041
0.9830616753741129


# 5-Fold Cross validation Estimation for SVM

In [34]:
pipe_svc = Pipeline([('scl', StandardScaler()),('clf', LinearSVC())])
train_sizes_svc, train_scores_svc, test_scores_svc = learning_curve(estimator=pipe_svc,X=X, y=y, train_sizes=np.linspace(0.2,1.0,5), cv=5, n_jobs=-1)
train_mean_svc = np.mean(train_scores_svc, axis=1)
train_std_svc = np.std(train_scores_svc, axis=1)
test_mean_svc = np.mean(test_scores_svc, axis=1)
test_std_svc = np.std(test_scores_svc, axis=1)

In [35]:
for i in train_mean_svc:
    print(i)

0.9829753797799894
0.9816710319539025
0.98534436095477
0.9822630423300055
0.9831438229870928


In [12]:
for i in test_mean_svc:
    print(i)

0.9603030479831771
0.9712605169621694
0.9774823412881591
0.9796084579451501
0.9825783562928292


# 5-Fold Cross validation Estimation for Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
pipe_rnd = Pipeline([('scl', StandardScaler()),('clf', RandomForestClassifier(n_estimators=10))])
train_sizes_rnd, train_scores_rnd, test_scores_rnd = learning_curve(estimator=pipe_rnd,X=X, y=y, train_sizes=np.linspace(0.2,1.0,5), cv=5, n_jobs=-1)
train_mean_rnd = np.mean(train_scores_rnd, axis=1)
train_std_rnd = np.std(train_scores_rnd, axis=1)
test_mean_rnd = np.mean(test_scores_rnd, axis=1)
test_std_rnd = np.std(test_scores_rnd, axis=1)

In [20]:
for i in train_mean_rnd:
    print(i)

0.9866876200453991
0.9884634188929631
0.9906862231534834
0.992856643967173
0.9942168674698795


In [21]:
for i in test_mean_rnd:
    print(i)

0.9439088829638763
0.9685588432554251
0.9801726981234566
0.9884841598922233
0.9932225637013902


# Mean values of Training and Testing accuracies and Standard Deviation of Training and Testing accuracies are given below

In [58]:
np.mean(train_mean) , np.mean(train_mean_svc), np.mean(train_mean_tree), np.mean(train_mean_rnd), np.mean(train_mean_mlp)

(0.9825350341636725,
 0.9827354393092291,
 0.9899853295348444,
 0.9899843194255066,
 0.988992106220478)

In [60]:
np.mean(train_std) , np.mean(train_std_svc), np.mean(train_std_tree), np.mean(train_std_rnd), np.mean(train_std_mlp)

(0.0012669016539291768,
 0.0015004132547957309,
 0.000812529982066585,
 0.0008124067495914715,
 0.0013105445891091817)

In [61]:
np.mean(test_mean) , np.mean(test_mean_svc), np.mean(test_mean_tree), np.mean(test_mean_rnd), np.mean(test_mean_mlp)

(0.9745988472667129,
 0.9762471837085901,
 0.9707884454993927,
 0.9725458663799749,
 0.9611862286384287)

In [62]:
np.mean(test_std) , np.mean(test_std_svc), np.mean(test_std_tree), np.mean(test_std_rnd), np.mean(test_std_mlp)

(0.005597696152259861,
 0.006471826851518314,
 0.016850233251830146,
 0.014866038573080196,
 0.021343692164590724)

# Evaluation Metrics Calculations for Logisitic Regression

In [13]:
pipe_lr = pipe_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
y_pred_train = pipe_lr.predict(X_train)

In [15]:
y_pred_test = pipe_lr.predict(X_test)

In [17]:
accuracy_score(y_train, y_pred_train)

0.9882626976588255

In [18]:
accuracy_score(y_test, y_pred_test)

0.9879311067342749

In [20]:
target_names = ['Normal', 'DoSattack', 'scan', 'malitiousControl', 'malitiousOperation', 'spying', 'dataProbing', 'wrongSetUp']

In [21]:
print(classification_report(y_train, y_pred_train, target_names=target_names))

                    precision    recall  f1-score   support

            Normal       0.96      0.65      0.78      4602
         DoSattack       0.88      0.88      0.88       279
              scan       0.98      0.92      0.95       720
  malitiousControl       0.78      0.48      0.59       650
malitiousOperation       0.90      0.50      0.64      1242
            spying       0.00      0.00      0.00       412
       dataProbing       0.90      1.00      0.95        94
        wrongSetUp       0.99      1.00      0.99    278353

          accuracy                           0.99    286352
         macro avg       0.80      0.68      0.72    286352
      weighted avg       0.99      0.99      0.99    286352



In [22]:
print(classification_report(y_test, y_pred_test, target_names=target_names))

                    precision    recall  f1-score   support

            Normal       0.96      0.66      0.78      1178
         DoSattack       0.87      0.87      0.87        63
              scan       0.97      0.92      0.94       169
  malitiousControl       0.76      0.50      0.60       155
malitiousOperation       0.88      0.47      0.61       305
            spying       0.00      0.00      0.00       120
       dataProbing       0.93      1.00      0.97        28
        wrongSetUp       0.99      1.00      0.99     69571

          accuracy                           0.99     71589
         macro avg       0.80      0.68      0.72     71589
      weighted avg       0.99      0.99      0.99     71589



In [24]:
cnf_matrix = confusion_matrix(y_test, y_pred_test)

In [26]:
for i in cnf_matrix:
    for j in i:
        print(j, end=' &')
    print()

775 &0 &0 &0 &0 &0 &0 &403 &
0 &55 &0 &0 &0 &0 &0 &8 &
0 &0 &155 &0 &0 &0 &0 &14 &
0 &0 &0 &78 &0 &0 &0 &77 &
0 &0 &5 &5 &143 &0 &2 &150 &
0 &0 &0 &0 &16 &0 &0 &104 &
0 &0 &0 &0 &0 &0 &28 &0 &
34 &8 &0 &20 &3 &15 &0 &69491 &


# Evaluation Metrics Calculations for SVM

In [27]:
pipe_svc = pipe_svc.fit(X_train, y_train)
y_pred_train = pipe_svc.predict(X_train)
y_pred_test = pipe_svc.predict(X_test)

NameError: name 'pipe_svc' is not defined

In [38]:
accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)

(0.9826332625579706, 0.9827627149422398)

In [39]:
print(classification_report(y_train, y_pred_train, target_names=target_names))

                    precision    recall  f1-score   support

            Normal       0.97      0.65      0.78      4602
         DoSattack       0.00      0.00      0.00       279
              scan       0.78      0.04      0.07       720
  malitiousControl       1.00      0.16      0.27       650
malitiousOperation       0.00      0.00      0.00      1242
            spying       0.00      0.00      0.00       412
       dataProbing       0.00      0.00      0.00        94
        wrongSetUp       0.98      1.00      0.99    278353

       avg / total       0.98      0.98      0.98    286352



  'precision', 'predicted', average, warn_for)


In [40]:
print(classification_report(y_test, y_pred_test, target_names=target_names))

                    precision    recall  f1-score   support

            Normal       0.96      0.66      0.78      1178
         DoSattack       0.00      0.00      0.00        63
              scan       0.83      0.06      0.11       169
  malitiousControl       1.00      0.21      0.35       155
malitiousOperation       0.00      0.00      0.00       305
            spying       0.00      0.00      0.00       120
       dataProbing       0.00      0.00      0.00        28
        wrongSetUp       0.98      1.00      0.99     69571

       avg / total       0.98      0.98      0.98     71589



  'precision', 'predicted', average, warn_for)


In [41]:
cnf_matrix = confusion_matrix(y_test, y_pred_test)
for i in cnf_matrix:
    for j in i:
        print(j, end='&')
    print()

775&0&0&0&0&0&0&403&
0&0&0&0&0&0&0&63&
0&0&10&0&0&0&0&159&
0&0&0&33&0&0&0&122&
0&0&2&0&0&0&0&303&
0&0&0&0&0&0&0&120&
0&0&0&0&0&0&0&28&
34&0&0&0&0&0&0&69537&


# Evaluation Metrics Calculations for Random Forest

In [47]:
pipe_rnd = pipe_rnd.fit(X_train, y_train)
y_pred_train = pipe_rnd.predict(X_train)
y_pred_test = pipe_rnd.predict(X_test)

In [48]:
y_pred_train = pipe_rnd.predict(X_train)
y_pred_test = pipe_rnd.predict(X_test)

In [49]:
accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)

(0.9942413532994356, 0.994119208258252)

In [50]:
print(classification_report(y_train, y_pred_train, target_names=target_names))

                    precision    recall  f1-score   support

            Normal       0.98      0.65      0.78      4602
         DoSattack       1.00      1.00      1.00       279
              scan       1.00      1.00      1.00       720
  malitiousControl       1.00      1.00      1.00       650
malitiousOperation       1.00      1.00      1.00      1242
            spying       1.00      1.00      1.00       412
       dataProbing       1.00      1.00      1.00        94
        wrongSetUp       0.99      1.00      1.00    278353

       avg / total       0.99      0.99      0.99    286352



In [51]:
print(classification_report(y_test, y_pred_test, target_names=target_names))

                    precision    recall  f1-score   support

            Normal       0.98      0.66      0.79      1178
         DoSattack       1.00      1.00      1.00        63
              scan       1.00      1.00      1.00       169
  malitiousControl       1.00      1.00      1.00       155
malitiousOperation       1.00      1.00      1.00       305
            spying       1.00      1.00      1.00       120
       dataProbing       1.00      1.00      1.00        28
        wrongSetUp       0.99      1.00      1.00     69571

       avg / total       0.99      0.99      0.99     71589



In [37]:
cnf_matrix = confusion_matrix(y_test, y_pred_test)

In [38]:
cnf_matrix

array([[  775,     0,     0,     0,     0,     0,     0,   403],
       [    0,    55,     0,     0,     0,     0,     0,     8],
       [    0,     0,   155,     0,     0,     0,     0,    14],
       [    0,     0,     0,    78,     0,     0,     0,    77],
       [    0,     0,     5,     5,   143,     0,     2,   150],
       [    0,     0,     0,     0,    16,     0,     0,   104],
       [    0,     0,     0,     0,     0,     0,    28,     0],
       [   34,     8,     0,    20,     3,    15,     0, 69491]],
      dtype=int64)