# Import Libraries

In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn import metrics

# Import Dataset

In [100]:
df = pd.read_csv('Data_exfiltration Dataset UPDATED.csv')
print(df.head())

        stime  flgs  proto  sport  dport  pkts  bytes  state       ltime  seq  \
0  1529283982  1000    100    138    138     4    986   1000  1529285063   15   
1  1529283953  1000    200  60864    443    12   1053   2000  1529284065   12   
2  1529284104  1000    100    138    138     4   1086   1000  1529284835   23   
3  1529283913  1000    300  40279    135    26   1560   3000  1529285076   10   
4  1529284204  1000    200  36682     22    28   5098   2000  1529284204   30   

   ...       max  spkts  dpkts  sbytes  dbytes       rate      srate  \
0  ...  0.000000      4      0     986       0   0.002777   0.002777   
1  ...  0.044681      8      4     697     356   0.098173   0.062473   
2  ...  0.000108      4      0    1086       0   0.004102   0.004102   
3  ...  0.000164     13     13     780     780   0.021496   0.010318   
4  ...  0.489598     13     15    1640    3458  55.147282  24.509903   

       drate  attack    attack_category  
0   0.000000       0             Norma

# Preparing Dataset for training

In [101]:
dataset = df.dropna()

In [102]:
X = dataset.iloc[:, 0:23].values
y = dataset.iloc[:, 23].values

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(y_test)

[1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0
 1]


# Training Random Forest Algorithm

In [104]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)
print(rf_y_pred)

[1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0
 1]


In [105]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_y_pred = tree.predict(X_test)
print(tree_y_pred)

[1 0 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0
 1]


# AdaBoost Algorithm

In [106]:
ada = AdaBoostClassifier(random_state = 1)
ada.fit(X_train, y_train)
ada_y_pred = ada.predict(X_test)
print(ada_y_pred)

[1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0
 1]


# Support VECTOR Machine

In [107]:
svc =  SVC()
svc.fit(X_train, y_train)
svc_y_pred = svc.predict(X_test)
print(svc_y_pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1]


# GradientBoostingRegressor

In [108]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
gb_y_pred = gb.predict(X_test)
print(gb_y_pred)

[1 0 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0
 1]


# SDG Regressor

In [109]:
sdg =  SGDClassifier()
sdg.fit(X_train, y_train)
sdg_y_pred = sdg.predict(X_test)
print(sdg_y_pred)

[1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1]


In [110]:
lr = LogisticRegression(solver='lbfgs', max_iter=200)
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)
print(lr_y_pred)

[1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0
 1]


# Export Result

In [111]:
from sklearn import metrics
print('RANDOM FOREST R2 Score:', metrics.r2_score(y_test,rf_y_pred))
print('RANDOM FOREST Mean Absolute Error:', metrics.mean_absolute_error(y_test,rf_y_pred))
print('RANDOM FOREST Mean Squared Error:', metrics.mean_squared_error(y_test,rf_y_pred))
print('RANDOM FOREST Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,rf_y_pred)))

print('ADABOOST R2 Score:', metrics.r2_score(y_test,tree_y_pred))
print('ADABOOST Mean Absolute Error:', metrics.mean_absolute_error(y_test,ada_y_pred))
print('ADABOOST Mean Squared Error:', metrics.mean_squared_error(y_test,ada_y_pred))
print('ADABOOST Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,ada_y_pred)))

print('DECISION TREE R2 Score:', metrics.r2_score(y_test,tree_y_pred))
print('DECISION TREE Mean Absolute Error:', metrics.mean_absolute_error(y_test,tree_y_pred))
print('DECISION TREE Mean Squared Error:', metrics.mean_squared_error(y_test,tree_y_pred))
print('DECISION TREE Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,tree_y_pred)))

print('SVM R2 Score:', metrics.r2_score(y_test,svc_y_pred))
print('SVM Mean Absolute Error:', metrics.mean_absolute_error(y_test,svc_y_pred))
print('SVM Mean Squared Error:', metrics.mean_squared_error(y_test,svc_y_pred))
print('SVM Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,svc_y_pred)))

print('GB R2 Score:', metrics.r2_score(y_test,gb_y_pred))
print('GB Mean Absolute Error:', metrics.mean_absolute_error(y_test,gb_y_pred))
print('GB Mean Squared Error:', metrics.mean_squared_error(y_test,gb_y_pred))
print('GB Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,gb_y_pred)))

print('SDG R2 Score:', metrics.r2_score(y_test,sdg_y_pred))
print('SDG Mean Absolute Error:', metrics.mean_absolute_error(y_test,sdg_y_pred))
print('SDG Mean Squared Error:', metrics.mean_squared_error(y_test,sdg_y_pred))
print('SDG Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,sdg_y_pred)))

print('LR R2 Score:', metrics.r2_score(y_test,lr_y_pred))
print('LR Mean Absolute Error:', metrics.mean_absolute_error(y_test,lr_y_pred))
print('LR Mean Squared Error:', metrics.mean_squared_error(y_test,lr_y_pred))
print('LR Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,lr_y_pred)))

RANDOM FOREST R2 Score: 0.5632183908045976
RANDOM FOREST Mean Absolute Error: 0.07894736842105263
RANDOM FOREST Mean Squared Error: 0.07894736842105263
RANDOM FOREST Root Mean Squared Error: 0.28097574347450816
ADABOOST R2 Score: 0.7088122605363985
ADABOOST Mean Absolute Error: 0.0
ADABOOST Mean Squared Error: 0.0
ADABOOST Root Mean Squared Error: 0.0
DECISION TREE R2 Score: 0.7088122605363985
DECISION TREE Mean Absolute Error: 0.05263157894736842
DECISION TREE Mean Squared Error: 0.05263157894736842
DECISION TREE Root Mean Squared Error: 0.22941573387056177
SVM R2 Score: -0.3103448275862071
SVM Mean Absolute Error: 0.23684210526315788
SVM Mean Squared Error: 0.23684210526315788
SVM Root Mean Squared Error: 0.4866642633922876
GB R2 Score: 0.8544061302681992
GB Mean Absolute Error: 0.02631578947368421
GB Mean Squared Error: 0.02631578947368421
GB Root Mean Squared Error: 0.16222142113076254
SDG R2 Score: -0.6015325670498086
SDG Mean Absolute Error: 0.2894736842105263
SDG Mean Squared Er