In [241]:
!pip uninstall scikit-learn

^C


In [76]:
from datetime import datetime
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [77]:
df_being = pd.read_csv('traffic_log3.csv')
df_malicious = pd.read_csv('traffic_log10.csv')

In [78]:
df = pd.concat([df_being, df_malicious])

In [79]:
# Handling outliers in packet_size (example using Z-score)
z_scores = np.abs((df['packet_size'] - df['packet_size'].mean()) / df['packet_size'].std())
df = df[z_scores < 3]  # Assuming 3 standard deviations as the cutoff

In [80]:
# Normalize numerical features
scaler = StandardScaler()
df[['source_port', 'target_port', 'packet_size', 'packet_length']] = scaler.fit_transform(
    df[['source_port', 'target_port', 'packet_size', 'packet_length']])

In [81]:
df['source_ip'] = df['source_ip'].apply(lambda x: float(x.replace('.', '')))
df['target_ip'] = df['target_ip'].apply(lambda x: float(x.replace('.', '')))

In [82]:
encoder = LabelEncoder()
df['attack_mode_encoded'] = encoder.fit_transform(df['attack_mode'])

In [83]:
X = df[['source_port', 'target_port', 'packet_size', 'packet_length',
                  'source_ip', 'target_ip', 'attack_mode_encoded' ]]

In [84]:
y = encoder.fit_transform(df['label'])  # Reusing the previously fitted encoder

In [85]:
# Re-split the data after preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Logistic Regression

In [26]:
model = LogisticRegression(max_iter=1000, random_state=42)

In [27]:
# Training the model again to see any impact of the preprocessing
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [28]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 98.55 %
fail accuracy = 1.45 %


In [29]:
classification_rep

'              precision    recall  f1-score   support\n\n           0       0.55      0.17      0.26     10108\n           1       0.99      1.00      0.99    666082\n\n    accuracy                           0.99    676190\n   macro avg       0.77      0.59      0.63    676190\nweighted avg       0.98      0.99      0.98    676190\n'

### Random Forest

In [86]:
classifier = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=0)

In [87]:
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [88]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 100.00 %
fail accuracy = 0.00 %


### Decision Tree

In [64]:
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [65]:
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [66]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 100.00 %
fail accuracy = 0.00 %


In [67]:
classification_rep

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     10108\n           1       1.00      1.00      1.00    666082\n\n    accuracy                           1.00    676190\n   macro avg       1.00      1.00      1.00    676190\nweighted avg       1.00      1.00      1.00    676190\n'

### KNeighbors

In [234]:
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)

In [235]:
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [236]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 100.00 %
fail accuracy = 0.00 %


### SVC

In [109]:
classifier = SVC(kernel='rbf', random_state=0)

In [None]:
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [None]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

### NAIVE-BAYES

In [97]:
classifier = GaussianNB()

In [98]:
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [99]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 99.75 %
fail accuracy = 0.25 %


In [None]:
# Initialize and train the XGBoost classifier
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model's performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
classification_rep_xgb = classification_report(y_test, y_pred_xgb)

In [100]:
# importing the joblib libraray
import joblib

In [101]:
joblib.dump(model, 'rf.joblib')

['rf.joblib']

In [102]:
model

In [61]:
model = joblib.load('rf.joblib')

In [103]:
df_test = pd.read_csv('traffic_log10.csv')

In [104]:
df_test

Unnamed: 0,timestamp,source_ip,target_ip,source_port,target_port,packet_size,attack_mode,label,packet_length,packet_summary
0,2024-04-20T23:02:46.062581,192.168.1.100,192.168.31.185,27432,100,60,http,normal,80,IP / TCP 192.168.1.100:27432 > 192.168.31.185:...
1,2024-04-20T23:02:46.067191,192.168.1.100,192.168.31.185,34853,100,60,http,normal,80,IP / TCP 192.168.1.100:34853 > 192.168.31.185:...
2,2024-04-20T23:02:46.072180,192.168.1.100,192.168.31.185,54786,100,60,http,normal,80,IP / TCP 192.168.1.100:54786 > 192.168.31.185:...
3,2024-04-20T23:02:46.081156,192.168.1.100,192.168.31.185,5702,100,60,http,normal,80,IP / TCP 192.168.1.100:5702 > 192.168.31.185:h...
4,2024-04-20T23:02:46.091130,192.168.1.100,192.168.31.185,23023,100,60,http,normal,80,IP / TCP 192.168.1.100:23023 > 192.168.31.185:...
...,...,...,...,...,...,...,...,...,...,...
124883,1713689493,192.168.31.185,162.247.241.14,59409,443,1161,tcp,normal,1195,Ether / IP / TCP 192.168.31.185:59409 > 162.24...
124884,1713689493,162.247.241.14,192.168.31.185,443,59409,26,tcp,normal,60,Ether / IP / TCP 162.247.241.14:https > 192.16...
124885,1713689493,192.168.31.185,162.247.243.29,56810,443,20,tcp,normal,54,Ether / IP / TCP 192.168.31.185:56810 > 162.24...
124886,1713689493,162.247.241.14,192.168.31.185,443,59409,439,tcp,normal,473,Ether / IP / TCP 162.247.241.14:https > 192.16...


In [105]:
df_test[['source_port', 'target_port', 'packet_size', 'packet_length',
                  'source_ip', 'target_ip', 'attack_mode']]

Unnamed: 0,source_port,target_port,packet_size,packet_length,source_ip,target_ip,attack_mode
0,27432,100,60,80,192.168.1.100,192.168.31.185,http
1,34853,100,60,80,192.168.1.100,192.168.31.185,http
2,54786,100,60,80,192.168.1.100,192.168.31.185,http
3,5702,100,60,80,192.168.1.100,192.168.31.185,http
4,23023,100,60,80,192.168.1.100,192.168.31.185,http
...,...,...,...,...,...,...,...
124883,59409,443,1161,1195,192.168.31.185,162.247.241.14,tcp
124884,443,59409,26,60,162.247.241.14,192.168.31.185,tcp
124885,56810,443,20,54,192.168.31.185,162.247.243.29,tcp
124886,443,59409,439,473,162.247.241.14,192.168.31.185,tcp


In [106]:
# Normalize numerical features
scaler = StandardScaler()
df_test[['source_port', 'target_port', 'packet_size', 'packet_length']] = scaler.fit_transform(
    df_test[['source_port', 'target_port', 'packet_size', 'packet_length']])

df_test['source_ip'] = df_test['source_ip'].apply(lambda x: x.replace('.', ''))
df_test['target_ip'] = df_test['target_ip'].apply(lambda x: x.replace('.', ''))

encoder = LabelEncoder()
df_test['attack_mode_encoded'] = encoder.fit_transform(df_test['attack_mode'])

X = df_test[['source_port', 'target_port', 'packet_size', 'packet_length',
                  'source_ip', 'target_ip', 'attack_mode_encoded']]

In [107]:
X

Unnamed: 0,source_port,target_port,packet_size,packet_length,source_ip,target_ip,attack_mode_encoded
0,-0.169890,-0.373419,-1.269651,-0.169820,1921681100,19216831185,0
1,0.180192,-0.373419,-1.269651,-0.169820,1921681100,19216831185,0
2,1.120520,-0.373419,-1.269651,-0.169820,1921681100,19216831185,0
3,-1.194991,-0.373419,-1.269651,-0.169820,1921681100,19216831185,0
4,-0.377882,-0.373419,-1.269651,-0.169820,1921681100,19216831185,0
...,...,...,...,...,...,...,...
124883,1.338608,-0.355593,0.788151,2.306121,19216831185,16224724114,2
124884,-1.443082,2.708787,-1.333198,-0.214232,16224724114,19216831185,2
124885,1.216001,-0.355593,-1.344412,-0.227555,19216831185,16224724329,2
124886,-1.443082,2.708787,-0.561289,0.702866,16224724114,19216831185,2


In [108]:
model.predict(X)

array([0, 0, 0, ..., 1, 0, 1])