In [141]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder, StandardScaler

import pandas as pd
import joblib

In [145]:
# Load the dataset to examine its structure and content
file_path = 'network_datum2.csv'
data = pd.read_csv(file_path)
# Display the first few rows of the dataset and its summary information
data.head()

Unnamed: 0,timestamp,packet_size,ttl,proto,csum,src_ip,dst_ip,src_port,dst_port,tcp_flag,type_icmp,code_icmp,csum_icmp,port_no,rx_bytes_ave,tx_bytes_ave,request_type,id,label
0,0.019356,103,128,6,0,192.168.31.185,3.81.183.197,62157,8095,24,0,0,0,0,103.0,103.0,tcp,3.81.183.197192.168.31.185809562157,normal
1,0.019359,60,241,6,44430,3.81.183.197,192.168.31.185,8095,62157,16,0,0,0,0,81.5,81.5,tcp,192.168.31.1853.81.183.197621578095,normal
2,0.01936,103,128,6,0,192.168.31.185,3.81.183.197,62157,8095,24,0,0,0,0,88.666667,88.666667,tcp,3.81.183.197192.168.31.185809562157,normal
3,0.019361,60,241,6,44429,3.81.183.197,192.168.31.185,8095,62157,16,0,0,0,0,81.5,81.5,tcp,192.168.31.1853.81.183.197621578095,normal
4,0.019361,60,241,6,44429,3.81.183.197,192.168.31.185,8095,62157,16,0,0,0,0,81.5,81.5,tcp,192.168.31.1853.81.183.197621578095,malicious


## KNeighborsClassifier

In [146]:
# Load the dataset
data = pd.read_csv('network_datum2.csv')

# Convert IP addresses to numerical format by removing dots and converting to integers
data['src_ip'] = data['src_ip'].apply(lambda x: int(x.replace('.', '')))
data['dst_ip'] = data['dst_ip'].apply(lambda x: int(x.replace('.', '')))
data['id'] = data['id'].apply(lambda x: int(x.replace('.', '')))

# Encoding request types with integers
def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    elif x == 'http':
        return 4
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Convert categorical labels to numbers
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Split the data into features and target
X = data.drop(['label', 'timestamp'], axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize K-Nearest Neighbors classifier with a suitable number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the classifier
model = knn_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [147]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 99.29 %
fail accuracy = 0.71 %


## SVC

In [148]:
# Load the dataset
data = pd.read_csv('network_datum2.csv')

# Convert IP addresses to numerical format by replacing dots and converting to integers
data['src_ip'] = data['src_ip'].apply(lambda x: int(x.replace('.', '')))
data['dst_ip'] = data['dst_ip'].apply(lambda x: int(x.replace('.', '')))
data['id'] = data['id'].apply(lambda x: int(x.replace('.', '')))

# Encoding request types with integers
def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    elif x == 'http':
        return 4
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Convert categorical labels to numbers
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Split the data into features and target
X = data.drop(['label', 'timestamp'], axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using Support Vector Machine classifier
svc_classifier = SVC()

# Training the classifier
model = svc_classifier.fit(X_train, y_train)

# Predicting the test set results
y_pred = model.predict(X_test)

In [149]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 99.29 %
fail accuracy = 0.71 %


## Random Forest

In [150]:
# Load the dataset to examine its structure and content
file_path = 'network_datum2.csv'
data = pd.read_csv(file_path)

# Convert IP addresses to numerical format using a simple hash function
data['src_ip'] = data['src_ip'].str.replace('.','')
data['dst_ip'] = data['dst_ip'].str.replace('.','')
data['id'] = data['id'].str.replace('.','')

def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    elif x == 'http':
        return 4
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Split the data into training and testing sets
X = data.drop(['label', 'timestamp'], axis=1)
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=0)

model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [151]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 99.91 %
fail accuracy = 0.09 %


In [152]:
# classification_report(y_test, y_pred).split('\n')

## Decision Tree

In [153]:
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [154]:
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [155]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 99.89 %
fail accuracy = 0.11 %


# XGBoost

In [156]:
# Re-load and re-process the data from scratch
file_path = 'network_datum2.csv'
data = pd.read_csv(file_path)

# Convert IP addresses to numerical format using a simple hash function
data['src_ip'] = data['src_ip'].apply(hash)
data['dst_ip'] = data['dst_ip'].apply(hash)
data['id'] = data['id'].replace('.','').apply(hash)

data.drop(['timestamp'], axis = 1, inplace = True)
# Encode the labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    elif x == 'http':
        return 4
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Standardize numerical features except for IP addresses which are hashed
numerical_features = [col for col in data.columns if col not in ['label', 'src_ip', 'dst_ip']]
scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Splitting the data into features and labels
X = data.drop('label', axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [157]:
# Re-initialize the XGBoost classifier and train again
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = model.predict(X_test)

# Evaluate the model with the same metrics
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
class_report_xgb = classification_report(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

In [158]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 99.89 %
fail accuracy = 0.11 %


In [162]:
data = pd.read_csv('network_datum2.csv')

# Convert IP addresses to numerical format by removing dots and converting to integers
data['src_ip'] = data['src_ip'].apply(lambda x: int(x.replace('.', '')))
data['dst_ip'] = data['dst_ip'].apply(lambda x: int(x.replace('.', '')))
data['id'] = data['id'].apply(lambda x: int(x.replace('.', '')))

# Encoding request types with integers
def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    elif x == 'http':
        return 4
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Convert categorical labels to numbers
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Split the data into features and target
X = data.drop(['label', 'timestamp'], axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Logistic Regression classifier
logistic_classifier = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence

# Train the classifier
model = logistic_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [163]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 99.89 %
fail accuracy = 0.11 %


## Export Model

In [160]:
joblib.dump(model, 'rf.joblib')

['rfff.joblib']

## Test

In [None]:
df = pd.read_csv('data_for_test.csv')
df = df.iloc[[0]]
# Convert IP addresses to numerical format using a simple hash function
df['src_ip'] = df['src_ip'].apply(hash)
df['dst_ip'] = df['dst_ip'].apply(hash)
df['id'] = df['id'].apply(hash)
# Encode the labels
df.drop(['timestamp'], axis = 1, inplace = True)
y_pred = model.predict(df.loc[0,:].to_numpy().reshape(1,-1))
model.predict(df)[0]