#### Import Libraries

In [100]:
# Import the libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix

In [73]:
# Import data

df = pd.read_csv('network_activity_data.csv')

In [74]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,0,0,0,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0
1,0,1,1,0,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
2,0,0,2,1,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
3,0,0,3,0,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
4,0,0,3,0,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [75]:
df.shape

(125973, 42)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  int64  
 2   service                      125973 non-null  int64  
 3   flag                         125973 non-null  int64  
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 non-null  int64  
 13 

In [77]:
# Missing Values

df.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [78]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,125973.0,287.14465,2604.515,0.0,0.0,0.0,0.0,42908.0
protocol_type,125973.0,0.250649,0.5652061,0.0,0.0,0.0,0.0,2.0
service,125973.0,10.10711,12.76029,0.0,2.0,3.0,14.0,69.0
flag,125973.0,0.622197,1.025197,0.0,0.0,0.0,1.0,10.0
src_bytes,125973.0,45566.743,5870331.0,0.0,0.0,44.0,276.0,1379964000.0
dst_bytes,125973.0,19779.114421,4021269.0,0.0,0.0,0.0,516.0,1309937000.0
land,125973.0,0.000198,0.01408607,0.0,0.0,0.0,0.0,1.0
wrong_fragment,125973.0,0.022687,0.25353,0.0,0.0,0.0,0.0,3.0
urgent,125973.0,0.000111,0.01436603,0.0,0.0,0.0,0.0,3.0
hot,125973.0,0.204409,2.149968,0.0,0.0,0.0,0.0,77.0


In [79]:
df['attack'].value_counts()

attack
0    67343
1    58630
Name: count, dtype: int64

#### Splitting features and target

In [80]:
X = df.drop(columns='attack', axis=1)
X

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,0,0,0,491,0,0,0,0,0,...,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00
1,0,1,1,0,146,0,0,0,0,0,...,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0,0,2,1,0,0,0,0,0,0,...,255,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,0,0,3,0,232,8153,0,0,0,0,...,30,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01
4,0,0,3,0,199,420,0,0,0,0,...,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,0,2,1,0,0,0,0,0,0,...,255,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00
125969,8,1,2,0,105,145,0,0,0,0,...,255,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
125970,0,0,15,0,2231,384,0,0,0,0,...,255,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00
125971,0,0,35,1,0,0,0,0,0,0,...,255,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00


In [81]:
Y = df['attack']
Y

0         0
1         0
2         1
3         0
4         0
         ..
125968    1
125969    0
125970    0
125971    1
125972    0
Name: attack, Length: 125973, dtype: int64

#### Training and Testing

In [82]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=None)

In [83]:
X.shape

(125973, 41)

In [84]:
X_train.shape

(88181, 41)

In [85]:
X_test.shape

(37792, 41)

In [86]:
Y.shape

(125973,)

In [87]:
Y_train.shape

(88181,)

In [88]:
Y_test.shape

(37792,)

#### Model Building

XGBoost is used to build the model, where the goal is binary classification

In [117]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, Y_train)

predictions = model.predict(X_test)

print("XGBoost Classification Report:")
print(classification_report(Y_test, predictions))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(Y_test, predictions))

Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20203
           1       1.00      1.00      1.00     17589

    accuracy                           1.00     37792
   macro avg       1.00      1.00      1.00     37792
weighted avg       1.00      1.00      1.00     37792

XGBoost Confusion Matrix:
[[20185    18]
 [   23 17566]]


In [118]:
# Train the model

model.fit(X_train, Y_train)

Parameters: { "use_label_encoder" } are not used.



#### Evaluation

In [119]:
# accuracy score on training data

X_train_prediction = model.predict(X_train)
training_accuracy = accuracy_score(X_train_prediction, Y_train)
print("The training accuracy is", training_accuracy)

The training accuracy is 0.9999432984429752


In [120]:
# accuracy score on testing data

X_test_prediction = model.predict(X_test)
testing_accuracy = accuracy_score(X_test_prediction, Y_test)
print("The testing accuracy is", testing_accuracy)

The testing accuracy is 0.9989151143099069


In [122]:
# Read the CSV file
input_csv = pd.read_csv('test_cases.csv')

# Check the shape of the input data
print("Shape of input data:", input_csv.shape)

# Ensure there are 41 columns in the input CSV (excluding the target column)
expected_num_features = 41
if input_csv.shape[1] != expected_num_features:
    raise ValueError(f"Expected {expected_num_features} features but got {input_csv.shape[1]}")

# Convert the DataFrame to a NumPy array
input_data = input_csv.values

# Since the data is already in the correct shape, no need to reshape
# The shape should be (n_samples, n_features)

# Make predictions using the model
predictions = model.predict(input_data)

# Print the predictions
print(predictions)


Shape of input data: (6, 41)
[0 0 0 0 0 0]


In [126]:
# Make predictions on the entire training data
train_predictions = model.predict(X_train)

# Print classification report for the entire training data
print("Training Data Classification Report:")
print(classification_report(Y_train, train_predictions))

# Print confusion matrix for the entire training data
print("Training Data Confusion Matrix:")
print(confusion_matrix(Y_train, train_predictions))


Training Data Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47140
           1       1.00      1.00      1.00     41041

    accuracy                           1.00     88181
   macro avg       1.00      1.00      1.00     88181
weighted avg       1.00      1.00      1.00     88181

Training Data Confusion Matrix:
[[47135     5]
 [    0 41041]]


In [127]:
# Make predictions on the entire training data
test_predictions = model.predict(X_test)

# Print classification report for the entire testing data
print("Testing Data Classification Report:")
print(classification_report(Y_test, test_predictions))

# Print confusion matrix for the entire testing data
print("Testing Data Confusion Matrix:")
print(confusion_matrix(Y_test, test_predictions))

Testing Data Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20203
           1       1.00      1.00      1.00     17589

    accuracy                           1.00     37792
   macro avg       1.00      1.00      1.00     37792
weighted avg       1.00      1.00      1.00     37792

Testing Data Confusion Matrix:
[[20185    18]
 [   23 17566]]


### Tuning the Model

#### Hyperparameter Selection

In [130]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, Y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': None, 'n_estimators': 200, 'subsample': 1.0}
Best Cross-Validation Score: 0.999160815155226


In [131]:
# Initialize the XGBoost model with the best hyperparameters
final_model = xgb.XGBClassifier(
    colsample_bytree=0.6,
    learning_rate=0.2,
    max_depth=None,
    n_estimators=200,
    subsample=1.0,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Train the model on the entire training dataset
final_model.fit(X_train, Y_train)


Parameters: { "use_label_encoder" } are not used.



In [133]:
import joblib

# Save the final model to a file
joblib.dump(final_model, 'final_xgboost_model.pkl')


['final_xgboost_model.pkl']

In [147]:
new_input = pd.read_csv('test_cases.csv')

new_data = new_input.values

In [148]:
# Load the saved model
loaded_model = joblib.load('final_xgboost_model.pkl')

# Predict on new data
new_predictions = loaded_model.predict(new_data)

print(new_predictions)

[0 1 0 1 0 0]


In [149]:
# Predict on the test dataset
refined_model_predictions = loaded_model.predict(X_test)

# Evaluate the model's performance on the test data
print("Refined Model Classification Report:")
print(classification_report(Y_test, refined_model_predictions))

print("Refined Model Confusion Matrix:")
print(confusion_matrix(Y_test, refined_model_predictions))

Refined Model Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20203
           1       1.00      1.00      1.00     17589

    accuracy                           1.00     37792
   macro avg       1.00      1.00      1.00     37792
weighted avg       1.00      1.00      1.00     37792

Refined Model Confusion Matrix:
[[20191    12]
 [   22 17567]]
