In [1]:
def find_csv_delimiter(file_path, max_lines=5):
    with open(file_path, 'r', newline='') as file:
        sample_lines = [file.readline().strip() for _ in range(max_lines)]

    delimiters = [',', ';', '\t', '|']  # Common delimiters to check

    best_delimiter = ','
    max_delimiter_count = 0

    for delimiter in delimiters:
        delimiter_count = sum(line.count(delimiter) for line in sample_lines)
        if delimiter_count > max_delimiter_count:
            best_delimiter = delimiter
            max_delimiter_count = delimiter_count

    return best_delimiter

import csv
import numpy as np
from sklearn.preprocessing import LabelEncoder

def extract_data_from_csv(file_path, delimiter=','):
    data = []  # Create a list to store the data

    # Open the CSV file for reading
    with open(file_path, mode='r', newline='') as file:
        # Create a CSV reader object with the pipe delimiter

        csv_reader = csv.reader(file, delimiter=delimiter)

        # Read the header row
        header = next(csv_reader)

        # Iterate through the rows in the CSV file
        for row in csv_reader:
            data.append(row)

    return data

data = []

import os
cpt = 0
for dirname, _, filenames in os.walk('drive/MyDrive/traffic'):

        for filename in filenames:
            if cpt < 2:
                file_path = os.path.join(dirname, filename)
                delimiter = find_csv_delimiter(file_path)
                data += extract_data_from_csv(file_path, delimiter)
                print(os.path.join(dirname, filename))
                cpt+=1
            else:
                break


# Convert your data to a NumPy array
data = np.array(data)

print(data.shape)

drive/MyDrive/traffic/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv
drive/MyDrive/traffic/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv
(1011957, 23)


In [2]:
print(data[0])

['1525879831.015811' 'CUmrqr4svHuSXJy5z7' '192.168.100.103' '51524'
 '65.127.233.163' '23' 'tcp' '-' '2.999051' '0' '0' 'S0' '-' '-' '0' 'S'
 '3' '180' '0' '0' '-' 'Malicious' 'PartOfAHorizontalPortScan']


In [3]:
columns_to_remove = [0, 1, 2, 4, 12, 13, 14, 20, 22]

# Remove columns using NumPy's array slicing
data = np.delete(data, columns_to_remove, axis=1)

# Make Malicious = 1 and Benign = 0
for row in data:
    if row[-1] == 'Benign':
        row[-1] = 0
    else:
        row[-1] = 1

columns_to_transform = [3, 4, 5, 6, 8]
# Columns to put 0 if '-'
zeros = [4, 5, 6]
# Replace '-'
for row in data:
    for column in columns_to_transform:
        if row[column] == '-' and column in zeros:
            row[column] = 0
        elif row[column] == '-' and column not in zeros:
            row[column] = 'Unkown'

columns_to_convert_to_float = [4]
# Convert columns to float
for row in data:
    for column in columns_to_convert_to_float:
        row[column] = float(row[column])

columns_to_convert_to_int = [0, 1, 5, 6, 8, 9, 10, 11]

# Convert columns to int
for row in data:
    for column in columns_to_convert_to_int:
        try:
            # Attempt to convert the value to an integer
            row[column] = int(row[column])
        except (ValueError, TypeError):
            pass

# Remove rows where first column contains ip address
rows_to_remove = []
for index, row in enumerate(data):
    # If the first column is an IP address
    if row[0].count('.') == 3:
        rows_to_remove.append(index)

# Delete the rows by index
for index in sorted(rows_to_remove, reverse=True):
    del data[index]

In [4]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# columns_to_onehot = [0, 1, 2, 3, 7, 8]
columns_to_onehot = [2, 3, 7, 8]

print('Before one-hot encoding features:')
print(data[1:10])
print(data[1].shape)

onehot_encoder = OneHotEncoder(sparse_output=True)

dataCopy = data.copy()

addedCols = 0
for column in columns_to_onehot:
    column_values = data[:, column]
    onehot_encoded = onehot_encoder.fit_transform(column_values.reshape(-1, 1)).toarray()
    dataCopy = np.delete(dataCopy, column + addedCols, axis=1)

    # Insert the new columns
    for i, encoded_column in enumerate(onehot_encoded.T):
        dataCopy = np.insert(dataCopy, column + i + addedCols, encoded_column, axis=1)

    addedCols += onehot_encoded.shape[1] - 1

data = dataCopy


print('After one-hot encoding features:')
print(data[1:10])
print(data[1].shape)

Before one-hot encoding features:
[['56305' '23' 'tcp' 'Unkown' '0.0' '0' '0' 'S0' 'S' '1' '60' '0' '0' '1']
 ['41101' '23' 'tcp' 'Unkown' '0.0' '0' '0' 'S0' 'S' '1' '60' '0' '0' '1']
 ['60905' '23' 'tcp' 'Unkown' '2.998796' '0' '0' 'S0' 'S' '3' '180' '0'
  '0' '1']
 ['44301' '23' 'tcp' 'Unkown' '0.0' '0' '0' 'S0' 'S' '1' '60' '0' '0' '1']
 ['50244' '23' 'tcp' 'Unkown' '0.0' '0' '0' 'S0' 'S' '1' '60' '0' '0' '1']
 ['34243' '49560' 'tcp' 'Unkown' '2.998804' '0' '0' 'S0' 'S' '3' '180'
  '0' '0' '0']
 ['34840' '21288' 'tcp' 'Unkown' '0.0' '0' '0' 'S0' 'S' '1' '60' '0' '0'
  '0']
 ['58525' '23' 'tcp' 'Unkown' '0.0' '0' '0' 'S0' 'S' '1' '60' '0' '0' '1']
 ['43849' '8080' 'tcp' 'Unkown' '0.0' '0' '0' 'S0' 'S' '1' '60' '0' '0'
  '1']]
(14,)
After one-hot encoding features:
[['56305' '23' '0.0' ... '0' '0' '1']
 ['41101' '23' '0.0' ... '0' '0' '1']
 ['60905' '23' '0.0' ... '0' '0' '1']
 ...
 ['34840' '21288' '0.0' ... '0' '0' '0']
 ['58525' '23' '0.0' ... '0' '0' '1']
 ['43849' '8080' '0.0' ..

In [5]:
# Check if any of the data contains strings
for row in data:
    for column in row:
        if isinstance(column, str):
            #Convert the value to a float, if possible
            try:
                column = float(column)
            except ValueError:
                pass

# Initialize an empty list to store preprocessed data
preprocessed_data = []

# Iterate through the rows in the data
for row in data:
    try:
        # Convert all columns to floats in this row
        float_row = [float(column) if column != '-' else 0.0 for column in row]
        preprocessed_data.append(float_row)
    except ValueError:
        print('Skipping row with non-convertible values:', row)

data = preprocessed_data

# Check if data contains strings
for row in data:
    for column in row:
        if isinstance(column, str):
            print('Error: String found in data: ', column)
            break

preprocessed_data = data

# Convert preprocessed_data to a normal Python list of lists
preprocessed_data = [list(row) for row in preprocessed_data]

# # Print the preprocessed data
# for row in preprocessed_data:
#     print(row)

In [6]:
import random
import numpy as np

# Separate data

# Define the split ratios for training, validation, and test datasets
train_ratio = 0.70  # 70% for training
val_ratio = 0.15   # 15% for validation
test_ratio = 0.15  # 15% for testing

train_val_indices = int((train_ratio + val_ratio) * len(preprocessed_data))

train_val_data = preprocessed_data[:train_val_indices]
test_data = preprocessed_data[train_val_indices:]

# Shuffle the data randomly
random.shuffle(train_val_data)
random.shuffle(test_data)

# Calculate the split points
total_records = len(train_val_data)
train_split = int(train_ratio * total_records)
val_split = int(val_ratio * total_records)

# Split the data into training, validation
train_data = train_val_data[:train_split]
val_data = train_val_data[train_split:]

In [7]:
print((train_val_data[0]))

[55322.0, 8080.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 60.0, 0.0, 0.0, 1.0]


In [8]:
train_labels = []
val_labels = []
test_labels = []

# Separate the labels from features vectors
train_temp = []
for row in train_data:
    newRow = []
    newRow = row[:len(row)-1]
    train_temp.append(newRow)
    train_labels.append(row[-1])

val_temp = []
for row in val_data:
    newRow = []
    newRow = row[:len(row)-1]
    val_temp.append(newRow)
    val_labels.append(row[-1])

test_temp = []
for row in test_data:
    newRow = []
    newRow = row[:len(row)-1]
    test_temp.append(newRow)
    test_labels.append(row[-1])

train_data = train_temp
val_data = val_temp
test_data = test_temp

In [9]:
# Convert your data to NumPy arrays
train_data = np.array(train_data)
train_labels = np.array(train_labels)

val_data = np.array(val_data)
val_labels = np.array(val_labels)

test_data = np.array(test_data)
test_labels = np.array(test_labels)

# Print rows and columns of the data
print(train_data.shape)
print(train_labels.shape)
print(val_data.shape)
print(val_labels.shape)

(602114, 159)
(602114,)
(258049, 159)
(258049,)


In [10]:
!git clone https://github.com/keras-team/keras-tuner

Cloning into 'keras-tuner'...
remote: Enumerating objects: 9259, done.[K
remote: Counting objects: 100% (766/766), done.[K
remote: Compressing objects: 100% (306/306), done.[K
remote: Total 9259 (delta 580), reused 550 (delta 460), pack-reused 8493[K
Receiving objects: 100% (9259/9259), 2.18 MiB | 8.48 MiB/s, done.
Resolving deltas: 100% (6600/6600), done.


In [11]:
cd keras-tuner

/content/keras-tuner


In [12]:
!pip install .

Processing /content/keras-tuner
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting kt-legacy (from keras-tuner==1.4.7)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Building wheels for collected packages: keras-tuner
  Building wheel for keras-tuner (pyproject.toml) ... [?25l[?25hdone
  Created wheel for keras-tuner: filename=keras_tuner-1.4.7-py3-none-any.whl size=183062 sha256=d2be1362b70f3887ff7074b435a8d1fedd4d350b150d946828e01ae0ee565a02
  Stored in directory: /root/.cache/pip/wheels/6a/f2/f6/4d216b7ba0b7c0374eb8c129c16da679bd15329b761cbad121
Successfully built keras-tuner
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [13]:
import kerastuner

  import kerastuner


In [14]:
import tensorflow as tf
from tensorflow import keras
from kerastuner import Tuner
from kerastuner.tuners import BayesianOptimization

# Check if train data contains strings
for row in train_data:
    for column in row:
        if isinstance(column, str):
            print('Error: String found in data: ', column)
            break

# Check if val data contains strings
for row in val_data:
    for column in row:
        if isinstance(column, str):
            print('Error: String found in data: ', column)
            break

In [15]:
import gc
gc.collect()

43

In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define your XGBoost classifier and hyperparameter search space
xgb_model = XGBClassifier()
param_space = {
    'n_estimators': [100],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
}

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    xgb_model,
    param_space,
    n_iter=5,  # Adjust the number of iterations as needed
    scoring='accuracy',  # Use the appropriate scoring metric
    n_jobs=-1,  # Use all available CPU cores for parallel processing
    cv=5,  # Number of cross-validation folds
    random_state=42,  # Set a random seed for reproducibility
    verbose=3
)

# Perform hyperparameter optimization
random_search.fit(train_data, train_labels)

# Get the best hyperparameters and the best model
best_xgb_hps = random_search.best_params_
best_xgb_model = random_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [17]:
print(best_xgb_hps)

{'subsample': 1.0, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}


In [18]:
from sklearn.metrics import classification_report, confusion_matrix
# Evaluate the Random Forest model on the validation data
sgboost_val_predictions = random_search.predict(val_data)
sgboost_val_accuracy = np.mean(sgboost_val_predictions == val_labels)
print("Validation Accuracy (Random Forest):", sgboost_val_accuracy)

# Calculate and print classification report and confusion matrix for Random Forest
sgboost_val_report = classification_report(val_labels, sgboost_val_predictions)
sgboost_val_confusion = confusion_matrix(val_labels, sgboost_val_predictions)
print("Validation Classification Report (XGBoost):")
print(sgboost_val_report)
print("Validation Confusion Matrix (XGBoost):")
print(sgboost_val_confusion)

Validation Accuracy (Random Forest): 0.9994380912152343
Validation Classification Report (XGBoost):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    119241
         1.0       1.00      1.00      1.00    138808

    accuracy                           1.00    258049
   macro avg       1.00      1.00      1.00    258049
weighted avg       1.00      1.00      1.00    258049

Validation Confusion Matrix (XGBoost):
[[119106    135]
 [    10 138798]]


In [19]:
# Evaluate the Random Forest model on the test data
sgboost_test_predictions = random_search.predict(test_data)
sgboost_test_accuracy = np.mean(sgboost_test_predictions == test_labels)
print("Test Accuracy (SGBoost):", sgboost_test_accuracy)

# Calculate and print classification report and confusion matrix for Random Forest
sgboost_test_report = classification_report(test_labels, sgboost_test_predictions)
sgboost_test_confusion = confusion_matrix(test_labels, sgboost_test_predictions)
print("Test Classification Report (XGBoost):")
print(sgboost_test_report)
print("Test Confusion Matrix (XGBoost):")
print(sgboost_test_confusion)

Test Accuracy (SGBoost): 0.9993939154380278
Test Classification Report (XGBoost):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     74101
         1.0       1.00      1.00      1.00     77693

    accuracy                           1.00    151794
   macro avg       1.00      1.00      1.00    151794
weighted avg       1.00      1.00      1.00    151794

Test Confusion Matrix (XGBoost):
[[74032    69]
 [   23 77670]]


In [20]:
from joblib import dump, load

# Save the trained model to a file
dump(best_xgb_model, 'xgboost_model.joblib')

# Load the model from the file
loaded_model = load('xgboost_model.joblib')

# Now you can use the loaded model to make predictions
predictions = loaded_model.predict(test_data)
print(predictions)

[0 1 0 ... 1 1 0]


In [21]:
best_xgb_model.save_model('/my_model.json')

In [23]:
!pip install scapy

Collecting scapy
  Downloading scapy-2.5.0.tar.gz (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25l[?25hdone
  Created wheel for scapy: filename=scapy-2.5.0-py2.py3-none-any.whl size=1444327 sha256=7ece5b22754799551e80f51f614ed72fe2890a229e9fb96ab77cfc626a264bfb
  Stored in directory: /root/.cache/pip/wheels/82/b7/03/8344d8cf6695624746311bc0d389e9d05535ca83c35f90241d
Successfully built scapy
Installing collected packages: scapy
Successfully installed scapy-2.5.0


In [24]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
print(data[0])

In [None]:
print(data[0])

In [None]:
from joblib import dump, load

perstest = data

# Save the trained model to a file
dump(best_xgb_model, 'xgboost_model.joblib')

# Load the model from the file
loaded_model = load('xgboost_model.joblib')

# Now you can use the loaded model to make predictions
predictions = loaded_model.predict(perstest)
print(predictions)