In [58]:
import scapy
from tqdm import tqdm
from scapy.all import rdpcap
from scapy.all import PcapReader, IP, IPv6, TCP, UDP, Raw, Ether
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, InputLayer
import os 
from tensorflow.keras.layers import Reshape
from tensorflow.keras.models import Model
import pandas as pd

In [59]:
dataset_path = 'dataset'

categorized_filenames = {
    'video_stream': [],
    'messaging': [],
    'file_transfer': [],
    'regular_browsing': []
}

In [60]:
for filename in os.listdir(dataset_path):
    # Check if the file is a pcap file
    if filename.endswith('.pcap'):
        # Determine the category based on keywords in the filename
        if 'netflix' in filename or 'youtube' in filename:
            categorized_filenames['video_stream'].append(filename)
        elif 'chat' in filename:
            categorized_filenames['messaging'].append(filename)
        elif 'regular' in filename:  # Assuming 'transfer' is indicative of file transfer
            categorized_filenames['regular_browsing'].append(filename)
        else:
            categorized_filenames['file_transfer'].append(filename)  # Default category

In [61]:
for category, filenames in categorized_filenames.items():
    print(f"{category}: {filenames}")


video_stream: ['nonvpn_netflix_capture1.pcap', 'vpn_youtube_capture1.pcap']
messaging: ['nonvpn_skype-chat_capture18.pcap', 'nonvpn_skype-chat_capture6.pcap', 'nonvpn_skype-chat_capture14.pcap', 'nonvpn_skype-chat_capture39.pcap', 'nonvpn_skype-chat_capture15.pcap', 'nonvpn_skype-chat_capture7.pcap', 'nonvpn_skype-chat_capture19.pcap', 'nonvpn_skype-chat_capture12.pcap', 'nonvpn_skype-chat_capture1.pcap', 'nonvpn_skype-chat_capture13.pcap', 'nonvpn_skype-chat_capture10.pcap', 'nonvpn_skype-chat_capture2.pcap', 'nonvpn_skype-chat_capture3.pcap', 'nonvpn_skype-chat_capture11.pcap', 'nonvpn_skype-chat_capture16.pcap', 'nonvpn_skype-chat_capture8.pcap', 'nonvpn_skype-chat_capture9.pcap', 'nonvpn_skype-chat_capture5.pcap', 'vpn_skype-chat_capture6.pcap']
file_transfer: ['vpn_voip_capture3.pcap', 'nonvpn_sftp_newcapture2.pcap']
regular_browsing: ['regular_browsing.pcap', 'regular_browsing1.pcap']


In [62]:
input_data = []
application_types = []

In [63]:
for filename in os.listdir(dataset_path):
    if filename.endswith('.pcap'):
        if 'netflix' in filename or 'youtube' in filename:
            categorized_filenames['video_stream'].append(filename)
        elif 'chat' in filename:
            categorized_filenames['messaging'].append(filename)
        elif 'regular' in filename:
            categorized_filenames['regular_browsing'].append(filename)    
        else:
            categorized_filenames['file_transfer'].append(filename)

In [64]:
def extract_data(application_type, file_name, data_size):
    packets = rdpcap(f"dataset/{file_name}")
    for packet in packets[:data_size]:
        try:
            if Raw in packet:
                payload_bytes = packet[Raw].load
                decimal_vectors = payload_to_decimal(payload_bytes)
            else:
                decimal_vectors = []
            input_data.append(decimal_vectors)
            application_types.append(application_type)
        except Exception as e:
            print(e)

In [65]:
def payload_to_decimal(payload_bytes):
    return [int(b) for b in payload_bytes]

In [67]:
data_size = 300
for category, files in categorized_filenames.items():
    for file_name in tqdm(files):
        extract_data(category, file_name, data_size)

100%|███████████████████████████████████████████████████████| 4/4 [01:21<00:00, 20.48s/it]
100%|█████████████████████████████████████████████████████| 38/38 [00:25<00:00,  1.48it/s]
100%|███████████████████████████████████████████████████████| 4/4 [00:28<00:00,  7.16s/it]
100%|███████████████████████████████████████████████████████| 4/4 [00:33<00:00,  8.43s/it]


In [80]:
#max_length = max(len(vector) for vector in input_data)
max_length = 20
input_data = [seq[:20] if len(seq) > 20 else seq for seq in input_data]
padded_vectors = pad_sequences(input_data, maxlen=20, padding='post')
encoder = OneHotEncoder()
labels_encoded = encoder.fit_transform(np.array(application_types).reshape(-1, 1)).toarray()

X_train, X_test, y_train, y_test = train_test_split(padded_vectors, labels_encoded, test_size=0.2, random_state=42)

In [81]:
model = Sequential()
model.add(Reshape((max_length, 1), input_shape=(max_length,)))  # Add this line if reshaping is necessary
model.add(LSTM(20))
model.add(Dense(20, activation='relu'))
model.add(Dense(len(encoder.categories_[0]), activation='softmax'))

In [82]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [83]:
model.fit(X_train, y_train, epochs=1, batch_size=64, validation_split=0.1)



<keras.src.callbacks.History at 0x2e0daa6a0>

In [84]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test set accuracy: {accuracy*100:.2f}%')

Test set accuracy: 89.35%


In [85]:
intermediate_layer_model = Model(inputs=model.input, outputs=model.layers[2].output)

In [86]:
max_length = 20

single_input = pad_sequences(input_data[:1], maxlen=max_length, padding='post')

In [87]:
output_from_last_hidden_layer = intermediate_layer_model.predict(single_input)



In [88]:
print(len(output_from_last_hidden_layer[0]))

20


In [92]:
model.save("archive/neural_net_payload.h5")

  saving_api.save_model(


In [89]:
input_output_pairs = []


In [55]:
for sequence in input_data:
    padded_sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
    output = intermediate_layer_model.predict(padded_sequence)
    output_flattened = output.flatten()
    input_output_pairs.append([sequence, output_flattened])

df = pd.DataFrame(input_output_pairs, columns=['Input', 'Output'])

