In [10]:
import nfstream
import numpy as np
import os
import pandas as pd
import pefile
import seaborn as sns
import struct
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import re

le = LabelEncoder()

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.nn import TransformerEncoder, TransformerEncoderLayer


### network data feature extraction

In [11]:
dataset_dir = '/home/mani/Desktop/main_project/data/network_data/ids-dataset/'
benign_dir = os.path.join(dataset_dir, 'benign/')
malicious_dir = os.path.join(dataset_dir, 'malicious/')
benign_files = os.listdir(benign_dir)
malicious_files = os.listdir(malicious_dir)

features = ['src_port', 'dst_port', 'protocol', 'ip_version', 'vlan_id',
	   'tunnel_id', 'bidirectional_first_seen_ms',
	   'bidirectional_last_seen_ms', 'bidirectional_duration_ms',
	   'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms',
	   'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets',
	   'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms',
	   'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes',
	   'application_name', 'application_category_name', 'user_agent',
	   'content_type']

def get_features_from_pacp_file(pcap_file):
	flow = nfstream.NFStreamer(source=pcap_file, statistical_analysis=False).to_pandas()
	# encode the application_name, application_category_name, user_agent, content_type
	flow['application_name'] = le.fit_transform(flow['application_name'])
	flow['application_category_name'] = le.fit_transform(flow['application_category_name'])
	flow['user_agent'] = le.fit_transform(flow['user_agent'])
	flow['content_type'] = le.fit_transform(flow['content_type'])
	# drop the columns which are not required
	flow = flow.drop(['id', 'expiration_id', 'src_ip', 'src_mac', 'src_oui', 'dst_ip', 'dst_mac', 'dst_oui', 'application_is_guessed', 'application_confidence', 'requested_server_name', 'client_fingerprint', 'server_fingerprint'], axis=1)
	flow = flow.drop(["vlan_id", "tunnel_id"], axis=1)
	flow = flow.fillna(0)
	flow = flow.drop_duplicates()
	# print the column name if it has string datatype
	for col in flow.columns:
		if flow[col].dtype == 'object':
			print(col)
	features = flow.values.tolist()
	return features

# Create a DataFrame for benign features
benign_features = []
for file in tqdm(benign_files):
	features = get_features_from_pacp_file(benign_dir + file)
	benign_features.extend(features)

# Create a DataFrame for malicious features
malicious_features = []
for file in tqdm(malicious_files):
	features = get_features_from_pacp_file(malicious_dir + file)
	malicious_features.extend(features)

benign_labels = [0] * len(benign_features)
malicious_labels = [1] * len(malicious_features)


  0%|          | 0/108 [00:00<?, ?it/s]

100%|██████████| 108/108 [00:26<00:00,  4.10it/s]
  df = pd.read_csv(temp_file_path)
  df = pd.read_csv(temp_file_path)
  df = pd.read_csv(temp_file_path)
  df = pd.read_csv(temp_file_path)
100%|██████████| 20/20 [00:36<00:00,  1.83s/it]


In [None]:
network_data.head()

In [None]:
for col in network_data.columns:
	print(col)

### portable excutable file feature extraction

In [7]:
def get_features_from_pe(file_path):
    # Extract header information
    PEfile = pefile.PE(file_path)
    feature_vector = [PEfile.OPTIONAL_HEADER.DATA_DIRECTORY[6].Size, PEfile.OPTIONAL_HEADER.DATA_DIRECTORY[6].VirtualAddress, PEfile.OPTIONAL_HEADER.MajorImageVersion, PEfile.OPTIONAL_HEADER.MajorOperatingSystemVersion, PEfile.OPTIONAL_HEADER.DATA_DIRECTORY[0].VirtualAddress,
                      PEfile.OPTIONAL_HEADER.DATA_DIRECTORY[0].Size, PEfile.OPTIONAL_HEADER.DATA_DIRECTORY[12].VirtualAddress, PEfile.OPTIONAL_HEADER.DATA_DIRECTORY[2].Size, PEfile.OPTIONAL_HEADER.MajorLinkerVersion, PEfile.FILE_HEADER.NumberOfSections, PEfile.OPTIONAL_HEADER.SizeOfStackReserve, PEfile.OPTIONAL_HEADER.DllCharacteristics]

    machine_type = PEfile.FILE_HEADER.Machine
    num_sections = len(PEfile.sections)
    entry_point = PEfile.OPTIONAL_HEADER.AddressOfEntryPoint
    feature_vector.extend([machine_type, num_sections,  entry_point])

    # Extract section information
    section_info = [section.SizeOfRawData for section in PEfile.sections]
    feature_vector += section_info

    # Extract ASCII and Unicode strings
    with open(file_path, 'rb') as f:
        data = f.read()
    ascii_strings = re.findall(b'[ -~]{4,}', data)
    unicode_strings = re.findall(b'[\x20-\x7E\x80-\xFE]{4,}', data)
    feature_vector.extend([len(ascii_strings), len(unicode_strings)])

    # Extract import information
    imports = {}
    for entry in PEfile.DIRECTORY_ENTRY_IMPORT:
        try:
            dll_name = entry.dll.decode('utf-8')
            imports[dll_name] = [func.name.decode(
                'utf-8') for func in entry.imports]
        except:
            dll_name = entry.dll
    feature_vector.extend(list(imports.keys()))

    return feature_vector


benign_dir = "/home/mani/Desktop/main_project/data/DikeDataset/files/benign"
malicious_dir = "/home/mani/Desktop/main_project/data/DikeDataset/files/malware"


pe_data = pd.DataFrame(columns=['Name', 'MachineType', 'NumberOfSections', 'AddressOfEntryPoint',
                  'SizeOfRawData', 'NumberOfAsciiStrings', 'NumberOfUnicodeStrings', 'Imports', 'Label'])

for file_name in tqdm(os.listdir(benign_dir)):
    file_path = os.path.join(benign_dir, file_name)
    try:
        features = get_features_from_pe(file_path)
        pe_data.loc[len(pe_data.index)] = [file_name, features[0], features[1],
                                 features[2], features[3], features[4], features[5], features[6], 0]
    except Exception as e:
        # print(f"file {file_name} caused error")
        # print(e)
        continue

for file_name in tqdm(os.listdir(malicious_dir)):
    file_path = os.path.join(malicious_dir, file_name)
    try:
        features = get_features_from_pe(file_path)
        pe_data.loc[len(pe_data.index)] = [file_name, features[0], features[1],
                                 features[2], features[3], features[4], features[5], features[6], 1]
    except Exception as e:
        # print(f"file {file_name} caused error")
        # print(e)
        continue

# label encoding
pe_data['Imports'] = le.fit_transform(pe_data['Imports'])

# save data
pe_data.to_csv("../data/csv/pe_data.csv", index=False)

100%|██████████| 1082/1082 [01:02<00:00, 17.25it/s]
100%|██████████| 10841/10841 [07:27<00:00, 24.25it/s]


### data loading

In [53]:
network_data = pd.read_csv("../data/csv/network_data.csv")
pe_data = pd.read_csv("../data/csv/pe_data.csv")

# train test split
from sklearn.model_selection import train_test_split
x_network = network_data.drop(['label'], axis=1)
y_network = network_data['label']
x_pe = pe_data.drop(['Name', 'Label'], axis=1)
y_pe = pe_data['Label']

# print shape
print(x_network.shape)
print(y_network.shape)
print(x_pe.shape)
print(y_pe.shape)

print(x_network.head())

(128293, 23)
(128293,)
(9876, 7)
(9876,)
   123  123.1  17  4  1533042976474  1533042976481  7  2  180  \
0  123    123  17  4  1533043015474  1533043015478  4  2  180   
1  123    123  17  4  1533043146474  1533043146479  5  2  180   
2  123    123  17  4  1533043143474  1533043143476  2  2  180   
3  123    123  17  4  1533043237474  1533043237481  7  2  180   
4  123    123  17  4  1533043277474  1533043277478  4  2  180   

   1533042976474.1  ...  90  1533042976481.1  1533042976481.2  0.1  1.1  90.1  \
0    1533043015474  ...  90    1533043015478    1533043015478    0    1    90   
1    1533043146474  ...  90    1533043146479    1533043146479    0    1    90   
2    1533043143474  ...  90    1533043143476    1533043143476    0    1    90   
3    1533043237474  ...  90    1533043237481    1533043237481    0    1    90   
4    1533043277474  ...  90    1533043277478    1533043277478    0    1    90   

   0.2  1.2  0.3  0.4  
0    0    1    0    0  
1    0    1    0    0  
2    0   

### model 

In [49]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
pca = PCA(n_components=7)

transformed_x_network = pca.fit_transform(x_network)

transformed_x_pe = pca.fit_transform(x_pe)

# Check the shapes
print(transformed_x_network.shape)
print(transformed_x_pe.shape)

(128293, 7)
(9876, 7)


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(benign_features + malicious_features, [0]*len(
    benign_features) + [1]*len(malicious_features), test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = pd.DataFrame(X_train), pd.DataFrame(X_test), pd.DataFrame(y_train), pd.DataFrame(y_test)
X_train.to_csv("../data/csv/network/trainset.csv", index=False)
X_test.to_csv("../data/csv/network/testset.csv", index=False)
y_train.to_csv("../data/csv/network/trainlabel.csv", index=False)
y_test.to_csv("../data/csv/network/testlabel.csv", index=False)


print(f"no.of train samples: {len(X_train)}")
print(f"no.of test samples: {len(X_test)}")

no.of train samples: 100264
no.of test samples: 25067


In [15]:
X_train = pd.read_csv("../data/csv/network/trainset.csv")
X_test = pd.read_csv("../data/csv/network/testset.csv")
y_train = pd.read_csv("../data/csv/network/trainlabel.csv")
y_test = pd.read_csv("../data/csv/network/testlabel.csv")

In [5]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
# Assuming your data has the shape (9876, 7)
# X_train is your input data, and y_train is your binary labels (0 or 1)


# Define the LSTM model
model = Sequential()

# Add the first LSTM layer with 100 units and input shape (timesteps, features)
model.add(LSTM(100, input_shape=(
    X_train.shape[1],1), return_sequences=True))
model.add(Dropout(0.2))  # Adding dropout for regularization

# Add a second LSTM layer with 50 units
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))

# Add a third LSTM layer with 25 units
model.add(LSTM(25))
model.add(Dropout(0.2))

# Add a Dense layer with 50 units
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.2))

# Add the output layer with one unit and a sigmoid activation function for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model with binary crossentropy loss and an optimizer of your choice
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001), metrics=['accuracy'])



In [90]:
print(np.any(np.isnan(X_train)))
print(np.all(np.isfinite(X_train)))

False
True


In [6]:
# split the data into train and test
# X_train_network, X_test_network, y_train_network, y_test_network = train_test_split(
#     transformed_x_network, y_network, test_size=0.2, random_state=42)

# X_train_network = X_train_network.reshape(
#     X_train_network.shape[0], X_train_network.shape[1], 1)

# Train the model
model.fit(X_train, y_train,validation_split=0.2, epochs=10, batch_size=32)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fa6605d26b0>

In [None]:
preds = model.predict(X_test_network.reshape( X_test_network.shape[0], X_test_network.shape[1], 1))

# accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test_network, preds.round()))


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming your data has the shape (9876, 7)
# X_train is your input data, and y_train is your binary labels (0 or 1)



# Reshape the input data to match the input shape of Conv1D layer
X_train = transformed_x_network.reshape(
    (transformed_x_network.shape[0], transformed_x_network.shape[1], 1))

y_train = y_network
# Define the CNN model
model = Sequential()

# Add a 1D convolutional layer with 32 filters, kernel size 3, and ReLU activation
model.add(Conv1D(32, kernel_size=3, activation='relu',
          input_shape=(X_train.shape[1], 1)))
# Add a max pooling layer
model.add(MaxPooling1D(pool_size=2))
# Add another 1D convolutional layer with 64 filters, kernel size 3, and ReLU activation
model.add(Conv1D(64, kernel_size=3, activation='relu'))
# Add another max pooling layer
model.add(MaxPooling1D(pool_size=2))
# Flatten the output for the fully connected layers
model.add(Flatten())
# Add a dense layer with 128 units and ReLU activation
model.add(Dense(128, activation='relu'))
# Add dropout for regularization
model.add(Dropout(0.5))
# Add the output layer with one unit and a sigmoid activation function for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model with binary crossentropy loss and an optimizer of your choice
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)