In [None]:
!pip install pefile




In [None]:
!pip install pefile pandas scikit-learn scipy



In [None]:
import pefile
import mmap
import os
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.io import arff
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import shutil

In [None]:
def extract_pe_header(file_path):
    """Return the leading bytes of a PE file, or ``None`` if it is not a valid PE.

    The file is memory-mapped read-only and handed to ``pefile`` purely as a
    format check; the parsed object itself is not used any further.

    Args:
        file_path: Path of the executable to inspect.

    Returns:
        Up to 1024 bytes taken from the start of the file (fewer when the file
        is shorter), or ``None`` when ``pefile`` raises ``PEFormatError``.

    Raises:
        Errors from ``open``/``mmap`` (for example a missing file, or the
        ``ValueError`` raised when mapping an empty file) propagate unchanged.
    """
    try:
        with open(file_path, "rb") as f:
            # Use the mapping as a context manager so it is released on every
            # exit path instead of lingering until garbage collection.
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                # Parse only to validate the format (raises PEFormatError otherwise).
                pefile.PE(data=mm, fast_load=True)
                # Slice rather than read() so the result never depends on the
                # mapping's current file position.
                return mm[:1024]
    except pefile.PEFormatError as e:
        print(f"Error parsing file: {e}")
        return None

def save_to_arff(pe_header_data, output_file, id):
    """Append one sample row to an ARFF data file.

    The row has the form ``<id>,0,<byte0>,<byte1>,...`` where the constant ``0``
    fills the ``GR`` column and each following field is one byte value of the
    header written as a decimal integer.

    Args:
        pe_header_data: Iterable of byte values (e.g. a ``bytes`` object).
        output_file: Path of the ARFF file; it is opened in append mode.
        id: Numeric identifier written in the first column.

    Raises:
        ValueError: If ``pe_header_data`` is ``None`` or empty. The check runs
            before the file is opened so a failed call never leaves a partial
            row behind.
    """
    if not pe_header_data:
        raise ValueError("pe_header_data is empty or None; nothing to write")

    fields = [str(id), "0"]
    fields.extend(str(byte) for byte in pe_header_data)

    # Build the whole row first and write it in one call, without a trailing
    # comma, so each row carries exactly one value per declared attribute.
    with open(output_file, "a") as f:
        f.write(",".join(fields) + "\n")

In [None]:
# Directory that holds the executable files to analyse
folder_path = r"/content/drive/MyDrive/"

# Consolidated .arff file that receives one row per processed executable
output_file = "pe_header.arff"

# Create the file with its ARFF header only when it is not there yet:
# relation name, the ID and GR columns, 1024 numeric attributes named 0..1023,
# then the @data marker that precedes the rows.
if not os.path.exists(output_file):
    header_lines = [
        "@relation pe_header\n",
        "\n",
        "@attribute ID numeric\n",
        "@attribute GR numeric\n",
    ]
    header_lines += [f"@attribute {idx} numeric\n" for idx in range(1024)]
    header_lines += ["\n", "@data\n"]
    with open(output_file, "w") as f:
        f.writelines(header_lines)

# First ID handed out to a newly processed file
id_counter = 13000

In [None]:
# Load the training dataset (ARFF) into a DataFrame
data, meta = arff.loadarff('/content/drive/MyDrive/dataset.arff')
df = pd.DataFrame(data)

# Drop the identifier/metadata columns; what remains is the GR label plus the
# feature columns
df = df.drop(['ID', 'filename', 'family'], axis=1)
X = df.drop(['GR'], axis=1)
y = df['GR']

# Split the data into train and test sets (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features based on training data only; the fitted scaler is reused
# later to transform the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Create the Random Forest classifier (it is fitted in a later cell).
# Seeding it makes training reproducible across kernel restarts and matches
# the seeded forests used elsewhere in this notebook.
rf = RandomForestClassifier(random_state=42)

# Create the directory for processed files if it doesn't exist
processed_folder = r"downloads_processed"
os.makedirs(processed_folder, exist_ok=True)

print("X train shape: ",X_train.shape)
print("X train scaled shape: ",X_train_scaled.shape)
print("Y train shape: ",y_train.shape)


X train shape:  (1725, 1024)
X train scaled shape:  (1725, 1024)
Y train shape:  (1725,)


# Random Forest

In [None]:
# Fit the Random Forest on the standardized training features and their GR labels.
rf.fit(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Standardize the held-out features with the scaler fitted on the training
# split, then classify them with the trained forest
X_test_scaled = scaler.transform(X_test)
y_pred = rf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Per-class report, confusion matrix and overall accuracy, in that order
print(classification_report(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print(f'Accuracy: {accuracy}')

              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       222
         1.0       0.95      0.97      0.96       210

    accuracy                           0.96       432
   macro avg       0.96      0.96      0.96       432
weighted avg       0.96      0.96      0.96       432

Confusion Matrix:
[[211  11]
 [  6 204]]
Accuracy: 0.9606481481481481


In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Relies on `rf`, `scaler`, `X_test` and `y_test` from the cells above.

# Re-derive the test predictions: scale with the training-fitted scaler, then predict
X_test_scaled = scaler.transform(X_test)
y_pred = rf.predict(X_test_scaled)

# Scores for the positive class only (average='binary')
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, y_pred, average='binary'
)

# Show the scores with 15 decimal places
print(
    f'Precision: {precision:.15f}',
    f'Recall: {recall:.15f}',
    f'F1-score: {fscore:.15f}',
    sep='\n',
)


Precision: 0.948837209302326
Recall: 0.971428571428571
F1-score: 0.960000000000000


# RNN implementation

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Uses X_train_scaled / X_test_scaled / y_train / y_test prepared in earlier cells.

# Insert a length-1 time axis: (samples, features) -> (samples, 1, features)
X_train_reshaped = X_train_scaled[:, np.newaxis, :]

# One SimpleRNN layer feeding a single sigmoid unit for the binary label
rnn_layer = tf.keras.layers.SimpleRNN(128, input_shape=X_train_reshaped.shape[1:])
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
model = tf.keras.models.Sequential([rnn_layer, output_layer])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training split only
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32)

# Give the test features the same 3-D layout before scoring
X_test_reshaped = X_test_scaled[:, np.newaxis, :]

loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print("Test Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9490740895271301


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Turn the model's predicted scores into hard 0/1 labels (cut-off at 0.5)
y_pred_prob = model.predict(X_test_reshaped)
y_pred = (y_pred_prob > 0.5).astype(int)

# The metrics below are fed 1-D arrays for predictions and ground truth
y_pred_flat = y_pred.flatten()
y_test_flat = y_test.to_numpy()

# Headline scores, computed by applying each metric to the same pair of arrays
accuracy, precision, recall, f1 = (
    metric(y_test_flat, y_pred_flat)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class report followed by the confusion matrix
print("Classification Report:", classification_report(y_test_flat, y_pred_flat), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_flat, y_pred_flat), sep="\n")

# Scalar summary
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.94      0.95       222
         1.0       0.94      0.96      0.95       210

    accuracy                           0.95       432
   macro avg       0.95      0.95      0.95       432
weighted avg       0.95      0.95      0.95       432

Confusion Matrix:
[[209  13]
 [  9 201]]
Accuracy: 0.9490740740740741
Precision: 0.9392523364485982
Recall: 0.9571428571428572
F1-score: 0.9481132075471699


# Ensemble model

In [None]:
# Split first so the scaler statistics are computed from training rows only
# (fitting on all of X would leak test-set information into training).
X_train_2d, X_test_2d, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_2d = scaler.fit_transform(X_train_2d)
X_test_2d = scaler.transform(X_test_2d)

# Reshape the data for LSTM input: (samples, 1 time step, features)
X_train = X_train_2d.reshape((X_train_2d.shape[0], 1, X_train_2d.shape[1]))
X_test = X_test_2d.reshape((X_test_2d.shape[0], 1, X_test_2d.shape[1]))

# Build and train the LSTM model
lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# The LSTM's sigmoid output (one value per sample) is the only input feature
# given to the Random Forest below
lstm_features = lstm_model.predict(X_train)

# Train a Random Forest classifier on the LSTM outputs
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(lstm_features, y_train)

# Evaluate the ensemble model
lstm_test_features = lstm_model.predict(X_test)
y_pred_rf = rf_classifier.predict(lstm_test_features)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Ensemble Model Accuracy:", accuracy_rf)
# Report on the ensemble's own predictions; `y_pred` would be whatever an
# earlier cell left behind.
print(classification_report(y_test, y_pred_rf))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Ensemble Model Accuracy: 0.9583333333333334
              precision    recall  f1-score   support

         0.0       0.96      0.94      0.95       222
         1.0       0.94      0.96      0.95       210

    accuracy                           0.95       432
   macro avg       0.95      0.95      0.95       432
weighted avg       0.95      0.95      0.95       432



In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Predict with the Random Forest stage of the ensemble.
# NOTE(review): despite the name, y_pred_prob holds the output of predict(),
# not predict_proba(); the 0.5 cut-off then converts those values to ints.
y_pred_prob = rf_classifier.predict(lstm_test_features)
y_pred = (y_pred_prob > 0.5).astype(int)

# Ground truth as a plain NumPy array
y_test_flat = y_test.to_numpy()

# Headline scores, computed by applying each metric to the same pair of arrays
accuracy, precision, recall, f1 = (
    metric(y_test_flat, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class report followed by the confusion matrix
print("Classification Report:", classification_report(y_test_flat, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_flat, y_pred), sep="\n")

# Scalar summary
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       222
         1.0       0.94      0.97      0.96       210

    accuracy                           0.96       432
   macro avg       0.96      0.96      0.96       432
weighted avg       0.96      0.96      0.96       432

Confusion Matrix:
[[210  12]
 [  6 204]]
Accuracy: 0.9583333333333334
Precision: 0.9444444444444444
Recall: 0.9714285714285714
F1-score: 0.9577464788732395


# CNN model

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Append a trailing axis of size 1: (samples, features) -> (samples, features, 1)
X_train_reshaped = X_train_scaled[:, :, np.newaxis]
X_test_reshaped = X_test_scaled[:, :, np.newaxis]

# 1-D CNN: one convolution + pooling stage, then a small dense head ending in
# a single sigmoid unit
cnn_layers = [
    tf.keras.layers.Conv1D(32, 3, activation='relu', input_shape=X_train_reshaped.shape[1:]),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.models.Sequential(cnn_layers)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit on the training split
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32)

# Score on the held-out split
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print("Test Accuracy:", accuracy)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9513888955116272


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Turn the model's predicted scores into hard 0/1 labels (cut-off at 0.5)
y_pred_prob = model.predict(X_test_reshaped)
y_pred = (y_pred_prob > 0.5).astype(int)

# Ground truth as a plain NumPy array
y_test_flat = y_test.to_numpy()

# Headline scores, computed by applying each metric to the same pair of arrays
accuracy, precision, recall, f1 = (
    metric(y_test_flat, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class report followed by the confusion matrix
print("Classification Report:", classification_report(y_test_flat, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_flat, y_pred), sep="\n")

# Scalar summary
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.97      0.95       222
         1.0       0.97      0.93      0.95       210

    accuracy                           0.95       432
   macro avg       0.95      0.95      0.95       432
weighted avg       0.95      0.95      0.95       432

Confusion Matrix:
[[215   7]
 [ 14 196]]
Accuracy: 0.9513888888888888
Precision: 0.9655172413793104
Recall: 0.9333333333333333
F1-score: 0.9491525423728815


# Dataset 2:


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Load the dataset
dataset_path = '/content/drive/MyDrive/pe_section_headers.csv'
df = pd.read_csv(dataset_path)

# Select the four feature columns and the binary `malware` label
X = df[['size_of_data', 'virtual_address', 'entropy', 'virtual_size']]
y = df['malware']

# Split before scaling so the scaler statistics come from training rows only
# (fitting on all of X would leak test-set information into training)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features: fit on the training split, apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Reshape the data for LSTM input: (samples, 1 time step, features)
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))


Random Forest

In [None]:
# Split before scaling so the scaler statistics come from training rows only
# (fitting on all of X would leak test-set information into training)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split and apply it to both splits;
# X_train / X_test hold the scaled arrays from here on
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Build the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report and confusion matrix
print('Classification Report:')
print(classification_report(y_test, y_pred))

print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9672017553990068
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.42      0.51       354
           1       0.98      0.99      0.98      8305

    accuracy                           0.97      8659
   macro avg       0.82      0.70      0.75      8659
weighted avg       0.96      0.97      0.96      8659

Confusion Matrix:
[[ 147  207]
 [  77 8228]]


In [None]:
# Positive-class precision, recall and F1 for the predictions currently in y_pred
precision, recall, f1 = (
    score(y_test, y_pred)
    for score in (precision_score, recall_score, f1_score)
)

# Show the three scores, one per line
print(
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-score: {f1}',
    sep='\n',
)

Precision: 0.9754593953764078
Recall: 0.990728476821192
F1-score: 0.9830346475507766


In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Relies on `model` (the Random Forest fitted above) and on X_test / y_test
# from the same cell.

# X_test already holds standardized features, so it must not be passed through
# scaler.transform a second time: double scaling shifts every feature away
# from the range the forest was trained on.
X_test_scaled = X_test

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate precision, recall, F1-score, and support for the positive class
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='binary')

# Print the precision, recall, and F1-score with 15 decimal places
print(f'Precision: {precision:.15f}')
print(f'Recall: {recall:.15f}')
print(f'F1-score: {fscore:.15f}')


Precision: 0.959117681025523
Recall: 1.000000000000000
F1-score: 0.979132280122613




# RNN

In [None]:

# Select the four feature columns and the binary `malware` label
X = df[['size_of_data', 'virtual_address', 'entropy', 'virtual_size']]
y = df['malware']

# Split before scaling so the scaler statistics come from training rows only
# (fitting on all of X would leak test-set information into training)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features: fit on the training split, apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Reshape the data for Simple RNN input: (samples, 1 time step, features)
X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the RNN model
model = tf.keras.models.Sequential([
    tf.keras.layers.SimpleRNN(128, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the test split is passed as validation data, so the
# per-epoch validation figures are test-set figures
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.15113796293735504, Accuracy: 0.9593486785888672


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Turn the model's predicted scores into hard 0/1 labels (cut-off at 0.5)
y_pred_prob = model.predict(X_test_reshaped)
y_pred = (y_pred_prob > 0.5).astype(int)

# The metrics below are fed 1-D arrays for predictions and ground truth
y_pred_flat = y_pred.flatten()
y_test_flat = y_test.to_numpy()

# Headline scores, computed by applying each metric to the same pair of arrays
accuracy, precision, recall, f1 = (
    metric(y_test_flat, y_pred_flat)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class report followed by the confusion matrix
print("Classification Report:", classification_report(y_test_flat, y_pred_flat), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_flat, y_pred_flat), sep="\n")

# Scalar summary
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.01      0.02       354
           1       0.96      1.00      0.98      8305

    accuracy                           0.96      8659
   macro avg       0.81      0.51      0.50      8659
weighted avg       0.95      0.96      0.94      8659

Confusion Matrix:
[[   4  350]
 [   2 8303]]
Accuracy: 0.9593486545790507
Precision: 0.9595516006009477
Recall: 0.9997591812161348
F1-score: 0.9792428352400047


Ensemble

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf

# Load the dataset
dataset_path = '/content/drive/MyDrive/pe_section_headers.csv'
df = pd.read_csv(dataset_path)

# Select the four feature columns and the binary `malware` label
X = df[['size_of_data', 'virtual_address', 'entropy', 'virtual_size']]
y = df['malware']

# Split before scaling so the scaler statistics come from training rows only
# (fitting on all of X would leak test-set information into training)
X_train_2d, X_test_2d, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features: fit on the training split, apply to both splits
scaler = StandardScaler()
X_train_2d = scaler.fit_transform(X_train_2d)
X_test_2d = scaler.transform(X_test_2d)

# Reshape the data for LSTM input: (samples, 1 time step, features)
X_train = X_train_2d.reshape((X_train_2d.shape[0], 1, X_train_2d.shape[1]))
X_test = X_test_2d.reshape((X_test_2d.shape[0], 1, X_test_2d.shape[1]))

# Build and train the LSTM model
lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# The LSTM's sigmoid output (one value per sample) is the only input feature
# given to the Random Forest below
lstm_features = lstm_model.predict(X_train)

# Train a Random Forest classifier on the LSTM outputs
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(lstm_features, y_train)

# Evaluate the ensemble model
lstm_test_features = lstm_model.predict(X_test)
y_pred_rf = rf_classifier.predict(lstm_test_features)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Ensemble Model Accuracy:", accuracy_rf)
# Report on the ensemble's own predictions; `y_pred` would be whatever an
# earlier cell left behind.
print(classification_report(y_test, y_pred_rf))


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Predict with the Random Forest stage of the ensemble.
# NOTE(review): despite the name, y_pred_prob holds the output of predict(),
# not predict_proba(); the 0.5 cut-off then converts those values to ints.
y_pred_prob = rf_classifier.predict(lstm_test_features)
y_pred = (y_pred_prob > 0.5).astype(int)

# Ground truth as a plain NumPy array
y_test_flat = y_test.to_numpy()

# Headline scores, computed by applying each metric to the same pair of arrays
accuracy, precision, recall, f1 = (
    metric(y_test_flat, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class report followed by the confusion matrix
print("Classification Report:", classification_report(y_test_flat, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_flat, y_pred), sep="\n")

# Scalar summary
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)


CNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the dataset
dataset_path = '/content/drive/MyDrive/pe_section_headers.csv'
df = pd.read_csv(dataset_path)

# Select the four feature columns and the binary `malware` label
X = df[['size_of_data', 'virtual_address', 'entropy', 'virtual_size']]
y = df['malware']

# Split before scaling so the scaler statistics come from training rows only
# (fitting on all of X would leak test-set information into training)
X_train_2d, X_test_2d, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features: fit on the training split, apply to both splits
scaler = StandardScaler()
X_train_2d = scaler.fit_transform(X_train_2d)
X_test_2d = scaler.transform(X_test_2d)

# Reshape the data for CNN input: (samples, features, 1)
X_train = X_train_2d.reshape((X_train_2d.shape[0], X_train_2d.shape[1], 1))
X_test = X_test_2d.reshape((X_test_2d.shape[0], X_test_2d.shape[1], 1))

# Build the CNN model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the test split is passed as validation data, so the
# per-epoch validation figures are test-set figures
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Turn the model's predicted scores into hard 0/1 labels (cut-off at 0.5)
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# The metrics below are fed 1-D arrays for predictions and ground truth
y_pred_flat = y_pred.flatten()
y_test_flat = y_test.to_numpy()

# Headline scores, computed by applying each metric to the same pair of arrays
accuracy, precision, recall, f1 = (
    metric(y_test_flat, y_pred_flat)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class report followed by the confusion matrix
print("Classification Report:", classification_report(y_test_flat, y_pred_flat), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_flat, y_pred_flat), sep="\n")

# Scalar summary
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    sep="\n",
)


In [None]:
# Folder scanned for .exe files by the processing loop below; this rebinds the
# `folder_path` value assigned earlier in the notebook.
folder_path = r"/content/drive/MyDrive/MALWARE"

In [None]:
# Repeatedly scan `folder_path` and process every .exe found there. The loop
# ends once a scan finds no .exe left to process; entries that are not .exe
# files are ignored and therefore cannot keep the loop alive.

# Names of files that failed once; they are skipped on later passes so a broken
# sample is not retried (and re-appended to the ARFF) forever.
failed_files = set()

while True:
    file_list = os.listdir(folder_path)

    # Only regular files ending in .exe that have not already failed are pending
    pending_files = [
        name for name in file_list
        if name.lower().endswith((".exe",))
        and os.path.isfile(os.path.join(folder_path, name))
        and name not in failed_files
    ]

    # Nothing left to do
    if not pending_files:
        break

    for file_name in pending_files:
        file_path = os.path.join(folder_path, file_name)

        try:
            # Extract PE header data; None means the file is not a valid PE
            pe_header_data = extract_pe_header(file_path)
            if pe_header_data is None:
                raise ValueError("not a valid PE file")

            # Save the data to the consolidated .arff file with ID and GR
            save_to_arff(pe_header_data, output_file, id_counter)

            # Increment ID counter
            id_counter += 1

            # Copy the processed file to the processed folder
            processed_file_path = os.path.join(processed_folder, file_name)
            shutil.copy2(file_path, processed_file_path)
            # presumably loosens permissions so the original can be deleted — TODO confirm
            os.chmod(file_path, 0o777)

            # Remove the original file so it is not picked up again
            os.remove(file_path)

        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            failed_files.add(file_name)

    # Pause before the next scan of the folder
    time.sleep(5)



In [None]:
    # Load the rows written by the extraction step
    new_data, new_meta = arff.loadarff('pe_header.arff')
    new_df = pd.DataFrame(new_data)

    # Replace empty strings with np.nan so they can be dropped below
    new_df.replace('', np.nan, inplace=True)

    # Convert any object-typed column to numbers; unparseable values become NaN
    for column in new_df.columns:
        if new_df[column].dtype == object:
            new_df[column] = pd.to_numeric(new_df[column], errors='coerce')

    # Discard rows with missing values
    new_df.dropna(inplace=True)

    # Proceed if there are samples available in the new dataset
    if new_df.shape[0] > 0:
        # Drop the identifier and placeholder label columns
        new_df = new_df.drop(['ID', 'GR'], axis=1)

        # Perform label encoding if applicable
        if 'family' in new_df.columns:
            label_encoder = LabelEncoder()
            new_df['family'] = label_encoder.fit_transform(new_df['family'])

        # NOTE(review): `scaler` must be the one fitted together with `rf` on the
        # same feature columns. Several cells in this notebook rebind `scaler`,
        # so confirm which object is live before running this cell.
        new_df_scaled = scaler.transform(new_df)

        # Predict the "GR" column for the new dataset
        new_predictions = rf.predict(new_df_scaled)

        # Print predicted GR values
        print("Predicted GR:", new_predictions)

        # Quarantine files with predicted GR value 1
        quarantine_folder = r"quarantine"
        os.makedirs(quarantine_folder, exist_ok=True)

        quarantined_any = False

        # NOTE(review): predictions are paired with files purely by position in
        # os.listdir(), whose order is arbitrary; rows dropped above or files
        # left over from earlier runs shift the pairing. Verify before trusting
        # which file gets quarantined.
        for file_name, prediction in zip(os.listdir(processed_folder), new_predictions):
            if file_name.lower().endswith((".exe",)) and prediction == 1:
                file_path = os.path.join(processed_folder, file_name)
                quarantined_file_path = os.path.join(quarantine_folder, file_name)

                try:
                    shutil.move(file_path, quarantined_file_path)
                    print(f"File {file_name} quarantined.")
                    quarantined_any = True

                except Exception as e:
                    print(f"Error moving file {file_name} to quarantine: {e}")

        # Delete the consumed ARFF once, after the loop. Deleting it inside the
        # loop fails from the second quarantined file on and is then reported
        # as a move error even though the move succeeded.
        if quarantined_any and os.path.exists('pe_header.arff'):
            os.remove('pe_header.arff')

    else:
        print("No new samples found in the dataset.")
    time.sleep(5)

StopIteration: 

# New Section