In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import os

In [6]:
# Anchor every path to the directory the notebook kernel was started in,
# then point at the dataset stored under its 'data' sub-folder.
script_dir = os.getcwd()
csv_path = os.path.join(
    script_dir,
    os.path.join('data', 'dataset.csv'),
)

In [8]:
# Load Dataset
# Read the CSV at `csv_path` into `df`. Only the read itself is guarded so an
# unrelated failure (e.g. while printing) is never mistaken for a missing file.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down rather than reporting a normal, visible error.
    raise FileNotFoundError(
        f"Dataset not found at {csv_path}. "
        "Please ensure 'dataset.csv' is inside a 'data' folder in the "
        "current working directory."
    ) from err
else:
    print(f"Dataset loaded successfully from {csv_path}. Shape: {df.shape}")

Dataset loaded successfully from c:\Users\shash\OneDrive\Downloads\GitHub\IoT-USB-HoneyNet\data\dataset.csv. Shape: (43876, 102)


In [9]:
# Prepare Data
# Split the loaded frame into the feature matrix `X` and the label vector `y`.
# The 'malware' column is the label; 'hash' (when present) is dropped from the
# features as well.
if 'malware' not in df.columns:
    # Fail loudly with an exception; exit() would ask the notebook kernel to
    # shut down instead of showing an ordinary error.
    raise ValueError("'malware' column not found in dataset.")

# Keep the original order: 'hash' first (only if it exists), then 'malware'.
columns_to_drop = [col for col in ('hash', 'malware') if col in df.columns]

# `columns=` already names the axis, so no separate `axis` argument is needed.
X = df.drop(columns=columns_to_drop)
y = df['malware']
print(f"Features shape: {X.shape}, Labels shape: {y.shape}")

Features shape: (43876, 100), Labels shape: (43876,)


In [10]:
# Train-Test Split
# Hold out 20% of the rows for testing. The split is seeded so it is
# repeatable, and stratified on `y` so both parts keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(
    f"Training set shape: {X_train.shape[0]}, Test set shape: {X_test.shape[0]}"
)
print("Data Preparation Completed.")

Training set shape: 35100, Test set shape: 8776
Data Preparation Completed.


In [11]:
# Train Model
# Build a 100-tree random forest (seeded, using all CPU cores, with balanced
# class weights) and fit it on the training split. `fit` returns the estimator
# itself, so `model` is the fitted classifier.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
).fit(X_train, y_train)
print("Model training completed.")

Model training completed.


In [12]:
# Persist the trained classifier next to the notebook (in `script_dir`).
model_filename = 'usb_honeypot_model.pkl'
model_path = os.path.join(script_dir, model_filename)
joblib.dump(value=model, filename=model_path)
print(f"Model saved successfully as {model_filename}")

Model saved successfully as usb_honeypot_model.pkl


In [13]:
# Score the fitted model on the held-out test split and report plain accuracy.
print("Evaluating model on the test set...")
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Model Accuracy on Test Set: {accuracy * 100:.2f}%")


Evaluating model on the test set...
Model Accuracy on Test Set: 98.59%
