In [None]:
# imports
import pandas as pd  # Import pandas library for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations
import matplotlib.pyplot as plt  # Import matplotlib for plotting graphs
import seaborn as sns  # Import seaborn for advanced data visualization
import pickle  # Import pickle to save and load Python objects (like trained models)
import tensorflow as tf  # Import TensorFlow for building and training deep learning models
from tensorflow.keras.models import Sequential  # Import Sequential model class from Keras for building neural networks
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout  # Import Conv1D layer for 1D convolution (useful for sequential data like time series)
from sklearn import preprocessing  # Import preprocessing utilities from scikit-learn
from sklearn.model_selection import train_test_split  # Import function to split dataset into training and testing sets
from sklearn.preprocessing import StandardScaler  # Import preprocessing utilities from scikit-learn
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier for building a Random Forest model
from sklearn.model_selection import GridSearchCV  # Import GridSearchCV for hyperparameter tuning
from sklearn.model_selection import cross_val_score  # Import cross_val_score for model evaluation using cross-validation
from sklearn.neural_network import MLPClassifier  # Import MLPClassifier for building a Multi-Layer Perceptron model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, precision_recall_curve  # Import accuracy_score to calculate accuracy of predictions


In [None]:
# Load the raw BotNeTIoT-L01 v2 dataset. The CSV is expected in the same
# directory as this notebook; adjust the path below if it lives elsewhere.
botnet_df_v2 = pd.read_csv(filepath_or_buffer='BotNeTIoT-L01-v2.csv')

In [None]:
 

# List the column names of the raw dataset, then a blank gap and a separator
# line (each item is printed on its own line).
print('botnet_df_v2 features:', botnet_df_v2.columns, '\n', '*' * 50, sep='\n')

In [None]:
# Structural overview of the raw dataset: one titled section per summary.
print('-'*10, 'HEAD', '-'*10)
print(botnet_df_v2.head())
print('\n')

print('-'*10, 'DESCRIBE', '-'*10)
print(botnet_df_v2.describe())
print('\n')

print('-'*10, 'INFO', '-'*10)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
botnet_df_v2.info()
print('\n')

print('-'*10, 'MISSING VALUES', '-'*10)
print(botnet_df_v2.isnull().sum())
print('\n')

print('-'*10, 'DATA TYPES', '-'*10)
print(botnet_df_v2.dtypes)
print('\n')

# Number of distinct values per column
print('-'*10, 'UNIQUE VALUES', '-'*10)
print(botnet_df_v2.nunique())
print('\n')

In [None]:
# Distinct values of the two attack-describing columns.
print('-'*10, 'UNIQUE VALUES FOR ATTACK AND ATTACK_SUBTYPE', '-'*10)
for column in ('Attack', 'Attack_subType'):
    print(botnet_df_v2[column].unique())
print('\n')

# Share of rows per attack value and per device, as two stacked pie charts.
print('-'*10, 'PIE CHART FOR ATTACK AND DEVICE_NAME', '-'*10)
fig, axes = plt.subplots(2, 1, figsize=(10, 10))
for ax, column in zip(axes, ('Attack', 'Device_Name')):
    botnet_df_v2[column].value_counts().plot.pie(autopct='%1.1f%%', ax=ax)
    ax.set_title(column)
plt.show()

## Before proceeding to the correlation analysis, we need to convert the categorical features to numerical for better processing

In [None]:
# Work on a reproducible random sample of 100,000 rows to keep development fast.
# The frame loaded at the top of the notebook is reused here instead of reading
# the whole CSV from disk a second time.
# To use the full dataset instead (memory permitting), replace the sampling
# line with `df = botnet_df_v2.copy()`.
df = botnet_df_v2.sample(n=100000, random_state=1)
print(df.columns)

## Encoding categorical features

In [None]:
# Encode the categorical columns as integers. Each column gets its own fitted
# LabelEncoder, stored in `encoders`, so the integer codes can later be mapped
# back to the original names with `encoders[col].inverse_transform(...)`.
# (Refitting a single shared encoder would keep only the last column's mapping.)
encoders = {}
for col in ('Device_Name', 'Attack', 'Attack_subType'):
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` refers to the Attack_subType encoder.

print('-'*10, 'DATA TYPES', '-'*10)
print(df.dtypes)
print('\n')

# Integer codes now present in each encoded column
print('Device_Name:', df['Device_Name'].unique())
print('Attack:', df['Attack'].unique())
print('Attack_subType:', df['Attack_subType'].unique())

# Feature Correlation Analysis

In [None]:
# Count plot of each attack-describing column, split by the target label.
for column in ('Attack', 'Attack_subType'):
    sns.countplot(data=df, x=column, hue='label')
    plt.title(f'{column} vs Label')
    plt.show()

# Remove both attack-describing columns from the frame.
df = df.drop(columns=['Attack', 'Attack_subType'])

print('-'*10, 'Data Types', '-'*10)
print(df.dtypes)

# Persist the prepared frame (without the index) for the following cells.
df.to_csv('BoTNeTIoT-L01-v2-prepared.csv', index=False)

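`Attack` and `Attack_subType` are almost perfectly correlated with the label, because they effectively encode it: an attack name always means malicious traffic and `Normal` always means benign traffic. Keeping them would leak the target into the features and artificially inflate model performance without generalizing to unseen data, so both columns are dropped.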

In [None]:
# Reload the prepared dataset written by the previous cell.
df = pd.read_csv(filepath_or_buffer='BoTNeTIoT-L01-v2-prepared.csv')

def drop_highly_correlated_features(df, threshold=0.95, exclude=None):
    """Remove columns that are highly correlated with an earlier column.

    The absolute pairwise correlation matrix is computed over the numeric
    (and boolean) columns. A column is dropped when its absolute correlation
    with any column that precedes it in the frame exceeds ``threshold``, so
    the first column of every correlated group is the one that is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float, default 0.95
        Absolute correlation above which the later column of a pair is dropped.
    exclude : str or iterable of str, optional
        Column name(s) to leave out of the filter entirely (for example the
        target column): they are never dropped and never cause another column
        to be dropped. Names not present in the frame are ignored. With the
        default ``None`` every numeric column takes part.

    Returns
    -------
    tuple
        ``(filtered_df, corr_matrix, to_drop)`` where ``filtered_df`` is a new
        frame without the dropped columns, ``corr_matrix`` is the absolute
        correlation matrix of all numeric columns (excluded ones included),
        and ``to_drop`` is the list of dropped column names in column order.
    """
    if exclude is None:
        protected = []
    elif isinstance(exclude, str):
        protected = [exclude]
    else:
        protected = list(exclude)

    # Restrict to numeric/boolean columns explicitly: recent pandas versions
    # raise in corr() when a non-numeric column is present.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()

    # Keep only the strict upper triangle so every pair is inspected once and
    # a column is only ever compared against the columns before it.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_triangle = corr_matrix.where(upper_mask).drop(
        index=protected, columns=protected, errors='ignore'
    )

    to_drop = [
        column
        for column in upper_triangle.columns
        if (upper_triangle[column] > threshold).any()
    ]
    return df.drop(columns=to_drop), corr_matrix, to_drop

# Apply the correlation filter (default threshold) and report the outcome.
# NOTE(review): the target column 'label' takes part in this filter like any
# other column; confirm it is still present afterwards, since the next steps
# read df['label'].
df, corr, dropped_features = drop_highly_correlated_features(df)

print("Dropped features:", dropped_features)
print("Remaining features:", list(df.columns))

# Absolute correlations between the columns that remain.
corr_after = df.corr().abs()
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr_after, annot=True, cmap="coolwarm", ax=ax)
ax.set_title('Correlation Heatmap (After Feature Removal)')
plt.show()

# Same matrix as plain text
print(corr_after)


Saving the highest correlation pair to later visualize the data distribution

#### Feature and Target Selection

Label Feature as Target

0: Malicious behavior (attack).
Assigned when the `Attack` column holds an attack family such as mirai or gafgyt, together with a specific subtype such as ack or tcp in `Attack_subType`.

1: Normal behavior.
Assigned when both the `Attack` and `Attack_subType` columns are `Normal`.

The label feature serves as the final determination of whether the data corresponds to an attack or normal behavior. This makes it an ideal target variable.

In [None]:
# Separate the target column from the predictors.
target = df['label']
features = df.drop(columns='label')

print('-'*10, 'FEATURES AND TARGET', '-'*10)
print('Features:', list(features.columns))
print('Target:', target.name)

## Split Dataset

Setting up the dataset for modeling by splitting and scaling the data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=0,
)

# Report the size of every split.
print('-' * 10, 'DATA SPLITS', '-' * 10)
for caption, split in (
    ('Training Features Shape:', X_train),
    ('Test Features Shape:', X_test),
    ('Training Target Shape:', y_train),
    ('Test Target Shape:', y_test),
):
    print(caption, split.shape)

## Feature Scaling

Since this is a classification task, only the input features `X` are scaled. The target variable `y` is categorical, which means that scaling does not apply to it.
To ensure consistency, `X_test` is scaled as well, using the scaler fitted on `X_train`.

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Write the scaled features (with their column names restored), the unscaled
# features and the targets to CSV, all without the index.
for frame, path in (
    (pd.DataFrame(X_train_scaled, columns=X_train.columns), 'X_train_scaled.csv'),
    (pd.DataFrame(X_test_scaled, columns=X_test.columns), 'X_test_scaled.csv'),
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
):
    frame.to_csv(path, index=False)

#### Feature Importance

Identify the most relevant features to improve the model's performance

In [None]:
# Reload the saved splits from disk (train files first, then test files).
X_train_scaled, X_train, y_train = map(
    pd.read_csv, ('X_train_scaled.csv', 'X_train.csv', 'y_train.csv')
)
X_test_scaled, X_test, y_test = map(
    pd.read_csv, ('X_test_scaled.csv', 'X_test.csv', 'y_test.csv')
)

print('Data loaded')
for name, frame in (
    ('X_train_scaled', X_train_scaled),
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test_scaled', X_test_scaled),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{name} shape:', frame.shape)

# Flatten the training target into a 1-D array; y_test stays a DataFrame.
y_train = y_train.to_numpy().ravel()


### Random Forest

In [None]:
def train_random_forest(X_train, y_train, n_estimators=100, top_n_features=10):
    """Fit a random forest and return it with its most important feature names.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the column labels are used to name the importances.
    y_train : array-like
        Training target.
    n_estimators : int, default 100
        Number of trees in the forest.
    top_n_features : int, default 10
        How many of the highest-importance features to plot and return.

    Returns
    -------
    tuple
        ``(rf, important_features)``: the fitted classifier and the index of
        the ``top_n_features`` column names, ordered by decreasing importance.

    A horizontal bar chart of the selected importances is shown as a side effect.
    """
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=42,
        verbose=1,
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)

    # Importances labelled by column name, reduced to the strongest ones.
    importances = pd.Series(rf.feature_importances_, index=X_train.columns)
    top_importances = importances.nlargest(top_n_features)

    top_importances.plot(kind='barh', title='Feature Importance')
    plt.show()

    return rf, top_importances.index

def perform_grid_search(X_train, y_train, param_grid):
    """Tune a random forest with 5-fold grid search and return the best model.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    param_grid : dict
        Hyperparameter grid to explore.

    Returns
    -------
    The ``best_estimator_`` found by the search. The best parameters and the
    best cross-validation score are printed.
    """
    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    print("Best Parameters:", search.best_params_)
    print("Best Score:", search.best_score_)

    return search.best_estimator_

# Hyperparameter grid explored by the grid search (3 x 5 x 3 = 45 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
}

# Rank features with a first forest and keep the 10 most important ones.
rf, important_features = train_random_forest(X_train_scaled, y_train, top_n_features=10)

top_X_train = X_train_scaled[important_features]
top_X_test = X_test_scaled[important_features]

# Tune a second forest on the reduced feature set.
best_model = perform_grid_search(top_X_train, y_train, param_grid)

y_pred = best_model.predict(top_X_test)
y_pred_proba = best_model.predict_proba(top_X_test)[:, 1]  # probability of class 1

conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Save the tuned model; the context manager closes the file handle.
filename = 'best_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Load the model back and check that it still scores on the test set.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(top_X_test, y_test)
print(result)

# NOTE(review): precision/recall/F1 below use scikit-learn's default positive
# class (label 1). The notebook text describes label 1 as normal traffic —
# confirm that this is the class these metrics should describe.
print('*'*10 + 'Results' + '*'*10)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
print('ROC AUC Score:', roc_auc_score(y_test, y_pred_proba))

### Neural Networks

In [None]:
# Multi-layer perceptron trained on the selected top features.
# random_state fixes the weight initialisation and shuffling so that the
# reported numbers are reproducible (same seed as the random forest).
mlp = MLPClassifier(max_iter=1000, random_state=42)
mlp.fit(top_X_train, y_train)

# Cross-validation score (mean accuracy over 5 folds on the training data)
print('*' * 10 + ' Cross Validation Score ' + '*' * 10)
print(cross_val_score(mlp, top_X_train, y_train, cv=5, scoring='accuracy').mean())

# Hyperparameter grid for the MLP.
# NOTE(review): this grid is defined but not used anywhere in this cell.
param_grid_mlp = {
    'hidden_layer_sizes': [(100,), (200,), (300,)],
    'activation': ['logistic', 'tanh', 'relu'],
}

# Predict once and reuse the predictions for every metric.
mlp_pred = mlp.predict(top_X_test)
mlp_pred_proba = mlp.predict_proba(top_X_test)[:, 1]  # probability of class 1

# Results
print('*' * 10 + ' Results ' + '*' * 10)
print('Accuracy:', round(accuracy_score(y_test, mlp_pred), 3))
print('Precision:', round(precision_score(y_test, mlp_pred), 3))
print('Recall:', round(recall_score(y_test, mlp_pred), 3))
print('F1 Score:', round(f1_score(y_test, mlp_pred), 3))
print('ROC AUC Score:', round(roc_auc_score(y_test, mlp_pred_proba), 3))

# Save the trained MLP model; the context manager closes the file handle.
filename = 'mlp_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(mlp, model_file)

# Load it back and check that it still scores on the test set.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(top_X_test, y_test)
print(result)


### Convolutional Neural Networks

In [None]:
def build_cnn(input_shape):
    """Build and compile a small 1-D convolutional binary classifier.

    Architecture: two Conv1D (kernel size 3, ReLU) + MaxPooling1D (pool size 2)
    stages with 32 and 64 filters, then Flatten, a 128-unit ReLU dense layer,
    50% dropout, and a single sigmoid output unit.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first Conv1D layer.

    Returns
    -------
    The Sequential model, compiled with the Adam optimizer, binary
    cross-entropy loss and accuracy as the tracked metric.
    """
    conv_stack = [
        Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
    ]
    classifier_head = [
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential(conv_stack + classifier_head)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Add a trailing axis of size 1 so each sample has shape (n_features, 1),
# matching the input_shape given to build_cnn below.
n_features = top_X_train.shape[1]
top_X_train_cnn = top_X_train.to_numpy().reshape(len(top_X_train), n_features, 1)
top_X_test_cnn = top_X_test.to_numpy().reshape(len(top_X_test), top_X_test.shape[1], 1)

# Build the network and train it, holding out 20% of the training rows for
# validation during fitting.
cnn_model = build_cnn(input_shape=(n_features, 1))
history = cnn_model.fit(
    top_X_train_cnn,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# Persist the trained network.
cnn_model.save('cnn_model.h5')

# Results
# Run inference once and reuse the predicted probabilities for every metric
# and curve below instead of calling predict() again each time.
cnn_proba = cnn_model.predict(top_X_test_cnn)
cnn_predictions = (cnn_proba > 0.5).astype("int32")  # 0.5 decision threshold
print('Results')
print('Accuracy:', accuracy_score(y_test, cnn_predictions))
print('Precision:', precision_score(y_test, cnn_predictions))
print('Recall:', recall_score(y_test, cnn_predictions))
print('F1 Score:', f1_score(y_test, cnn_predictions))
print('ROC AUC Score:', roc_auc_score(y_test, cnn_proba))

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, cnn_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Precision-recall curve (note: `thresholds` is rebound to the PR thresholds)
precision, recall, thresholds = precision_recall_curve(y_test, cnn_proba)

plt.figure()
plt.plot(recall, precision, color='darkorange', lw=2, label='Precision-Recall curve')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()

# Confusion matrix of the thresholded CNN predictions on the test set.
conf_matrix = confusion_matrix(y_test, cnn_predictions)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Training accuracy and loss per epoch, drawn on a shared axis.
plt.figure()
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric], label=metric)
plt.xlabel('Epoch')
plt.ylabel('Loss/Accuracy')
plt.title('Loss and Accuracy')
plt.legend()
plt.show()
