In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [13]:
# Attach Google Drive to the Colab runtime.
# NOTE(review): the CSV loaded in the next cell is read from /content, which is
# outside this mount point — confirm the mount is actually needed.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/drive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Path of the dataset CSV on the Colab VM.
DATA_PATH = "/content/cybersecurity_attacks.csv"

# Load the raw dataset; the preprocessing cell below modifies `data` in place.
data = pd.read_csv(DATA_PATH)


Data Preprocessing

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Preprocessing: label-encode the categorical columns, drop the unused ones and
# verify that every remaining column is numeric.
#
# Reads:   `data` (DataFrame loaded in an earlier cell).
# Writes:  `data` (encoded, with `columns_to_drop` removed) and
#          `label_encoders` (column name -> fitted LabelEncoder).
# Raises:  ValueError if any column is still non-numeric afterwards.
#
# The cell is safe to run more than once: columns that were already encoded by
# a previous run are left untouched and their encoders are kept.

# Columns removed from the frame rather than encoded.
columns_to_drop = [
    'Payload Data', 'Device Information', 'Geo-location Data',
    'Timestamp', 'User Information', 'Proxy Information'
]

# Columns converted to integer codes with LabelEncoder.
# NOTE(review): this list includes the target ('Malware Indicators') and the two
# IP-address columns; the integer codes given to IP addresses carry no numeric
# meaning — confirm that treating them as features is intended.
categorical_columns = [
    'Traffic Type', 'Malware Indicators', 'Alerts/Warnings',
    'Attack Signature', 'Severity Level', 'Network Segment',
    'Firewall Logs', 'IDS/IPS Alerts', 'Log Source',
    'Protocol', 'Packet Type', 'Attack Type', 'Action Taken',
    'Source IP Address', 'Destination IP Address'
]

# Reuse the encoders from an earlier run of this cell, if there was one.
# Once a column has been replaced by its integer codes, fitting a fresh encoder
# on it would treat the codes as strings ('10' sorts before '2'), reshuffling
# them and discarding the original label names held in `classes_`.
label_encoders = globals().get('label_encoders', {})

for column in categorical_columns:
    if column not in data.columns:
        continue

    already_encoded = (
        column in label_encoders
        and pd.api.types.is_integer_dtype(data[column])
    )
    if already_encoded:
        continue

    le = LabelEncoder()
    # Missing values become their own 'Unknown' category; casting to str lets
    # the encoder handle the column uniformly.
    data[column] = le.fit_transform(data[column].fillna('Unknown').astype(str))
    label_encoders[column] = le

# Drop irrelevant columns; errors='ignore' skips any that are already gone
# (for example on a second run of this cell).
data = data.drop(columns=columns_to_drop, errors='ignore')

# Everything left must be numeric before it is handed to a model.
non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns

if len(non_numeric_columns) > 0:
    # Print details of non-numeric columns for debugging
    print("Non-numeric columns and their unique values:", {col: data[col].unique() for col in non_numeric_columns})
    raise ValueError("Some columns are still non-numeric after processing!")

print("All columns are now numeric!")


All columns are now numeric!


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report



# Train a Random Forest to predict 'Malware Indicators' from all other columns
# of the preprocessed frame and report hold-out metrics.
#
# Reads:   `data` (fully numeric DataFrame from the preprocessing cell).
# Writes:  X, y, X_train, X_test, y_train, y_test, rf_classifier, y_pred,
#          accuracy, precision, recall, f1, roc_auc.
#
# NOTE(review): the run recorded below this cell scored ~0.50 on every metric
# with a balanced two-class test set, i.e. chance level — the current features
# do not appear to carry signal for this target.

# Define the target variable and features
X = data.drop(columns=['Malware Indicators'])  # Features
y = data['Malware Indicators']  # Target variable

# Split the data into training and testing sets (80% train, 20% test);
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the Random Forest Classifier (seeded for repeatability)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred = rf_classifier.predict(X_test)

# Calculate evaluation metrics (weighted = per-class scores averaged by support)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# ROC-AUC needs class probabilities. For a binary target the score is the
# probability of the second class in `classes_`; for more than two classes the
# full probability matrix is scored one-vs-rest instead of silently using only
# column 1.
y_proba = rf_classifier.predict_proba(X_test)
if len(rf_classifier.classes_) == 2:
    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
else:
    roc_auc = roc_auc_score(
        y_test,
        y_proba,
        multi_class='ovr',
        average='weighted',
        labels=rf_classifier.classes_,
    )

# Print the evaluation metrics
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Evaluation Metrics:
Accuracy: 0.4964
Precision: 0.4964
Recall: 0.4964
F1 Score: 0.4955
AUC-ROC: 0.5034

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.54      0.52      4000
           1       0.50      0.46      0.47      4000

    accuracy                           0.50      8000
   macro avg       0.50      0.50      0.50      8000
weighted avg       0.50      0.50      0.50      8000

