In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv("cybersecurity_attacks.csv")


In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix  # Import sparse matrix module

# Example dataset for reference (use your actual dataset)
# data = pd.read_csv("your_dataset.csv")

# Define columns
columns_to_drop = [
    'Payload Data', 'Device Information', 'Geo-location Data',
    'Timestamp', 'User Information', 'Proxy Information'
]

categorical_columns = [
    'Traffic Type', 'Malware Indicators', 'Alerts/Warnings',
    'Attack Signature', 'Severity Level', 'Network Segment',
    'Firewall Logs', 'IDS/IPS Alerts', 'Log Source',
    'Protocol', 'Packet Type', 'Attack Type', 'Action Taken',
    'Source IP Address', 'Destination IP Address'
]

# Drop irrelevant columns
data = data.drop(columns=columns_to_drop, errors='ignore')

# Detect numeric and categorical columns
numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = [col for col in categorical_columns if col in data.columns]

# Handle missing values and preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        # Scale numeric features
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_columns),
        
        # Encode categorical features (keep it sparse)
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))  # Use sparse output
        ]), categorical_columns)
    ]
)

# Fit and transform data
processed_data = preprocessor.fit_transform(data)

# Debug: Check shapes
print(f"Processed data shape: {processed_data.shape}")

# Extract column names
numeric_feature_names = numeric_columns
categorical_feature_names = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_columns)

# Combine all feature names
all_feature_names = numeric_feature_names + list(categorical_feature_names)

# Debug: Compare shapes
print(f"Total features expected: {len(all_feature_names)}")

# Check alignment
assert processed_data.shape[1] == len(all_feature_names), "Feature name count does not match processed data columns!"

# Convert processed data to DataFrame
processed_data_df = pd.DataFrame.sparse.from_spmatrix(processed_data, columns=all_feature_names)

# Debug: Print a sample of the DataFrame
print(processed_data_df.head())

# Final checks
if processed_data_df.isnull().any().any():
    raise ValueError("There are still missing values after preprocessing!")

print("Data is fully preprocessed and ready for modeling!")


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Columns to drop
columns_to_drop = [
    'Payload Data', 'Device Information', 'Geo-location Data',
    'Timestamp', 'User Information', 'Proxy Information'
]

# Encode remaining categorical columns
categorical_columns = [
    'Traffic Type', 'Malware Indicators', 'Alerts/Warnings',
    'Attack Signature', 'Severity Level', 'Network Segment',
    'Firewall Logs', 'IDS/IPS Alerts', 'Log Source',
    'Protocol', 'Packet Type', 'Attack Type', 'Action Taken',
    'Source IP Address', 'Destination IP Address'
]

label_encoders = {}

# Iterate through columns to encode
for column in categorical_columns:
    if column in data.columns:
        le = LabelEncoder()
        # Fill NaN with a placeholder and encode as strings
        data[column] = data[column].fillna('Unknown').astype(str)
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

# Drop irrelevant columns
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')

# Check for non-numeric columns
non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns

if len(non_numeric_columns) > 0:
    # Print details of non-numeric columns for debugging
    print("Non-numeric columns and their unique values:", {col: data[col].unique() for col in non_numeric_columns})
    raise ValueError("Some columns are still non-numeric after processing!")

print("All columns are now numeric!")

ValueError: fill_value='Unknown' (of type <class 'str'>) cannot be cast to the input data that is dtype('int32'). Make sure that both dtypes are of the same kind.

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Columns to drop
columns_to_drop = [
    'Payload Data', 'Device Information', 'Geo-location Data',
    'Timestamp', 'User Information', 'Proxy Information'
]

# Encode remaining categorical columns
categorical_columns = [
    'Traffic Type', 'Malware Indicators', 'Alerts/Warnings',
    'Attack Signature', 'Severity Level', 'Network Segment',
    'Firewall Logs', 'IDS/IPS Alerts', 'Log Source',
    'Protocol', 'Packet Type', 'Attack Type', 'Action Taken',
    'Source IP Address', 'Destination IP Address'
]

label_encoders = {}

# Iterate through columns to encode
for column in categorical_columns:
    if column in data.columns:
        le = LabelEncoder()
        # Fill NaN with a placeholder and encode as strings
        data[column] = data[column].fillna('Unknown').astype(str)
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

# Drop irrelevant columns
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')

# Check for non-numeric columns
non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns

if len(non_numeric_columns) > 0:
    # Print details of non-numeric columns for debugging
    print("Non-numeric columns and their unique values:", {col: data[col].unique() for col in non_numeric_columns})
    raise ValueError("Some columns are still non-numeric after processing!")

print("All columns are now numeric!")


All columns are now numeric!


In [13]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import BatchNormalization


# Step 1: Separate features and target
X = data.drop(columns=['Malware Indicators'])
y = data['Malware Indicators']

# Ensure `X` and `y` have the same number of samples
if X.shape[0] != y.shape[0]:
    raise ValueError(f"Initial mismatch in number of samples: X={X.shape[0]}, y={y.shape[0]}")

# Step 2: Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Reshape `X` for CNN input
num_features = X.shape[1]
reshaped_dim1 = int(np.ceil(np.sqrt(num_features)))
reshaped_dim2 = int(np.ceil(num_features / reshaped_dim1))

# Padding to match reshaped dimensions
if reshaped_dim1 * reshaped_dim2 > num_features:
    padding = np.zeros((X.shape[0], reshaped_dim1 * reshaped_dim2 - num_features))
    X = np.hstack((X, padding))

X = X.reshape(-1, reshaped_dim1, reshaped_dim2, 1)

# Ensure target size matches reshaped feature size
y = y.iloc[:X.shape[0]].reset_index(drop=True)

# Final shape check
if X.shape[0] != y.shape[0]:
    raise ValueError(f"Mismatch after reshaping: X={X.shape[0]}, y={y.shape[0]}")

# Step 4: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import BatchNormalization

# Updated CNN Model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(reshaped_dim1, reshaped_dim2, 1)),
    BatchNormalization(),
    MaxPooling2D((2, 2), padding='same'),
    Dropout(0.3),

    Conv2D(64, (3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2), padding='same'),
    Dropout(0.4),

    Conv2D(128, (3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2), padding='same'),
    Dropout(0.5),

    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks for Training
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1),
    EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
]

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,  # Increased epochs for better learning
    batch_size=64,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5071 - loss: 0.8283 - val_accuracy: 0.5028 - val_loss: 0.6940 - learning_rate: 0.0010
Epoch 2/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5070 - loss: 0.7181 - val_accuracy: 0.4909 - val_loss: 0.6941 - learning_rate: 0.0010
Epoch 3/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5085 - loss: 0.6988 - val_accuracy: 0.5003 - val_loss: 0.6939 - learning_rate: 0.0010
Epoch 4/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5000 - loss: 0.6973 - val_accuracy: 0.5113 - val_loss: 0.6929 - learning_rate: 0.0010
Epoch 5/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5058 - loss: 0.6955 - val_accuracy: 0.4880 - val_loss: 0.6940 - learning_rate: 0.0010
Epoch 6/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m