In [3]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load the dataset
file_path = 'cyber_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Clean columns with string representations of lists
def clean_column(column):
    return column.apply(lambda x: float(ast.literal_eval(x)[0]) if isinstance(x, str) and '[' in x else float(x))

# Apply cleaning to relevant columns
columns_to_clean = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
for col in columns_to_clean:
    data[col] = clean_column(data[col])

# Define features and target
features = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
data['anomaly'] = np.where((data['sus'] == 1) | (data['evil'] == 1), 1, 0)  # Label anomalies
data = data.dropna(subset=features)  # Ensure no missing values
X = data[features].astype(float)
y = data['anomaly']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the CNN model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification output
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Predictions and classification report
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Save the model (optional)
model.save('anomaly_detection_cnn_model.h5')


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 156ms/step - accuracy: 0.1555 - loss: 0.7277 - val_accuracy: 0.0909 - val_loss: 0.6922
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2088 - loss: 0.7094 - val_accuracy: 1.0000 - val_loss: 0.6850
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.5407 - loss: 0.6966 - val_accuracy: 1.0000 - val_loss: 0.6781
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.8770 - loss: 0.6722 - val_accuracy: 1.0000 - val_loss: 0.6725
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.9200 - loss: 0.6747 - val_accuracy: 1.0000 - val_loss: 0.6669
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.9466 - loss: 0.6670 - val_accuracy: 1.0000 - val_loss: 0.6610
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37



Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
file_path = 'cyber_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Clean columns with string representations of lists
def clean_column(column):
    return column.apply(lambda x: float(ast.literal_eval(x)[0]) if isinstance(x, str) and '[' in x else float(x))

# Apply cleaning to relevant columns
columns_to_clean = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
for col in columns_to_clean:
    data[col] = clean_column(data[col])

# Define features and target
features = ['DnsAnswerTTL', 'NumberOfAnswers', 'DnsResponseCode', 'DnsOpCode']
data['anomaly'] = np.where((data['sus'] == 1) | (data['evil'] == 1), 1, 0)  # Label anomalies
data = data.dropna(subset=features)  # Ensure no missing values
X = data[features].astype(float)
y = data['anomaly']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the Logistic Regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

# Make predictions
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# Print classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Test Accuracy: 0.92

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.00      0.00      0.00         1

    accuracy                           0.92        13
   macro avg       0.46      0.50      0.48        13
weighted avg       0.85      0.92      0.89        13


Confusion Matrix:
[[12  0]
 [ 1  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
