In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Machine learning/new2/simulated_ddos_attack.csv")

# Check available columns
print("Columns in dataset:", df.columns)

# Feature Engineering: Only add features that exist in the dataset
if 'duration' in df.columns:
    df['duration'] = df['duration'].replace(0, 1)  # Avoid division by zero

if 'packet_count' in df.columns and 'duration' in df.columns:
    df['packet_rate'] = df['packet_count'] / df['duration']
else:
    print("Skipping 'packet_rate' as required columns are missing.")

if 'byte_count' in df.columns and 'duration' in df.columns:
    df['byte_rate'] = df['byte_count'] / df['duration']
else:
    print("Skipping 'byte_rate' as required columns are missing.")

# Drop high-cardinality categorical columns (IPs) if they exist
for col in ['src_ip', 'dst_ip']:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# One-Hot Encode 'protocol' if it exists
if 'protocol' in df.columns:
    df = pd.get_dummies(df, columns=['protocol'])

# Encode target variable if present
if 'attack_type' in df.columns:
    label_encoder = LabelEncoder()
    df['attack_type'] = label_encoder.fit_transform(df['attack_type'])
else:
    raise KeyError("Target column 'attack_type' is missing from the dataset.")

# Splitting features and target
X = df.drop(columns=['attack_type'])
y = df['attack_type']

# Handling Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Finding the best k-value and distance metric
best_k = 1
best_accuracy = 0
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='minkowski')
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

# Train final KNN model with best k
knn = KNeighborsClassifier(n_neighbors=best_k, weights='distance', metric='minkowski')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)

# Compute confusion matrix and performance metrics
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Best k: {best_k}")
print(f"Final Accuracy: {final_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Columns in dataset: Index(['timestamp', 'src_ip', 'dst_ip', 'protocol', 'src_port', 'dst_port',
       'length', 'attack_type'],
      dtype='object')
Skipping 'packet_rate' as required columns are missing.
Skipping 'byte_rate' as required columns are missing.
Best k: 1
Final Accuracy: 61.13%
Confusion Matrix:
[[275  90  66  68   0]
 [ 82 243  91  82   0]
 [ 92  87 241  79   0]
 [ 76  69  87 267   0]
 [  0   0   0   0 498]]
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.55      0.54       499
           1       0.50      0.49      0.49       498
           2       0.50      0.48      0.49       499
           3       0.54      0.54      0.54       499
           4       1.00      1.00      1.00       498

    accuracy                           0.61      2493
   macro avg       0.61      0.61      0.61      2493
weighted avg       0.61      0.61      0.61      2493



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Machine learning/new2/simulated_ddos_attack.csv")

# Check available columns
print("Columns in dataset:", df.columns)

# Convert timestamp into useful time-based features
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['minute'] = df['timestamp'].dt.minute
df.drop(columns=['timestamp'], inplace=True)

# Convert IPs into frequency counts (instead of direct encoding)
df['src_ip_count'] = df.groupby('src_ip')['src_ip'].transform('count')
df['dst_ip_count'] = df.groupby('dst_ip')['dst_ip'].transform('count')
df.drop(columns=['src_ip', 'dst_ip'], inplace=True)

# One-Hot Encoding for categorical columns (protocol & ports)
df = pd.get_dummies(df, columns=['protocol', 'src_port', 'dst_port'])

# Encode attack_type (target variable)
label_encoder = LabelEncoder()
df['attack_type'] = label_encoder.fit_transform(df['attack_type'])

# Splitting features and target
X = df.drop(columns=['attack_type'])
y = df['attack_type']

# Handling Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Standardizing numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Random Forest Classifier
rf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=15, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Compute accuracy and performance metrics
final_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Final Accuracy: {final_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Columns in dataset: Index(['timestamp', 'src_ip', 'dst_ip', 'protocol', 'src_port', 'dst_port',
       'length', 'attack_type'],
      dtype='object')
Final Accuracy: 42.08%
Confusion Matrix:
[[ 45 339   0 115   0]
 [ 28 367   0 103   0]
 [ 44 350   0 105   0]
 [ 30 330   0 139   0]
 [  0   0   0   0 498]]
Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.09      0.14       499
           1       0.26      0.74      0.39       498
           2       0.00      0.00      0.00       499
           3       0.30      0.28      0.29       499
           4       1.00      1.00      1.00       498

    accuracy                           0.42      2493
   macro avg       0.37      0.42      0.36      2493
weighted avg       0.37      0.42      0.36      2493



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Machine learning/new2/synthetic_high_accuracy_dataset.csv")

# Process 'timestamp' if it exists
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour
    df['minute'] = df['timestamp'].dt.minute
    df.drop(columns=['timestamp'], inplace=True)

# Convert IPs into frequency counts
if 'src_ip' in df.columns and 'dst_ip' in df.columns:
    df['src_ip_count'] = df.groupby('src_ip')['src_ip'].transform('count')
    df['dst_ip_count'] = df.groupby('dst_ip')['dst_ip'].transform('count')
    df.drop(columns=['src_ip', 'dst_ip'], inplace=True)

# One-hot encode 'protocol' if present
if 'protocol' in df.columns:
    df = pd.get_dummies(df, columns=['protocol'])

# Port Feature Engineering: Mean encoding by attack type
if 'src_port' in df.columns and 'dst_port' in df.columns and 'attack_type' in df.columns:
    df['src_port_mean'] = df.groupby('attack_type')['src_port'].transform('mean')
    df['dst_port_mean'] = df.groupby('attack_type')['dst_port'].transform('mean')
    df.drop(columns=['src_port', 'dst_port'], inplace=True)

# Encode attack_type (target variable)
if 'attack_type' in df.columns:
    label_encoder = LabelEncoder()
    df['attack_type'] = label_encoder.fit_transform(df['attack_type'])
else:
    raise KeyError("❌ Error: 'attack_type' column is missing, cannot proceed with training.")

# Splitting features and target
X = df.drop(columns=['attack_type'])
y = df['attack_type']

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardizing numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train XGBoost Classifier with optimized parameters
xgb_clf = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    reg_alpha=0.5,
    objective="multi:softmax",
    eval_metric="mlogloss"
)

# Train XGBoost
xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Predictions
y_pred = xgb_clf.predict(X_test)

# Compute accuracy and performance metrics
final_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Save model
xgb_clf.save_model("/content/drive/MyDrive/Machine learning/new2/xgb_model.json")

print(f"✅ Final Accuracy: {final_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


✅ Final Accuracy: 98.10%
Confusion Matrix:
[[199   0   0   0   1]
 [  3 196   1   0   0]
 [  1   0 193   0   5]
 [  3   0   1 196   1]
 [  2   0   1   0 197]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       200
           1       1.00      0.98      0.99       200
           2       0.98      0.97      0.98       199
           3       1.00      0.98      0.99       201
           4       0.97      0.98      0.98       200

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000

