In [2]:
!pip install pyarrow



In [3]:
# All Imports
import os

import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import set_random_seed

2025-04-26 19:59:59.627462: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745679599.656819   36662 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745679599.663825   36662 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-26 19:59:59.695854: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the attack log into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./cybersecurity_attacks.csv')

In [5]:
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Proxy Information,Firewall Logs,IDS/IPS Alerts,Log Source
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",150.9.97.135,Log Data,,Server
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",,Log Data,,Firewall
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",114.133.48.179,Log Data,Alert Data,Firewall
3,2023-07-02 10:38:46,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,Blocked,Medium,Fateh Kibe,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ...,Segment B,"Jaunpur, Rajasthan",,,Alert Data,Firewall
4,2023-07-16 13:11:07,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,Blocked,Low,Dhanush Chad,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...,Segment C,"Anantapur, Tripura",149.6.110.119,,Alert Data,Firewall


In [6]:
# Number of missing values in each column.
df.isna().sum()

Timestamp                     0
Source IP Address             0
Destination IP Address        0
Source Port                   0
Destination Port              0
Protocol                      0
Packet Length                 0
Packet Type                   0
Traffic Type                  0
Payload Data                  0
Malware Indicators        20000
Anomaly Scores                0
Attack Type                   0
Attack Signature              0
Action Taken                  0
Severity Level                0
User Information              0
Device Information            0
Network Segment               0
Geo-location Data             0
Proxy Information         19851
Firewall Logs             19961
IDS/IPS Alerts            20050
Log Source                    0
dtype: int64

In [7]:
# Columns fed to the models as inputs: three numeric fields (ports, packet
# length) and three categorical descriptors of the traffic.
selected_features = [
    "Source Port", "Destination Port",
    "Protocol",
    "Packet Length", "Packet Type",
    "Traffic Type",
]

# Column to predict. Kept as a one-element list, so indexing a frame with it
# yields a one-column DataFrame.
target = ["Attack Type"]

In [8]:
# Report the dtype of every modelling column: the features first, then the
# target, in a single pass.
for i in [*selected_features, *target]:
  print(f'DataType for {i} : {df[i].dtype}')

DataType for Source Port : int64
DataType for Destination Port : int64
DataType for Protocol : object
DataType for Packet Length : int64
DataType for Packet Type : object
DataType for Traffic Type : object
DataType for Attack Type : object


In [9]:
# Label-encode the categorical feature columns and the target.
#
# Columns encoded by this cell:
#   Protocol, Packet Type, Traffic Type, Attack Type
# ('Alerts/Warnings' is not label-encoded; it is binarised separately.)
#
# One LabelEncoder is fitted per column and kept in `label_encoders`, so the
# string <-> integer mapping of every column remains available afterwards,
# e.g. label_encoders['Attack Type'].inverse_transform(...). Re-fitting a
# single shared encoder would retain only the mapping of the last column.
#
# NOTE: the string columns of `df` are overwritten with their integer codes,
# so this cell is meant to run once on freshly loaded data.
label_encoders = {}
for col in ['Protocol', 'Packet Type', 'Traffic Type', 'Attack Type']:
    le = LabelEncoder()  # after the loop, `le` is the 'Attack Type' encoder
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [10]:
# Replace NaN with 0, and non-null values with 1.
# The guard keeps the cell idempotent: once the column holds the 0/1 integer
# flags it has no nulls left, so converting it a second time would turn every
# value into 1.
if not pd.api.types.is_integer_dtype(df['Alerts/Warnings']):
    df['Alerts/Warnings'] = df['Alerts/Warnings'].notnull().astype(int)

In [11]:
# First five values of the binarised alert flag.
df['Alerts/Warnings'].head(n=5)

0    0
1    0
2    1
3    1
4    1

In [12]:
# Confirm that no missing values remain in the alert flag.
df['Alerts/Warnings'].isna().sum()

0

In [13]:
# Feature matrix and target. `target` is a list, so Y is a one-column
# DataFrame rather than a Series.
X, Y = df[selected_features], df[target]

In [14]:
# Preview the feature matrix and confirm it has no missing values.
# Only the last expression of a cell is rendered, so the preview is displayed
# explicitly instead of being silently discarded.
display(X.head())
X.isnull().sum()

Source Port         0
Destination Port    0
Protocol            0
Packet Length       0
Packet Type         0
Traffic Type        0
dtype: int64

In [15]:
# Preview the target and confirm it has no missing values.
# Only the last expression of a cell is rendered, so the preview is displayed
# explicitly instead of being silently discarded.
display(Y.head())
Y.isnull().sum()

Attack Type    0
dtype: int64

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Candidate scikit-learn classifiers keyed by display name. Insertion order
# is the order in which they are later trained and reported.
models = dict([
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("SVM", SVC()),
    ("KNN", KNeighborsClassifier()),
])

In [18]:
# Train every candidate classifier and record its accuracy on the held-out
# split. `accuracies` maps model name -> test accuracy.
accuracies = {}

# The targets may be one-column DataFrames; scikit-learn expects a 1-D label
# vector and emits a DataConversionWarning otherwise, so flatten them once.
y_train_1d = np.ravel(Y_train)
y_test_1d = np.ravel(Y_test)

for name, model in models.items():
    model.fit(X_train, y_train_1d)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test_1d, y_pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

  return fit_method(estimator, *args, **kwargs)


Random Forest Accuracy: 0.3375
Decision Tree Accuracy: 0.3344


  y = column_or_1d(y, warn=True)


Logistic Regression Accuracy: 0.3376


  y = column_or_1d(y, warn=True)


SVM Accuracy: 0.3262


  return self._fit(X, y)


KNN Accuracy: 0.3351


In [19]:
# One-hot encode the integer class labels for the network's softmax output.
# The number of classes is taken from the labels present in the training split.
num_classes = np.unique(Y_train).size
Y_train_cat, Y_test_cat = (
    to_categorical(labels, num_classes=num_classes)
    for labels in (Y_train, Y_test)
)

In [20]:
# Build, train and evaluate a small feed-forward network on the same split.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling start from the same state on every run.
set_random_seed(42)

# The input width is declared with an explicit Input layer; passing
# `input_dim` to the first Dense layer triggers a Keras warning.
ann = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax'),  # one probability per class
])

ann.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train ANN
ann.fit(X_train, Y_train_cat, epochs=30, batch_size=32, verbose=0)

# Evaluate ANN and record it next to the scikit-learn results
loss, ann_acc = ann.evaluate(X_test, Y_test_cat, verbose=0)
accuracies["ANN"] = ann_acc
print(f"ANN Accuracy: {ann_acc:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-04-26 20:01:28.813674: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


ANN Accuracy: 0.3294


In [21]:
# Name of the entry with the highest test accuracy (the first one wins ties).
best_model = max(accuracies.items(), key=lambda item: item[1])[0]
print(
    "\n✅ Best Performing Model:",
    best_model,
    "with Accuracy:",
    f"{accuracies[best_model]:.4f}",
)


✅ Best Performing Model: Logistic Regression with Accuracy: 0.3376


In [24]:
# Persist the trained artefacts under app/services/.

# 1) Ensure the target directory exists
os.makedirs("app/services", exist_ok=True)

# The categorical columns of `df` were replaced by integer codes in an earlier
# cell, so fitting encoders on `df` here would only learn an int -> int mapping
# and could not encode raw values such as protocol names. Refit on the raw
# string columns instead. LabelEncoder assigns codes in sorted order of the
# unique values, so these encoders reproduce the codes used for training.
encoded_columns = ["Protocol", "Packet Type", "Traffic Type", "Attack Type"]
raw_categories = pd.read_csv('./cybersecurity_attacks.csv', usecols=encoded_columns)

le_protocol     = LabelEncoder().fit(raw_categories["Protocol"])
le_packet_type  = LabelEncoder().fit(raw_categories["Packet Type"])
le_traffic_type = LabelEncoder().fit(raw_categories["Traffic Type"])
le_attack       = LabelEncoder().fit(raw_categories["Attack Type"])

# 2) Save the Keras ANN as HDF5
ann.save("app/services/model.h5")
print("✅ Saved Keras model to app/services/model.h5")

# 3) Save the best sklearn classifier.
# `best_model` is chosen across all entries of `accuracies`, which also holds
# "ANN"; that name has no entry in `models`, so fall back to the most accurate
# scikit-learn estimator instead of raising KeyError.
if best_model in models:
    best_name = best_model
else:
    best_name = max(models, key=lambda name: accuracies.get(name, float("-inf")))
best_clf = models[best_name]            # the fitted estimator object
joblib.dump(best_clf, "app/services/best_model.joblib")
print(f"✅ Saved {best_name} to app/services/best_model.joblib")

# 4) Save the LabelEncoders so raw categorical values can be encoded (and
#    predictions decoded) outside this notebook
joblib.dump(le_protocol,    "app/services/le_protocol.joblib")
joblib.dump(le_packet_type, "app/services/le_packet_type.joblib")
joblib.dump(le_traffic_type,"app/services/le_traffic_type.joblib")
joblib.dump(le_attack,      "app/services/le_attack.joblib")
print("✅ Saved all LabelEncoders to app/services/")



✅ Saved Keras model to app/services/model.h5
✅ Saved Logistic Regression to app/services/best_model.joblib
✅ Saved all LabelEncoders to app/services/
