In [17]:
import pandas as pd

# Step 1: Load the dataset
csv_path = "preprocessed_test.csv"  # Replace with your dataset file
pre_Processed_data = pd.read_csv(csv_path)

# Step 2: Split the DataFrame into batches of 1000 rows
batch_size = 1000
data_batches = [pre_Processed_data.iloc[i:i + batch_size] for i in range(0, len(pre_Processed_data), batch_size)]

# Step 3: Check the number of batches created
print(f"Total number of batches: {len(data_batches)}")

# Step 4: Access the first batch (for verification)
print("\nFirst batch sample:")
data_batches[400].head()

Total number of batches: 421

First batch sample:


Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
400000,0.501512,0.887586,0.3125,7.010673e-08,1.644604e-07,0.0,1.6e-05,0.102804,0.999999,1
400001,0.501512,0.80038,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
400002,1.0,0.469281,0.3125,3.369505e-06,5.509422e-06,0.000106,6.5e-05,0.126168,0.991952,1
400003,0.301396,0.80038,0.3125,5.258005e-08,1.644604e-07,0.0,1.6e-05,0.093458,1.0,0
400004,0.501512,0.469281,0.3125,7.010673e-08,1.644604e-07,0.0,1.6e-05,0.102804,1.0,1


In [8]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Step 1: Load Dataset
csv_path = "data.csv"  # Replace with your dataset file
data = pd.read_csv(csv_path)
# Step 2: Feature Engineering
def hash_ip(ip):
    # Simple hash function for IP addresses
    return hash(ip) % (10**9)

def hash_protocol(protocol):
    # Convert protocol to hashable integer
    return hash(protocol)

# Apply transformations to relevant columns
data['src_ip_hash'] = data['IPV4_SRC_ADDR'].apply(hash_ip)
data['dst_ip_hash'] = data['IPV4_DST_ADDR'].apply(hash_ip)
data['protocol_hash'] = data['PROTOCOL'].apply(hash_protocol)

# Step 3: Select Relevant Features
selected_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS", "Label"
]
data = data[selected_features]

# Step 4: Handle Missing Values
data = data.dropna()  # Drop rows with missing values

# Step 5: Normalize Numerical Features to [0, 1]
numerical_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS"
]
scaler = MinMaxScaler(feature_range=(0, 1))
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Step 6: Encode Labels
label_encoder = LabelEncoder()
data["Label"] = label_encoder.fit_transform(data["Label"])  # Convert labels to integers


In [9]:
import joblib

# Save the scaler to a file
scaler_filename = "scaler.joblib"
joblib.dump(scaler, scaler_filename)

print(f"Scaler saved to {scaler_filename}")

Scaler saved to scaler.joblib


In [10]:
# Denormalize the numerical features
data[numerical_features] = scaler.inverse_transform(data[numerical_features])

# Display the first few rows of the denormalized data
data.head()

Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,357076471.0,287106065.0,17.0,71.0,126.0,1.0,1.0,0.0,4294966.0,0
1,357076471.0,121274367.0,6.0,217753000.0,199100.0,4521.0,4049.0,24.0,4176249.0,1
2,833396385.0,175856011.0,17.0,8508021.0,8918372.0,9086.0,9086.0,0.0,4175916.0,0
3,992955428.0,809013042.0,6.0,8442138.0,9013406.0,9086.0,9086.0,0.0,4175916.0,0
4,833396385.0,175856011.0,6.0,8374706.0,0.0,9086.0,0.0,0.0,4175916.0,0


In [11]:
data.to_csv("preprocessed_data_denorm.csv", index=False)

In [3]:
import pandas as pd



# Filter for benign data (assuming "Benign" is represented as 0 in the Label column)
benign_data = data[data["Label"] == 0]

# Display the first few rows of the benign data
benign_data.head()

#

Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,0.802349,0.614971,1.0,1.884118e-07,5.180502e-07,0.0,1.6e-05,0.0,1.0,0
2,0.97779,0.052981,1.0,0.03727922,0.03666797,0.240242,0.148683,0.0,0.972281,0
3,0.76592,0.243987,0.3125,0.03699055,0.0370587,0.240242,0.148683,0.0,0.972281,0
4,0.97779,0.052981,0.3125,0.03669508,0.0,0.240242,0.0,0.0,0.972281,0
5,0.0,0.775212,0.3125,0.01664709,0.0,0.144145,0.0,0.0,0.972285,0


In [18]:
data_batches[0].head()

Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,0.075013,0.80038,0.3125,7.010673e-08,1.644604e-07,0.0,1.6e-05,0.102804,1.0,1
1,0.501512,0.80038,0.3125,7.010673e-08,1.644604e-07,0.0,1.6e-05,0.102804,1.0,1
2,1.0,0.469281,0.3125,7.010673e-08,1.644604e-07,0.0,1.6e-05,0.102804,1.0,1
3,0.868871,0.410582,0.0,6.134339e-07,0.0,2.6e-05,0.0,0.0,0.999864,1
4,0.301396,0.80038,0.3125,7.010673e-08,1.644604e-07,0.0,1.6e-05,0.102804,1.0,1


In [4]:
benign_data.head()

Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,0.802349,0.614971,1.0,1.884118e-07,5.180502e-07,0.0,1.6e-05,0.0,1.0,0
2,0.97779,0.052981,1.0,0.03727922,0.03666797,0.240242,0.148683,0.0,0.972281,0
3,0.76592,0.243987,0.3125,0.03699055,0.0370587,0.240242,0.148683,0.0,0.972281,0
4,0.97779,0.052981,0.3125,0.03669508,0.0,0.240242,0.0,0.0,0.972281,0
5,0.0,0.775212,0.3125,0.01664709,0.0,0.144145,0.0,0.0,0.972285,0


In [6]:
import pandas as pd

# Step 2: Split the DataFrame into batches of 1000 rows
batch_size = 1000
data_batches_benign = [benign_data.iloc[i:i + batch_size] for i in range(0, len(benign_data), batch_size)]

# Step 3: Check the number of batches created
print(f"Total number of batches: {len(data_batches)}")

# Step 4: Access the first batch (for verification)
print("\nFirst batch sample:")
data_batches_benign[0].head() # Display the first 5 rows of the first batch

Total number of batches: 601

First batch sample:


Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,0.802349,0.614971,1.0,1.884118e-07,5.180502e-07,0.0,1.6e-05,0.0,1.0,0
2,0.97779,0.052981,1.0,0.03727922,0.03666797,0.240242,0.148683,0.0,0.972281,0
3,0.76592,0.243987,0.3125,0.03699055,0.0370587,0.240242,0.148683,0.0,0.972281,0
4,0.97779,0.052981,0.3125,0.03669508,0.0,0.240242,0.0,0.0,0.972281,0
5,0.0,0.775212,0.3125,0.01664709,0.0,0.144145,0.0,0.0,0.972285,0


In [14]:
benign_data.shape

(13859, 10)

In [21]:
data_batches_benign[13]

Unnamed: 0,src_ip_hash,dst_ip_hash,protocol_hash,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
524139,0.351272,0.775212,0.3125,5.258005e-08,1.644604e-07,0.000000,0.000016,0.093458,1.000000,0
524140,0.085610,0.775212,0.3125,5.258005e-08,1.644604e-07,0.000000,0.000016,0.093458,1.000000,0
524141,0.286899,0.775212,0.3125,5.258005e-08,1.644604e-07,0.000000,0.000016,0.093458,1.000000,0
524142,0.351272,0.775212,0.3125,5.258005e-08,1.644604e-07,0.000000,0.000016,0.093458,1.000000,0
524143,0.085610,0.775212,0.3125,5.258005e-08,1.644604e-07,0.000000,0.000016,0.093458,1.000000,0
...,...,...,...,...,...,...,...,...,...,...
600094,0.802349,0.775212,0.3125,1.052635e-03,0.000000e+00,0.022504,0.000000,0.000000,0.992574,0
600095,0.977790,0.052981,0.3125,1.020945e-02,0.000000e+00,0.066691,0.000000,0.000000,0.992566,0
600096,0.000000,0.775212,0.3125,4.620012e-03,0.000000e+00,0.039983,0.000000,0.000000,0.992572,0
600097,1.000000,0.775212,1.0000,2.733900e-04,0.000000e+00,0.035858,0.000000,0.000000,0.992572,0


In [9]:
len(data_batches_benign)

14

In [19]:
len(data_batches)

421

load the model first and then train on ------------data_batches_benign[0-3] then train on data_batches[45-....]

betweenn each batch save the model

In [None]:
# call all the neccessary libraries here of  ydataSynthetic 

In [None]:
synth = RegularSynthesizer.load("ctgan_initial.pkl")

synth.fit(
        data=data_batches_benign[0],
        train_arguments=train_args,
        num_cols=num_cols,
        cat_cols=cat_cols,
    )

In [None]:
synth.save("ctgan_benign_0.pkl")


save the trained modle and load it below and train again

In [None]:

synth = RegularSynthesizer.load("ctgan_benign_0.pkl")

synth.fit(
        data=data_batches_benign[1],
        train_arguments=train_args,
        num_cols=num_cols,
        cat_cols=cat_cols,
    )

In [None]:
synth.save("ctgan_benign_1.pkl")


In [None]:

synth = RegularSynthesizer.load("ctgan_benign_1.pkl")

synth.fit(
        data=data_batches[45],
        train_arguments=train_args,
        num_cols=num_cols,
        cat_cols=cat_cols,
    )

In [None]:
synth.save("ctgan_benign_1_attack_1.pkl")


In [None]:

synth = RegularSynthesizer.load("ctgan_benign_1_attack_1.pkl")

synth.fit(
        data=data_batches_benign[2],
        train_arguments=train_args,
        num_cols=num_cols,
        cat_cols=cat_cols,
    )

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the preprocessed data
csv_path = "preprocessed_test.csv"  # Replace with your dataset file
preprocessed_data = pd.read_csv(csv_path)

# Split the data into features and labels
X = preprocessed_data.drop(columns=["Label"])
y = preprocessed_data["Label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.9893351108148641
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.58      0.72       977
           1       0.99      1.00      0.99     41030

    accuracy                           0.99     42007
   macro avg       0.97      0.79      0.85     42007
weighted avg       0.99      0.99      0.99     42007



In [10]:
import joblib

# Save the trained Random Forest model to a file
model_filename = "ids.joblib"
joblib.dump(rf_classifier, model_filename)

print(f"Model saved to {model_filename}")

Model saved to ids.joblib


In [None]:
import joblib

# Save the trained Random Forest model to a file
model_filename = "random_forest_model.pkl"
joblib.dump(rf_classifier, model_filename)

print(f"Model saved to {model_filename}

In [None]:
import joblib

# Load the saved Random Forest model from the file
loaded_rf_classifier = joblib.load(model_filename)

print(f"Model loaded from {model_filename}")

In [2]:
import pandas as pd
# Load the dataset
csv_path = "data.csv"  # Replace with your dataset file
dataset = pd.read_csv(csv_path)

# Display the first few rows of the dataset
dataset.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,192.168.100.6,52670,192.168.100.1,53,17,5.212,71,126,1,1,0,4294966,0,Benign
1,192.168.100.6,49160,192.168.100.149,4444,6,0.0,217753000,199100,4521,4049,24,4176249,1,Theft
2,192.168.100.46,3456,192.168.100.5,80,17,0.0,8508021,8918372,9086,9086,0,4175916,0,Benign
3,192.168.100.3,80,192.168.100.55,8080,6,7.0,8442138,9013406,9086,9086,0,4175916,0,Benign
4,192.168.100.46,80,192.168.100.5,80,6,7.0,8374706,0,9086,0,0,4175916,0,Benign


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

csv_path = "data.csv"
data = pd.read_csv(csv_path)

def hash_ip(ip):

    return hash(ip) % (10**9)

def hash_protocol(protocol):

    return hash(protocol)


data['src_ip_hash'] = data['IPV4_SRC_ADDR'].apply(hash_ip)
data['dst_ip_hash'] = data['IPV4_DST_ADDR'].apply(hash_ip)
data['protocol_hash'] = data['PROTOCOL'].apply(hash_protocol)

selected_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS","Attack"
]
data = data[selected_features]

data = data.dropna()

numerical_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS"
]
scaler = MinMaxScaler(feature_range=(0, 1))
data[numerical_features] = scaler.fit_transform(data[numerical_features])

label_encoder = LabelEncoder()
data["Attack"] = label_encoder.fit_transform(data["Attack"])

data.to_csv("preprocessed_test_labeled.csv", index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

csv_path = "data.csv"
data = pd.read_csv(csv_path)

def hash_ip(ip):

    return hash(ip) % (10**9)

def hash_protocol(protocol):

    return hash(protocol)


data['src_ip_hash'] = data['IPV4_SRC_ADDR'].apply(hash_ip)
data['dst_ip_hash'] = data['IPV4_DST_ADDR'].apply(hash_ip)
data['protocol_hash'] = data['PROTOCOL'].apply(hash_protocol)

selected_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS","Attack"
]
data = data[selected_features]

data = data.dropna()

numerical_features = [
    "src_ip_hash", "dst_ip_hash", "protocol_hash",
    "IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS",
    "TCP_FLAGS", "FLOW_DURATION_MILLISECONDS"
]
scaler = MinMaxScaler(feature_range=(0, 1))
data[numerical_features] = scaler.fit_transform(data[numerical_features])

label_encoder = LabelEncoder()
data["Attack"] = label_encoder.fit_transform(data["Attack"])

data.to_csv("preprocessed_test_labeled.csv", index=False)

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [None]:

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)
knn_y_pred = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_y_pred)
print(f"KNN Accuracy: {knn_accuracy}")


In [None]:

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
dt_y_pred = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print(f"Decision Tree Accuracy: {dt_accuracy}")


In [None]:
import joblib
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)
lr_classifier.fit(X_train, y_train)
lr_y_pred = lr_classifier.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy}")


joblib.dump(knn_classifier, 'knn_model.pkl')
joblib.dump(dt_classifier, 'dt_model.pkl')
joblib.dump(lr_classifier, 'lr_model.pkl')