# Importing Dependencies

In [22]:
import pandas as pd
import polars as pl
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import glob
import os
from Pre_processing import new_features

# Loading Dataset

In [23]:
# import polars as pl
# CORRECT_HEADERS = ["pkSeqID","stime","flgs","proto","saddr","sport","daddr","dport","pkts","bytes","state","ltime","seq","dur","mean","stddev","smac","dmac","sum","min","max","soui","doui","sco","dco","spkts","dpkts","sbytes","dbytes","rate","srate","drate","attack","category","subcategory"]
# file_paths = glob.glob(os.path.join("Entire_Dataset", "*.csv"))
# lazy_frames = []
# for f in file_paths:
#     lf = pl.scan_csv(
#         f,
#         infer_schema_length=0,
#         has_header=True,
#         rechunk=False
#     )
#     lf = lf.rename({old: new for old, new in zip(lf.collect_schema().names(), CORRECT_HEADERS)})
#     lazy_frames.append(lf)

# merged_lazy = pl.concat(lazy_frames, how="vertical")
# merged_lazy.sink_csv('Final_Merged_Dataset.csv')

In [24]:
print(new_features)

['pkSeqID', 'stime', 'proto', 'daddr', 'bytes', 'ltime', 'spkts', 'dpkts', 'dbytes']


In [25]:
df = pl.scan_csv("Final_Merged_Dataset.csv").select(new_features + ["attack"])

In [26]:
print(df)

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SELECT [col("pkSeqID"), col("stime"), col("proto"), col("daddr"), col("bytes"), col("ltime"), col("spkts"), col("dpkts"), col("dbytes"), col("attack")]
  Csv SCAN [Final_Merged_Dataset.csv]
  PROJECT */35 COLUMNS
  ESTIMATED ROWS: 69940782


In [27]:
target_counts = (
    df.select(pl.col("attack").value_counts())
    .collect()
)
print(target_counts)

shape: (2, 1)
┌──────────────┐
│ attack       │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {1,73360827} │
│ {0,9542}     │
└──────────────┘


In [28]:
TARGET = "attack"
selected_features = ['stime','ltime','dpkts','dbytes']

X_lazy = df.select(selected_features)
y_lazy = df.select("attack")

In [29]:
print(X_lazy.collect().dtypes)

[Float64, Float64, Int64, Int64]


In [30]:
# scaler = joblib.load('scaler.save')
# scaler = MinMaxScaler()
scaler = StandardScaler()
X = X_lazy.limit(2_000_000).collect().to_numpy()   # sample instead of full load
y = y_lazy.limit(2_000_000).collect().to_numpy()

X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [31]:
LABEL_COL = 'attack'
BATCH_SIZE = 1024          # CSV read batch
WINDOW = 20                # time window
MODEL_BATCH = 64           # training batch size
SELECTED_COLS = ['stime','ltime','dpkts','dbytes','attack']
NUM_FEATURES = len(SELECTED_COLS) - 1


# ---------- CSV loader ----------
def get_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=BATCH_SIZE,
        label_name=LABEL_COL,
        select_columns=SELECTED_COLS,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        shuffle=False      # IMPORTANT for temporal ordering
    )
    return dataset


# ---------- Convert feature dict -> tensor ----------
def pack(features, label):
    features = tf.stack(
        [tf.cast(v, tf.float32) for v in features.values()],
        axis=1
    )
    return features, tf.cast(label, tf.float32)


# ---------- Build streaming sliding-window dataset ----------
raw_train_data = get_dataset("Final_Merged_Dataset.csv")

dataset = raw_train_data.map(pack)

dataset = dataset.unbatch()   # CRITICAL: convert to row stream

dataset = dataset.window(WINDOW, shift=1, drop_remainder=True)

dataset = dataset.flat_map(
    lambda x, y: tf.data.Dataset.zip((
        x.batch(WINDOW),
        y.batch(WINDOW)
    ))
)

dataset = dataset.map(lambda x, y: (x, y[-1]))  # label = last timestep

dataset = dataset.batch(MODEL_BATCH).prefetch(tf.data.AUTOTUNE)

# Model Training

In [32]:
model = Sequential([
    Conv1D(64, 3, activation='relu',
           input_shape=(WINDOW, NUM_FEATURES)),
    MaxPooling1D(2),
    LSTM(64),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

print(model.summary())

history = model.fit(dataset, epochs=2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Epoch 1/2


2026-02-16 00:36:21.120976: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91900


[1m1146412/1146412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3619s[0m 3ms/step - accuracy: 0.9999 - loss: 0.0011
Epoch 2/2
[1m     30/1146412[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:06:49[0m 3ms/step - accuracy: 1.0000 - loss: 6.5152e-06 

2026-02-16 01:36:38.880134: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2026-02-16 01:36:38.880157: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2026-02-16 01:36:38.880175: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3501847956169870170
2026-02-16 01:36:38.880180: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 12512990609202023198
2026-02-16 01:36:38.880184: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 5135704204151385950


[1m1146412/1146412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3622s[0m 3ms/step - accuracy: 0.9999 - loss: 0.0010


2026-02-16 02:37:00.454567: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2026-02-16 02:37:00.455542: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 3501847956169870170
2026-02-16 02:37:00.455550: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 12512990609202023198
2026-02-16 02:37:00.455569: I tensorflow/core/framework/local_rendezvous.cc:426] Local rendezvous recv item cancelled. Key hash: 5135704204151385950


In [34]:
joblib.dump(scaler, 'scaler.save')

model.save('iot_model.keras')