# TON-IoT Review
________


## Importing modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

## Loading dataset

In [None]:
# Load the cleaned TON-IoT dataset produced by the previous step
df = pd.read_csv("../Step1-Datasets-Feature-Engineering/team11_TON_IoT_unsw_edu_au_cleaned.csv")

# Quick look at data
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" to the output
df.info()
display(df.describe())

## Data cleanup

### Drop non-required data

In [None]:
# Remove the src_ip, dst_ip and type columns (empty or not required by the steps below)
unused_columns = ["src_ip", "dst_ip", "type"]
df = df.drop(unused_columns, axis="columns")

### Drop long connection durations

In [None]:
# Connections should last no more than 1000 seconds: find the row labels that
# exceed this limit and remove those rows (rows with a missing duration are kept)
too_long_labels = df.index[df["duration"] > 1000]
df = df.drop(index=too_long_labels)

### Optimizations

In [None]:
# Convert connection states to integer codes.
# pd.factorize numbers the values in order of first appearance (the same
# numbering the previous list.index() lookup produced) but does so in one
# vectorized pass instead of a Python-level search for every row.
# Missing values, if any, receive the code -1 and are not listed in the uniques.
conn_state_codes, conn_state_values = pd.factorize(df["conn_state"])
conn_state_unique = list(conn_state_values)  # code -> original state, kept for decoding
df["conn_state"] = conn_state_codes

In [None]:
# Convert protocols to integer codes.
# pd.factorize numbers the values in order of first appearance (the same
# numbering the previous list.index() lookup produced) but does so in one
# vectorized pass instead of a Python-level search for every row.
# Missing values, if any, receive the code -1 and are not listed in the uniques.
proto_codes, proto_values = pd.factorize(df["proto"])
proto_unique = list(proto_values)  # code -> original protocol, kept for decoding
df["proto"] = proto_codes

In [None]:
# Show summary statistics again; conn_state and proto now hold integer codes,
# so they appear among the described columns
df.describe()

In [None]:
# Total traffic volume per connection: payload bytes plus IP-level bytes in both
# directions. The previous version added src_ip_bytes twice and never counted
# the destination side.
# NOTE(review): assumes the cleaned CSV still contains a dst_ip_bytes column - confirm.
df['total_bytes'] = df['src_ip_bytes'] + df['dst_ip_bytes'] + df['src_bytes'] + df['dst_bytes']

In [None]:
# Replace every missing value with 0 (the alternative would be df.dropna())
df = df.fillna(0)

In [None]:
# Split the frame into the target (label) and the predictors, then keep only
# the numeric predictors as input for SMOTE
y = df['label']
X = df.drop(columns='label')
X_numeric = X.select_dtypes(include=np.number)

# Oversample with a fixed seed so the resampled set is reproducible.
# NOTE(review): resampling happens before any train/test split, so synthetic
# rows derived from future test rows can end up in the training set - confirm
# this is acceptable for the downstream evaluation.
# NOTE(review): integer-coded categories (conn_state, proto) and port numbers
# are interpolated like continuous values here - verify this is intended.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_numeric, y)

# Class counts before and after resampling
print("Before:", y.value_counts())
print("After:", pd.Series(y_res).value_counts())

## Rolling Average, Variance and Skew

In [None]:
# Rolling mean, variance and skew over a sliding window of `window_size` rows.
# Ports, protocol code and label are left out before the window is applied.
window_size = 3
# Build the windowed view once and reuse it for all three statistics instead of
# repeating the drop()/rolling() chain for each of them
rolling_view = df.drop(columns=["src_port", "dst_port", "proto", "label"]).rolling(window=window_size)
mean_rolling = rolling_view.mean()
var_rolling = rolling_view.var()
skew_rolling = rolling_view.skew()


In [None]:
# Line plot of the rolling mean of the connection duration
fig, ax = plt.subplots()
ax.plot(mean_rolling["duration"])
ax.set_title("Rolling duration mean")
ax.legend(["duration"])
ax.set_ylabel("seconds (s)")
plt.show()

In [None]:
# plot duration variance
plt.plot(var_rolling["duration"])
plt.title("Rolling duration variance")
plt.legend(["duration"])
# The variance of a quantity measured in seconds is in squared seconds
plt.ylabel("seconds squared (s²)")
plt.show()

In [None]:
# plot duration skew
# NOTE(review): the row labelled 8 is excluded from the plot - presumably an
# outlier that distorts the scale; confirm. errors="ignore" keeps the cell
# working when that row was already removed by the duration filter.
plt.plot(skew_rolling.drop(index=[8], errors="ignore")["duration"])
plt.title("Rolling duration skew")
plt.legend(["duration"])
# Skewness is a dimensionless ratio, not a duration
plt.ylabel("skewness (dimensionless)")
plt.show()

## FFT magnitude

In [None]:
from scipy.fft import fft

# Compute the FFT of the total_bytes column (taken in row order) and plot the
# magnitude of each frequency bin
plt.plot(np.abs(fft(df['total_bytes'].values)))
plt.title("FFT magnitude of total bytes")
plt.legend(["total bytes"])
# plt.show must be called; the bare attribute reference never rendered the
# figure explicitly and only echoed the function object as the cell result
plt.show()

## Transformation and PCA

In [None]:
# Use min-max scaling: learn each feature's range from the resampled data,
# then rescale that same data with the fitted scaler.
# NOTE(review): the scaler is fitted on all rows before any train/test split -
# confirm this is acceptable for the downstream evaluation.
scaler = MinMaxScaler().fit(X_res)
X_scaled = scaler.transform(X_res)

In [None]:
# pca visualization: project the scaled features onto the first two principal
# components and colour every point by its class label
pca = PCA(n_components=2, random_state=42)  # seed only matters for randomized solvers
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8,6))
points = plt.scatter(X_pca[:,0], X_pca[:,1], c=y_res, cmap='coolwarm', alpha=0.5)
plt.xlabel('PCA1')
plt.ylabel('PCA2')
# Build one legend entry per class colour; a single text label would be drawn
# next to only one marker colour.
# NOTE(review): assumes label 0 = normal (blue) and 1 = (D)DoS (red), as the
# original legend text stated - confirm against the dataset.
class_handles, _ = points.legend_elements()
plt.legend(class_handles, ["normal", "(D)DoS"])
plt.title('PCA of IoT Features')
plt.show()


In [None]:
# correlation heatmap of the scaled features
plt.figure(figsize=(20,20))
# Give the correlation matrix the real feature names so seaborn labels each
# cell at its centre; setting ticks at integer positions by hand put every
# label on a cell edge instead.
feature_corr = pd.DataFrame(X_scaled, columns=X_res.columns).corr()
sns.heatmap(feature_corr, cmap='coolwarm', annot=False, xticklabels=True, yticklabels=True)
plt.title('Feature Correlation Heatmap')
plt.show()


In [None]:
# feature importance with rf
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the SMOTE step) so the importances are reproducible
rf = RandomForestClassifier(random_state=42)
rf.fit(X_scaled, y_res)
importances = rf.feature_importances_

plt.figure(figsize=(18,6))
plt.bar(range(len(importances)), importances)
plt.title('Feature Importance')
# One tick per bar, labelled with the feature name; rotated so long names do not overlap
plt.xticks(list(range(0,len(X_res.columns))),X_res.columns, rotation=90)
plt.show()


## Feature Engineering

### Before

In [None]:
# Summary statistics of df, i.e. the data without the SMOTE resampling and
# min-max scaling that produced X_res / X_scaled
df.describe()

### After

In [None]:
# Wrap the scaled array in a frame carrying the resampled feature names, then
# show its summary statistics
after = pd.DataFrame(X_scaled, columns=X_res.columns)
after.describe()

## Export

In [None]:
# Build the engineered feature frame and split it into train and test sets.
# NOTE(review): nothing is written to disk in this cell - add the actual export
# (e.g. to_csv) here if it is not done further below.
# NOTE(review): SMOTE and the scaler were both fitted before this split, so the
# test part contains synthetic rows and shares scaling statistics with the
# training part - consider splitting first.
df_engineered = pd.DataFrame(X_scaled, columns=X_numeric.columns)

# Fixed seed makes the split reproducible; stratify keeps the class ratio the
# same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    df_engineered, y_res, test_size=0.2, random_state=42, stratify=y_res
)
