# TON-IoT Review
________


## Importing modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

## Loading dataset

In [None]:
# Load the raw TON-IoT capture from the CSV file next to the notebook.
df = pd.read_csv("./team_11_ton-iot-raw.csv")

# Quick look at data
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
df.info()
display(df.describe())

## Data cleanup

### Drop non-required data

In [None]:
# Columns removed before the analysis, grouped by name prefix for readability.
# Dropping by name leaves the order of the remaining columns untouched.
dns_columns = [
    "dns_query", "dns_AA", "dns_RD", "dns_RA",
    "dns_rejected", "dns_qclass", "dns_qtype", "dns_rcode",
]
ssl_columns = [
    "ssl_version", "ssl_cipher", "ssl_resumed",
    "ssl_established", "ssl_subject", "ssl_issuer",
]
http_columns = [
    "http_trans_depth", "http_method", "http_uri", "http_version",
    "http_request_body_len", "http_response_body_len", "http_status_code",
    "http_user_agent", "http_orig_mime_types", "http_resp_mime_types",
]
weird_columns = ["weird_name", "weird_addl", "weird_notice"]
# Remaining individually named columns (service, type, timestamp, endpoints).
other_columns = ["service", "type", "ts", "src_ip", "dst_ip"]

df = df.drop(
    columns=dns_columns + ssl_columns + http_columns + weird_columns + other_columns
)

### Drop long connection durations

In [None]:
# Flag rows whose duration exceeds 1000 (presumably seconds — confirm against
# the dataset description). Rows with a missing duration compare False and
# are therefore kept.
is_too_long = df["duration"] > 1000
long_rows = df.loc[is_too_long].index
df = df.drop(index=long_rows)

### Optimizations

In [None]:
# Label-encode conn_state: every distinct value is replaced by an integer code
# assigned in order of first appearance. pd.factorize does this in a single
# vectorised pass instead of a Python-level list.index() call per row.
# NOTE: a missing value, if the column has any, is encoded as -1.
conn_state_codes, conn_state_values = pd.factorize(df["conn_state"])
# Kept as a list so a code can be decoded with conn_state_unique[code].
conn_state_unique = list(conn_state_values)
df["conn_state"] = conn_state_codes

In [None]:
# Label-encode proto: every distinct value is replaced by an integer code
# assigned in order of first appearance. pd.factorize does this in a single
# vectorised pass instead of a Python-level list.index() call per row.
# NOTE: a missing value, if the column has any, is encoded as -1.
proto_codes, proto_values = pd.factorize(df["proto"])
# Kept as a list so a code can be decoded with proto_unique[code].
proto_unique = list(proto_values)
df["proto"] = proto_codes

In [None]:
# Summary statistics of df at this point in the cleanup. Left as the cell's
# final expression so the notebook renders the resulting table.
df.describe()

In [None]:
# Derived feature: the sum of the four byte counters of a connection, i.e. the
# source and destination side of both the *_ip_bytes and *_bytes columns.
# Each counter must appear exactly once; adding src_ip_bytes twice would
# double-count the source side and leave dst_ip_bytes out of the total.
df['total_bytes'] = df['src_ip_bytes'] + df['dst_ip_bytes'] + df['src_bytes'] + df['dst_bytes']

In [None]:
# Replace every missing value with 0 (the alternative would be df.dropna(),
# which discards the affected rows instead).
df = df.fillna(0)

In [None]:
# Separate the target column from the candidate features.
y = df['label']
X = df.drop(columns='label')

# SMOTE is given the numeric feature columns only.
X_numeric = X.select_dtypes(include='number')

# Oversample with a fixed seed so the resampled set is reproducible.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_numeric, y)

# Class counts before and after resampling.
print("Before:", y.value_counts())
print("After:", pd.Series(y_res).value_counts())

## Rolling Average, Variance and Skew

In [None]:
# Rolling statistics over a sliding window of `window_size` consecutive rows.
window_size = 3

# The port numbers, the encoded protocol and the label are left out of the
# rolling statistics. One rolling view is built and reused for all three
# statistics; the first window_size - 1 rows of each result are NaN because
# their window is incomplete.
rolling_input = df.drop(columns=["src_port", "dst_port", "proto", "label"])
rolling_view = rolling_input.rolling(window=window_size)

mean_rolling = rolling_view.mean()
var_rolling = rolling_view.var()
skew_rolling = rolling_view.skew()


In [None]:
# Line plot of the rolling mean of the duration column.
duration_mean = mean_rolling["duration"]
plt.plot(duration_mean, label="duration")
plt.title("Rolling duration mean")
plt.ylabel("seconds (s)")
plt.legend()
plt.show()

In [None]:
# Line plot of the rolling variance of the duration column.
plt.plot(var_rolling["duration"])
plt.title("Rolling duration variance")
plt.legend(["duration"])
# Variance carries the square of the unit of the underlying quantity.
plt.ylabel("seconds squared (s²)")
plt.show()

In [None]:
# Line plot of the rolling skewness of the duration column.
# NOTE(review): the row labelled 8 is excluded from the plot; the reason is
# not visible here (presumably an outlier) — confirm. errors="ignore" keeps
# the cell working when that label is not present in the index.
plt.plot(skew_rolling.drop(index=[8], errors="ignore")["duration"])
plt.title("Rolling duration skew")
plt.legend(["duration"])
# Skewness is a unitless ratio, so the axis is not labelled in seconds.
plt.ylabel("skewness (dimensionless)")
plt.show()

## FFT magnitude

In [None]:
from scipy.fft import fft

# Magnitude of the discrete Fourier transform of total_bytes, with the rows
# taken in their DataFrame order.
spectrum = np.abs(fft(df['total_bytes'].values))
plt.plot(spectrum)
plt.title("FFT magnitude of total bytes")
plt.legend(["total bytes"])
# plt.show has to be called; a bare `plt.show` only references the function.
plt.show()

## Transformation and PCA

In [None]:
# Learn the per-feature minimum and maximum from the resampled features, then
# rescale those same features with the fitted scaler.
scaler = MinMaxScaler()
scaler.fit(X_res)
X_scaled = scaler.transform(X_res)

In [None]:
# Project the scaled features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualization: one point per resampled row, coloured by its label value.
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(
    first_component,
    second_component,
    c=y_res,
    cmap='coolwarm',
    alpha=0.5,
)
plt.title('PCA of IoT Features')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend(["Blue - normal, Red - (D)DoS"])
plt.show()


In [None]:
# Pairwise correlation between the scaled features.
# The scaled matrix is wrapped in a DataFrame that carries the feature names,
# so seaborn labels every row and column at the centre of its cell. Ticks set
# by hand at integer positions would sit on the cell edges instead, because
# the heatmap draws cell i between i and i + 1.
scaled_features = pd.DataFrame(X_scaled, columns=X_res.columns)

plt.figure(figsize=(20,20))
sns.heatmap(
    scaled_features.corr(),
    cmap='coolwarm',
    annot=False,
    xticklabels=True,  # force a label for every feature
    yticklabels=True,
)
plt.title('Feature Correlation Heatmap')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the scaled, resampled data and read its per-feature
# importances. The seed is fixed so the importances are the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_scaled, y_res)
importances = rf.feature_importances_

# One bar per feature, in the column order of X_res.
bar_positions = list(range(len(importances)))
plt.figure(figsize=(18,6))
plt.bar(bar_positions, importances)
plt.title('Feature Importance')
# Rotated, right-aligned labels keep the feature names from overlapping.
plt.xticks(bar_positions, X_res.columns, rotation=45, ha="right")
plt.show()


## Feature Engineering

### Before

In [None]:
# Summary statistics of df, which holds the unscaled, un-resampled data.
# Left as the cell's final expression so the notebook renders the table.
df.describe()

### After

In [None]:
# Summary statistics of the scaled, resampled features, with the feature
# names restored on the scaled matrix. The describe() call stays the cell's
# final expression so the notebook renders the table.
after = pd.DataFrame(X_scaled, columns=X_res.columns)
after.describe()

## Export

In [None]:
# Rebuild a named table from the scaled matrix, attach the resampled labels
# as the `label` column and write the result to CSV without the index column.
scaled_frame = pd.DataFrame(X_scaled, columns=X_numeric.columns)
df_engineered = scaled_frame.assign(label=y_res)
df_engineered.to_csv("./team_11_ton-iot-engineered.csv", index=False)