# TON-IoT Review
________


## Importing modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

## Loading dataset

In [None]:
# Load the cleaned TON_IoT CSV from the Step1 folder.
df = pd.read_csv("../Step1-Datasets-Feature-Engineering/team11_TON_IoT_unsw_edu_au_cleaned.csv")

# A Git LFS pointer is a small text stub, not the real CSV: it parses into a
# single column whose header mentions github.com. Fail early with instructions.
if len(df.columns) == 1 and "github.com" in df.columns[0]:
    raise ValueError(
        "You have downloaded an LFS pointer, and not the actual file.\n"
        "Please download team11_TON_IoT_unsw_edu_au_cleaned.csv directly from the git repo.\n"
        "Place this download into the 'Step1-Datasets-Feature-Engineering' folder and try again.\n\n"
    )

# Quick look at data
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
df.info()
display(df.describe())

## Data cleanup

### Drop non-required data

In [None]:
# Remove the columns that are empty or that the rest of the notebook does not need.
df = df.drop(["src_ip", "dst_ip", "label"], axis=1)

### Drop long connection durations

In [None]:
# Connections should last no more than 1000 seconds: discard the longer ones.
too_long = df["duration"] > 1000
df = df.drop(index=df.index[too_long])

### Optimizations

In [None]:
# Encode the attack type as an integer category.
# Codes follow the order of first appearance, so `type_unique[code]` gives
# back the original label.
type_unique = list(df["type"].unique())
# Map through a dict so each row is a constant-time lookup rather than a
# Python-level linear search of the list.
type_codes = {label: code for code, label in enumerate(type_unique)}
df["type"] = df["type"].map(type_codes)

In [None]:
# Encode the connection state as an integer category.
# Codes follow the order of first appearance, so `conn_state_unique[code]`
# gives back the original state string.
conn_state_unique = list(df["conn_state"].unique())
# Map through a dict so each row is a constant-time lookup rather than a
# Python-level linear search of the list.
conn_state_codes = {state: code for code, state in enumerate(conn_state_unique)}
df["conn_state"] = df["conn_state"].map(conn_state_codes)

In [None]:
# Encode the protocol as an integer category.
# Codes follow the order of first appearance, so `proto_unique[code]` gives
# back the original protocol name.
proto_unique = list(df["proto"].unique())
# Map through a dict so each row is a constant-time lookup rather than a
# Python-level linear search of the list.
proto_codes = {proto: code for code, proto in enumerate(proto_unique)}
df["proto"] = df["proto"].map(proto_codes)

In [None]:
# Sum the four byte counters (source and destination, `*_bytes` and
# `*_ip_bytes`) into one total per connection. Each counter is added once;
# the destination IP counter must not be replaced by a second copy of the
# source one.
# NOTE(review): assumes the cleaned CSV still carries `dst_ip_bytes` - confirm.
df['total_bytes'] = df['src_ip_bytes'] + df['dst_ip_bytes'] + df['src_bytes'] + df['dst_bytes']

In [None]:
# Replace every remaining missing value with 0
# (the alternative would be to drop those rows with df.dropna()).
df = df.fillna(0)

In [None]:
# Rebalance the classes with SMOTE, feeding it numeric columns only.

# separate the target column from the features
y = df['type']
X = df.drop(columns='type')

# SMOTE input: numeric features only
X_numeric = X.select_dtypes(include=np.number)

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_numeric, y)

# class counts before and after resampling
print("Before:", y.value_counts())
print("After:", pd.Series(y_res).value_counts())

## Transformation and PCA

In [None]:
# Rescale the resampled features with min-max scaling.
scaler = MinMaxScaler()
scaler.fit(X_res)
X_scaled = scaler.transform(X_res)

In [None]:
# correlation heatmap
# Build the correlation matrix from a labelled frame so seaborn puts each
# feature name at the centre of its row/column. Ticks set by hand at integer
# positions land on cell edges, which shifts every label by half a cell.
corr = pd.DataFrame(X_scaled, columns=X_res.columns).corr()

plt.figure(figsize=(20,20))
# xticklabels/yticklabels=True asks seaborn to label every feature
sns.heatmap(corr, cmap='coolwarm', annot=False, xticklabels=True, yticklabels=True)
plt.title('Feature Correlation Heatmap')
plt.show()


In [None]:
# feature importance with rf
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the SMOTE step) so the importances are
# reproducible from one run of the notebook to the next.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_scaled, y_res)
importances = rf.feature_importances_

# one bar per feature, in the column order of X_res
positions = list(range(len(importances)))

plt.figure(figsize=(18,6))
plt.bar(positions, importances)
plt.title('Feature Importance')
# rotate the feature names so neighbouring labels do not overlap
plt.xticks(positions, X_res.columns, rotation=90)
plt.show()


## Feature Engineering

### Before

In [None]:
# Summary statistics of the cleaned frame (not resampled, not scaled)
summary_before = df.describe()
summary_before

### After

In [None]:
# Summary statistics of the resampled, scaled features, labelled with the
# feature names of X_res
after = pd.DataFrame(X_scaled, columns=X_res.columns)
after.describe()

## Export

In [None]:
# Build the final feature frame and hold out 20% of it as a test set
# (nothing is written to disk in this cell).
df_engineered = pd.DataFrame(X_scaled, columns=X_numeric.columns)

# NOTE(review): SMOTE and the MinMaxScaler were both fitted on the full data
# before this split, so the held-out rows include synthetic samples and share
# scaling statistics with the training rows - consider splitting first.
# Fixed seed (same value as the SMOTE step) keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_engineered, y_res, test_size=0.2, random_state=42
)
