<a href="https://colab.research.google.com/github/SiddarthaNanuvala/Cloud-Traffic-Anomaly-Detection-with-Auto-Scaling/blob/main/Cloud_Traffic_Anomaly_Detection_with_Auto_Scaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Removes the listed packages from the runtime without prompting (-y).
# NOTE(review): nothing in this cell reinstalls them, so any later import that
# needs numpy will fail until an install cell has run. This looks like the cause
# of the "No module named 'numpy.random'" error captured further down — confirm.
!pip uninstall -y numpy scikit-learn xgboost lightgbm imbalanced-learn scipy mlxtend umap-learn


[0m

In [5]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub and report the
# local directory it was placed in.
DATASET_HANDLE = "programmer3/unsw-nb15-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'unsw-nb15-dataset' dataset.
Path to dataset files: /kaggle/input/unsw-nb15-dataset


In [6]:
import os, glob
import pandas as pd

# Find all CSV files under the directory returned by the kagglehub download.
# The search root is the `path` value from the download cell rather than a
# hard-coded cache location: the download directory depends on the runtime
# (the download cell reported /kaggle/input/unsw-nb15-dataset), so a fixed
# /root/.cache/... pattern can match nothing.
# Sorting makes `csvs[0]` deterministic; glob returns matches in arbitrary order.
csvs = sorted(glob.glob(os.path.join(path, '**', '*.csv'), recursive=True))
print(len(csvs), 'CSV files found')
for p in csvs[:10]:
    print(p)

# Fail with an explicit message instead of an IndexError on csvs[0].
if not csvs:
    raise FileNotFoundError(f'No CSV files found under {path!r}')

# Load and preview the first CSV file
df = pd.read_csv(csvs[0])
print('Shape:', df.shape)
print('Columns:', list(df.columns)[:20], '...')
print(df.head(3))


ModuleNotFoundError: No module named 'numpy.random'

In [None]:
# Identify possible label and category columns (matching is case-insensitive).
label_col_candidates = [c for c in df.columns if c.lower() in ['label', 'is_attack', 'class', 'binary_label']]
cat_col_candidates = [c for c in df.columns if 'attack' in c.lower() and 'cat' in c.lower()]
print('Binary candidates:', label_col_candidates)
print('Attack category candidates:', cat_col_candidates)

# Stop with a readable message when no binary label column is recognized,
# instead of an IndexError from indexing an empty candidate list.
if not label_col_candidates:
    raise ValueError(
        'No binary label column found; expected one of '
        "'label', 'is_attack', 'class', 'binary_label' (case-insensitive). "
        f'Available columns: {list(df.columns)}'
    )

# First match wins for both roles; the attack-category column is optional.
BINARY_COL = label_col_candidates[0]
ATTACK_COL = cat_col_candidates[0] if cat_col_candidates else None

# Normalize binary label to 0/1: any positive integer value counts as an attack.
df['y_bin'] = (df[BINARY_COL].astype(int) > 0).astype(int)
print(df[['y_bin']].head())

# Normalize multiclass (optional): missing values and '-' are mapped to 'Benign'.
if ATTACK_COL:
    df['y_cat'] = df[ATTACK_COL].fillna('Benign').replace({'-': 'Benign'})
    print(df[['y_cat']].head())


In [None]:
# Flag columns whose names suggest identifiers, network endpoints, or timing.
# The three lists are only reported in this cell.
NETWORK_NAMES = ['srcip', 'dstip', 'sport', 'dsport']
TIME_NAMES = ['timestamp', 'stime', 'ltime']

# Pair every column with its lowercase form once, then classify by name.
named_columns = [(col, col.lower()) for col in df.columns]

# A name containing 'flowid' also contains 'id', so a single substring test
# covers both keywords.
id_like = [col for col, lowered in named_columns if 'id' in lowered]
net_like = [col for col, lowered in named_columns if lowered in NETWORK_NAMES]
time_like = [col for col, lowered in named_columns if 'time' in lowered or lowered in TIME_NAMES]

for role, matches in (('id_like:', id_like), ('net_like:', net_like), ('time_like:', time_like)):
    print(role, matches)


In [None]:
import numpy as np

# --- Data quality overview: missing values and duplicate rows ---
null_mask = df.isna()
na_counts = null_mask.sum().sort_values(ascending=False)
print('Top NA columns:\n', na_counts.head(15))

duplicate_mask = df.duplicated()
dup_count = duplicate_mask.sum()
print('Duplicate rows:', dup_count)

# --- Label balance: binary target first, attack categories when present ---
binary_counts = df['y_bin'].value_counts(dropna=False)
print('y_bin value counts:\n', binary_counts)
if 'y_cat' in df.columns:
    category_counts = df['y_cat'].value_counts()
    print('Top attack categories:\n', category_counts.head(10))

# --- Descriptive statistics for the numeric columns (first 12 shown) ---
num_cols = list(df.select_dtypes(include=[np.number]).columns)
print('Numeric columns:', len(num_cols))
numeric_summary = df[num_cols].describe().transpose()
print(numeric_summary.head(12))


In [None]:
# Removes the listed packages from the runtime without prompting (-y).
# NOTE(review): the next cell runs the same uninstall with a longer package
# list, so this cell looks redundant — confirm before removing it.
!pip uninstall -y numpy scikit-learn xgboost lightgbm imbalanced-learn


In [None]:
# Removes the listed packages from the runtime without prompting (-y),
# presumably to clear preinstalled versions before the pinned install in the
# following cell — confirm.
!pip uninstall -y numpy scikit-learn xgboost lightgbm imbalanced-learn scipy mlxtend umap-learn


In [None]:
# Installs exact, pinned versions of the numeric / ML stack.
# NOTE(review): mlxtend and umap-learn are uninstalled by the preceding cell
# but are not reinstalled here.
# NOTE(review): modules already imported in this kernel presumably keep their
# old versions until the runtime is restarted — confirm.
!pip install numpy==1.23.5 scikit-learn==1.2.2 xgboost==1.7.6 lightgbm==4.1.0 imbalanced-learn==0.9.1 scipy==1.11.4


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Drop every target-derived column before selecting numeric features.
# BINARY_COL / ATTACK_COL are the columns chosen by the label-detection cell,
# which matches names case-insensitively and accepts several spellings.
# Dropping only the literal names 'label' and 'attack_cat' would leave a
# differently named label column (e.g. 'Label' or 'class') inside the feature
# matrix and leak the target into training.
target_cols = {'label', 'attack_cat', 'y_bin', 'y_cat', BINARY_COL}
if ATTACK_COL:
    target_cols.add(ATTACK_COL)
X_num = df.drop(columns=sorted(target_cols), errors='ignore').select_dtypes(include=[np.number]).copy()
y = df['y_bin']

print('Numeric features:', X_num.shape[1])
print('First few feature columns:', X_num.columns.tolist()[:10])

# Stratified 80/20 train/test split, then a random forest baseline.
X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=42, stratify=y)
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))


In [None]:
# Manually balance dataset: equal records per class.
# Both classes are capped at the size of the smaller one. Sampling
# `n_benign` attack rows unconditionally raises ValueError whenever the data
# holds fewer attack rows than benign rows (sampling without replacement
# cannot draw more rows than exist).
benign_rows = df[df['y_bin'] == 0]
attack_rows = df[df['y_bin'] == 1]
n_benign = benign_rows.shape[0]
n_attack = min(n_benign, attack_rows.shape[0])  # per-class size of the balanced sample

# When benign is the smaller class it is kept whole and in its original order;
# otherwise it is downsampled to match the attack rows.
if n_attack == n_benign:
    benign_sample = benign_rows
else:
    benign_sample = benign_rows.sample(n=n_attack, random_state=42)
attack_sample = attack_rows.sample(n=n_attack, random_state=42)

sample = pd.concat([benign_sample, attack_sample]).sample(frac=1, random_state=42)  # shuffle

# Save the balanced sample and the list of numeric feature names from the
# baseline cell (X_num).
sample.to_csv('unsw_stage1_sample.csv', index=False)
pd.Series(X_num.columns).to_csv('unsw_stage1_feature_columns.csv', index=False)

import os
print("Files saved:", os.listdir())


In [None]:
from google.colab import files

# Send both Stage 1 artifacts (the dataset sample and the feature-name list)
# to the browser as downloads.
for artifact in ('unsw_stage1_sample.csv', 'unsw_stage1_feature_columns.csv'):
    files.download(artifact)


In [None]:
import pandas as pd

# Re-read the balanced sample written in Stage 1 and take a quick look at it.
STAGE1_SAMPLE_CSV = 'unsw_stage1_sample.csv'
df = pd.read_csv(STAGE1_SAMPLE_CSV)

column_names = df.columns.tolist()
preview = df.head(3)
print("Shape:", df.shape)
print("Columns:", column_names)
print(preview)


In [None]:
# Modeling inputs: numeric columns only, with the label-derived columns
# removed (names absent from the frame are ignored). Target is the 0/1 label.
label_columns = ['label','attack_cat','y_bin','y_cat']
features_only = df.drop(columns=label_columns, errors='ignore')
X = features_only.select_dtypes(include=['number']).copy()
y = df['y_bin'].copy()
print("Number of features:", X.shape[1])


In [None]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split targeting roughly 70% train / 15% val / 15% test.
# Stage 1 holds out 15% of all rows for testing. Stage 2 takes 0.1765 of the
# remaining 85% as validation, which is about 15% of the full data.
split_seed = 42
X_tv, X_test, y_tv, y_test = train_test_split(
    X, y, test_size=0.15, random_state=split_seed, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_tv, y_tv, test_size=0.1765, random_state=split_seed, stratify=y_tv
)

train_balance = y_train.value_counts(normalize=True)
print("Train/Val/Test shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Train class distribution:", train_balance)


In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, confusion_matrix
import numpy as np

def eval_metrics(y_true, y_pred, y_proba=None):
    """Summarize binary-classification quality for classes 0 and 1.

    Parameters
    ----------
    y_true : true 0/1 labels.
    y_pred : predicted 0/1 labels.
    y_proba : optional positive-class scores; ROC AUC is computed only when
        this is given.

    Returns
    -------
    dict with per-class precision / recall / F1 (``prec_0`` ... ``f1_1``),
    ``macro_f1`` (mean of the two F1 values), ``roc_auc`` (``None`` when
    ``y_proba`` is omitted) and ``cm`` (confusion matrix with rows and columns
    ordered as [0, 1]).
    """
    class_ids = [0, 1]
    precision, recall, fscore, _support = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=class_ids
    )

    # Per-class entries are inserted in a fixed order so printed dicts stay stable.
    report = {}
    for idx in class_ids:
        report[f'prec_{idx}'] = precision[idx]
        report[f'rec_{idx}'] = recall[idx]
        report[f'f1_{idx}'] = fscore[idx]
    report['macro_f1'] = fscore.mean()

    if y_proba is None:
        report['roc_auc'] = None
    else:
        report['roc_auc'] = roc_auc_score(y_true, y_proba)

    report['cm'] = confusion_matrix(y_true, y_pred, labels=class_ids)
    return report

def pick_threshold(y_true, proba, target='f1_1', thresholds=None):
    """Pick the probability cutoff that maximizes one ``eval_metrics`` entry.

    Parameters
    ----------
    y_true : true 0/1 labels.
    proba : positive-class scores; any array-like (converted with
        ``np.asarray`` so plain lists work too).
    target : key of the ``eval_metrics`` result to maximize (default: F1 of
        class 1).
    thresholds : optional iterable of candidate cutoffs. Defaults to 17 evenly
        spaced values from 0.1 to 0.9 (steps of 0.05).

    Returns
    -------
    The first candidate that reaches the best score. Falls back to 0.5 when no
    candidate scores above -1 (for example an empty candidate list).
    """
    proba = np.asarray(proba)
    if thresholds is None:
        thresholds = np.linspace(0.1, 0.9, 17)

    # ROC AUC does not depend on the cutoff, so it is only computed when it is
    # the metric being optimized.
    scores_for_eval = proba if target == 'roc_auc' else None

    best_t, best_score = 0.5, -1
    for t in thresholds:
        pred = (proba >= t).astype(int)
        score = eval_metrics(y_true, pred, scores_for_eval)[target]
        # Strict comparison keeps the earliest threshold on ties.
        if score > best_score:
            best_score, best_t = score, t
    return best_t


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Candidate classifiers, keyed by a short label used in later cells.
# Logistic regression is wrapped in a pipeline that standardizes features
# first; the random forest is used directly on the raw features.
lr_steps = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=200, class_weight='balanced', n_jobs=-1)),
]
rf_balanced = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1,
)

pipelines = {}
pipelines['LR_balanced'] = Pipeline(lr_steps)
pipelines['RF_balanced'] = rf_balanced
print("Pipelines ready:", list(pipelines.keys()))


In [None]:
# Select one candidate and fit it on the training split.
name = 'RF_balanced'  # Use 'LR_balanced' for logistic regression
model = pipelines[name]
model.fit(X_train, y_train)

# Score the validation split, choose the cutoff that maximizes class-1 F1,
# then turn the scores into hard predictions with that cutoff.
proba_val = model.predict_proba(X_val)[:, 1]
t_star = pick_threshold(y_val, proba_val, target='f1_1')
y_val_pred = (proba_val >= t_star).astype(int)
metrics_val = eval_metrics(y_val, y_val_pred, proba_val)

# For display: arrays become nested lists, scalars are rounded to 4 decimals.
printable_val = {}
for metric_key, metric_value in metrics_val.items():
    if isinstance(metric_value, np.ndarray):
        printable_val[metric_key] = metric_value.tolist()
    else:
        printable_val[metric_key] = round(float(metric_value), 4)
print(f"{name} Validation Metrics:", printable_val)
print("Optimal probability threshold:", t_star)


In [None]:
# Held-out evaluation: score the test split with the fitted `model` and apply
# the cutoff `t_star` chosen on the validation split.
proba_test = model.predict_proba(X_test)[:, 1]
y_test_pred = (proba_test >= t_star).astype(int)
test_metrics = eval_metrics(y_test, y_test_pred, proba_test)

# For display: arrays become nested lists, scalars are rounded to 4 decimals.
printable_test = {}
for metric_key, metric_value in test_metrics.items():
    if isinstance(metric_value, np.ndarray):
        printable_test[metric_key] = metric_value.tolist()
    else:
        printable_test[metric_key] = round(float(metric_value), 4)
print("Test Metrics:", printable_test)


In [None]:
# Logistic-regression run on the same splits.
# Every object produced here carries an `_lr` suffix (as the XGBoost cell does
# with `_xgb`), so this cell no longer overwrites the shared `name`, `model`,
# `proba_val` and `y_val_pred` from the random-forest validation cell. Cells
# that read those shared names therefore keep referring to the random-forest
# run, consistent with `metrics_val` and `t_star`.
name_lr = 'LR_balanced'
model_lr = pipelines[name_lr]
model_lr.fit(X_train, y_train)

# Validation scores, tuned cutoff (maximizing class-1 F1), and hard predictions.
proba_val_lr = model_lr.predict_proba(X_val)[:,1]
t_star_lr = pick_threshold(y_val, proba_val_lr, target='f1_1')
y_val_pred_lr = (proba_val_lr >= t_star_lr).astype(int)
metrics_val_lr = eval_metrics(y_val, y_val_pred_lr, proba_val_lr)
print(f"{name_lr} Validation Metrics:", {k: round(float(v),4) if not isinstance(v,np.ndarray) else v.tolist() for k,v in metrics_val_lr.items()})
print("Optimal probability threshold:", t_star_lr)


In [None]:
# NOTE(review): unpinned install; an earlier cell pins imbalanced-learn==0.9.1 —
# confirm which one is intended.
!pip install imbalanced-learn

from imblearn.over_sampling import SMOTE

# Resample the training split only; validation and test data are not touched here.
# NOTE(review): the training data comes from the manually balanced Stage 1
# sample via stratified splits, so SMOTE may add few or no rows — confirm.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
# Report the number of rows per class after resampling.
print("After SMOTE:", dict(zip(*np.unique(y_train_sm, return_counts=True))))
# Refit `model` (whichever estimator that name was last bound to in an earlier
# cell) on the resampled data. No metrics are recomputed in this cell.
model.fit(X_train_sm, y_train_sm)
# Continue with validation and test as before


In [None]:
import matplotlib.pyplot as plt

def plot_class_metrics(metrics, title):
    """Draw a grouped bar chart of precision, recall and F1 per class.

    Parameters
    ----------
    metrics : mapping with the keys ``prec_0``, ``prec_1``, ``rec_0``,
        ``rec_1``, ``f1_0`` and ``f1_1``.
    title : text shown above the chart.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    class_names = ['Benign', 'Attack']
    bar_width = 0.2

    # Read every value up front so a missing key fails before a figure is created.
    grouped = [
        ('Precision', [metrics['prec_0'], metrics['prec_1']]),
        ('Recall', [metrics['rec_0'], metrics['rec_1']]),
        ('F1', [metrics['f1_0'], metrics['f1_1']]),
    ]
    positions = range(len(class_names))

    plt.figure(figsize=(7,4))
    # Each metric is shifted one bar width to the right of the previous one.
    for slot, (legend_label, heights) in enumerate(grouped):
        xs = [pos + slot * bar_width for pos in positions]
        plt.bar(xs, heights, width=bar_width, label=legend_label, align='center')

    # Tick labels sit under the middle (Recall) bar of each group.
    plt.xticks([pos + bar_width for pos in positions], class_names)
    plt.legend()
    plt.title(title)
    plt.show()

# Chart the per-class validation metrics held in `metrics_val`, which the
# 'RF_balanced' validation cell computed.
plot_class_metrics(metrics_val, "Validation Metrics")


In [None]:
from sklearn.metrics import roc_curve

# ROC curve on the validation split, built from the scores currently held in
# `proba_val`.
fpr, tpr, _ = roc_curve(y_val, proba_val)

plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label='ROC Curve')

# Titles and axis labels.
plt.title('ROC Curve (Validation)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# Legend and grid, then render.
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Installs xgboost and lightgbm (no version given) and scikit-learn pinned to 1.2.2.
# NOTE(review): an earlier cell pins xgboost==1.7.6 and lightgbm==4.1.0 —
# confirm whether leaving them unpinned here is intended.
!pip install xgboost lightgbm scikit-learn==1.2.2


In [None]:
import xgboost as xgb

# XGBoost classifier trained on the same training split as the other models.
xgb_params = dict(
    n_estimators=200,
    max_depth=5,
    scale_pos_weight=1,  # or set = n_benign/n_attack if imbalanced
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss",
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Validation scores, tuned cutoff (maximizing class-1 F1), and hard predictions.
proba_val_xgb = xgb_model.predict_proba(X_val)[:, 1]
t_star_xgb = pick_threshold(y_val, proba_val_xgb, target='f1_1')
y_val_pred_xgb = (proba_val_xgb >= t_star_xgb).astype(int)
metrics_val_xgb = eval_metrics(y_val, y_val_pred_xgb, proba_val_xgb)

# For display: arrays become nested lists, scalars are rounded to 4 decimals.
printable_xgb = {}
for metric_key, metric_value in metrics_val_xgb.items():
    if isinstance(metric_value, np.ndarray):
        printable_xgb[metric_key] = metric_value.tolist()
    else:
        printable_xgb[metric_key] = round(float(metric_value), 4)
print("XGBoost Validation Metrics:", printable_xgb)
print("Optimal threshold (XGB):", t_star_xgb)
