In [None]:
# %% 1. Imports and Setup
"""
team2_TrafficLabellingClean.ipynb
----------------------------------
Step 1: Dataset Cleaning and Optimization (Team 2)
----------------------------------
Pipeline:
- Read 8 raw CSVs (CICIDS2017)
- Cleaning: dropna, duplicates, constant columns, IP/Timestamp
- Feature selection: low-variance + high-correlation filters
- Numeric optimization: downcast to reduce memory footprint
- Save cleaned CSV under ./team2_TrafficLabelling/
"""

import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Resolve the folder this notebook/script runs from. `__file__` is only defined
# when the code is executed as a plain script; inside a Jupyter kernel we fall
# back to the current working directory. abspath() guards against a relative
# `__file__` whose dirname would otherwise be an empty string.
BASE_DIR = (
    os.path.dirname(os.path.abspath(__file__))
    if "__file__" in globals()
    else os.getcwd()
)

# Candidate dataset locations, checked in order; the first existing directory wins.
possible_dirs = [
    os.path.normpath(os.path.join(BASE_DIR, "..", "data", "traffic")),
    r"C:\Users\hi\AI-CloudSec-System\data\traffic",   # machine-specific path from the old repository (last-resort fallback)
]

# Optional override: when the TRAFFIC_DATA_DIR environment variable is set it is
# tried first, so the notebook can run on other machines without editing this cell.
env_data_dir = os.environ.get("TRAFFIC_DATA_DIR")
if env_data_dir:
    possible_dirs.insert(0, os.path.normpath(env_data_dir))

# isdir (not exists): a plain file at one of these paths must not be accepted,
# because the folder is listed with os.listdir() right below.
DATA_FOLDER = next((p for p in possible_dirs if os.path.isdir(p)), None)

if DATA_FOLDER is None:
    raise FileNotFoundError(
        "❌ Could not find the traffic dataset folder. Please check the path manually."
        f" Searched: {possible_dirs}"
    )

# Print a short summary of the resolved environment.
data_files = sorted(os.listdir(DATA_FOLDER))  # list once; sorted for a stable preview
print("Environment initialized.")
print("BASE_DIR     :", BASE_DIR)
print("DATA_FOLDER  :", DATA_FOLDER)
print("Files inside :", len(data_files))

# Show the first few file names found in the data folder.
for f in data_files[:5]:
    print("  -", f)

# Output location for the cleaned CSV (the directory is created at save time).
OUTPUT_DIR  = os.path.join(BASE_DIR, "team2_TrafficLabelling")
OUTPUT_NAME = "team2_TrafficLabellingClean.csv"
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_NAME)


In [None]:
# %% 2. Load Raw Data
# File names expected inside DATA_FOLDER. Files that are absent are skipped and
# reported, so a partial dataset still loads.
raw_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours.pcap_ISCX.csv",
    "Friday-WorkingHours.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
]

# dfs: successfully loaded frames; missing: names not found on disk;
# failed: (name, error message) pairs for files that could not be parsed.
dfs, missing, failed = [], [], []
for fname in raw_files:
    fpath = os.path.join(DATA_FOLDER, fname)
    if not os.path.exists(fpath):
        missing.append(fname)
        continue
    try:
        try:
            # utf-8-sig transparently strips a leading BOM if the file has one.
            df_tmp = pd.read_csv(fpath, encoding="utf-8-sig", low_memory=False)
        except UnicodeDecodeError:
            # Retry with latin1, which can decode any byte sequence. This retry sits
            # inside the outer try so that a failure here is recorded in `failed`
            # too, instead of escaping the loop and aborting the whole cell.
            df_tmp = pd.read_csv(fpath, encoding="latin1", low_memory=False)
    except Exception as e:
        failed.append((fname, str(e)))
        continue
    dfs.append(df_tmp)

assert dfs, f"No files loaded. Missing={missing}, Failed={failed}"
if missing:
    print("⚠ Missing files:", missing)
if failed:
    print("⚠ Failed to read:", failed)

# Stack all loaded files row-wise with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
print("Merged dataset shape:", df.shape)
assert df.shape[0] > 0 and df.shape[1] > 0, "Merged dataframe is empty."


In [None]:
# %% 3. Basic Cleaning
# Remove rows that are unusable for the later statistical filters:
#   1. rows holding +/-inf in any numeric column,
#   2. rows with any missing value,
#   3. exact duplicate rows.
before = df.shape

# dropna() only removes NaN/None; infinite values are not treated as missing by
# default. Left in place they would reach the variance and correlation filters in
# the next cell (scikit-learn estimators reject infinite input by default).
inf_rows = np.isinf(df.select_dtypes(include=[np.number])).any(axis=1)
n_inf_rows = int(inf_rows.sum())

# Rebind instead of mutating in place so the statement reads as a single pipeline.
df = df.loc[~inf_rows].dropna().drop_duplicates()

after = df.shape
print(f"Rows with +/-inf values removed: {n_inf_rows}")
print(f"Dropna+duplicates: {before} -> {after} | removed={before[0]-after[0]}")
assert df.shape[0] > 0, "All rows removed after cleaning; check inputs/thresholds."


In [None]:
# %% 4. Feature Removal (constant, IP/Timestamp, low variance, high correlation)
# Tunable thresholds for the two statistical filters below.
LOW_VARIANCE_THRESHOLD = 1e-4   # passed to VarianceThreshold(threshold=...)
HIGH_CORR_THRESHOLD = 0.95      # |Pearson r| above which the later column of a pair is dropped

# 4.1 constants (zero variance)
constant_cols = [
    "Bwd PSH Flags", "Bwd URG Flags",
    "Fwd Avg Bytes/Bulk", "Fwd Avg Packets/Bulk", "Fwd Avg Bulk Rate",
    "Bwd Avg Bytes/Bulk", "Bwd Avg Packets/Bulk", "Bwd Avg Bulk Rate",
]
# Match on whitespace-stripped header names: an exact comparison silently matches
# nothing when the CSV headers carry leading/trailing spaces.
# NOTE(review): CICIDS2017 exports are commonly distributed with space-padded
# headers -- confirm against the local files.
constant_names = set(constant_cols)
const_in_df = [c for c in df.columns if str(c).strip() in constant_names]
df = df.drop(columns=const_in_df)

# 4.2 IP / Timestamp identifiers
# Substring match, so any column whose name contains "IP" or "Timestamp" is removed.
id_like = [c for c in df.columns if ("IP" in c) or ("Timestamp" in c)]
df = df.drop(columns=id_like)

# 4.3 low variance (numeric only)
low_var_cols = []
numeric_df = df.select_dtypes(include=[np.number])
if not numeric_df.empty:
    sel = VarianceThreshold(threshold=LOW_VARIANCE_THRESHOLD)
    sel.fit(numeric_df)
    # get_support() flags the columns that pass the threshold; invert it to get the drops.
    low_var_cols = list(numeric_df.columns[~sel.get_support()])
    if low_var_cols:
        df = df.drop(columns=low_var_cols)

# 4.4 high correlation (numeric only)
corr_drop = []
num_only = df.select_dtypes(include=[np.number])
if not num_only.empty:
    corr = num_only.corr().abs()
    # Keep only the strict upper triangle so every pair is inspected once and a
    # column is compared solely against the columns that precede it.
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    upper = corr.where(upper_mask)
    # NaN entries (masked cells, undefined correlations) compare as False.
    too_correlated = (upper > HIGH_CORR_THRESHOLD).any(axis=0)
    corr_drop = list(upper.columns[too_correlated])
    if corr_drop:
        df = df.drop(columns=corr_drop)

print("Dropped (constants):", len(const_in_df), "|", const_in_df)
print("Dropped (id-like)  :", len(id_like))
print("Dropped (low-var)  :", len(low_var_cols))
print("Dropped (high-corr):", len(corr_drop))
print("Current shape      :", df.shape)
assert df.shape[1] > 0, "All columns removed; relax thresholds or review features."


In [None]:
# %% 5. Optimize Numeric Types
# Halve the in-memory width of 64-bit numeric columns.
# float64 -> float32 trades precision (roughly 7 significant digits) for memory.
float_cols = list(df.select_dtypes(include=["float64"]).columns)

# int64 -> int32 is applied only to columns whose observed range fits in int32;
# values outside that range would not survive the cast intact, so those columns
# are left as int64 and reported.
int_cols = list(df.select_dtypes(include=["int64"]).columns)
int32_limits = np.iinfo(np.int32)
safe_int_cols = [
    col for col in int_cols
    if df[col].min() >= int32_limits.min and df[col].max() <= int32_limits.max
]
kept_int64_cols = [col for col in int_cols if col not in safe_int_cols]

# Build one dtype map and cast in a single astype() call instead of assigning
# column by column.
dtype_map = {col: "float32" for col in float_cols}
dtype_map.update({col: "int32" for col in safe_int_cols})
if dtype_map:
    df = df.astype(dtype_map)

if kept_int64_cols:
    print("Kept as int64 (values exceed int32 range):", kept_int64_cols)
print("Optimized numeric dtypes.")


In [None]:
# %% 6. Save and Validate
# Make sure the destination folder exists, then refuse to write an empty frame.
os.makedirs(OUTPUT_DIR, exist_ok=True)
rows_total, cols_total = df.shape
assert rows_total > 0 and cols_total > 0, "Empty dataframe at save time."

# Write the cleaned data without the index, BOM-prefixed UTF-8, "\n" line endings.
df.to_csv(
    OUTPUT_PATH,
    index=False,
    encoding="utf-8-sig",
    lineterminator="\n",
)
print(f"✅ Cleaned dataset saved to: {OUTPUT_PATH}")
print("Final shape:", df.shape)

# Confirm the file landed on disk and report its size in mebibytes.
assert os.path.exists(OUTPUT_PATH), "CSV was not written."
bytes_per_mb = 1024 ** 2
size_mb = round(os.path.getsize(OUTPUT_PATH) / bytes_per_mb, 2)
print("File size (MB):", size_mb)

# Best-effort read-back of the first rows; a failure is reported, not raised.
try:
    preview = pd.read_csv(OUTPUT_PATH, nrows=5)
    preview_cols = len(preview.columns)
    print("Preview shape:", preview.shape, "| Columns:", preview_cols)
except Exception as exc:
    print("Read-back failed:", exc)
