In [1]:
"""
team2_TrafficLabellingClean.py
-------------------------------
Functions:
- Safe CSV reader (utf-8-sig with latin1 fallback).
- Basic cleaning: dropna, duplicates, constant columns,
  IP/Timestamp columns, extreme values.
- Lightweight feature selection: low variance filter +
  high correlation filter.
- Numeric optimization: downcast + rounding to reduce file size.
- Outputs both cleaned CSV and validation report directly into /datasets.
"""
import mlflow
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [2]:
# ---------------- Paths ----------------
# Source dataset folder (update this path if needed)
DATA_FOLDER = r"C:\Users\hi\AI-CloudSec-System\data\traffic"

# ---------------- Corrected Path Definitions ----------------
# When running in Jupyter, os.getcwd() gives the notebook directory. 
# We navigate up one level (os.pardir) to the project root (AI-CloudSec-System-1).
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) 
# The target output directory is Step1-Datasets-Feature-Engineering.
OUT_DIR = os.path.join(PROJECT_DIR, "Step1-Datasets-Feature-Engineering")

# Ensure the output directory exists
os.makedirs(OUT_DIR, exist_ok=True)

# Define the final output file paths
OUT_FILE = os.path.join(OUT_DIR, "team2_TrafficLabellingClean.csv")
REPORT = os.path.join(OUT_DIR, "team2_TrafficLabellingClean_report.txt")



In [3]:
# ---------------- Safe CSV Reader ----------------
def safe_read_csv(path):
    try:
        print(f"Reading {path} with utf-8-sig ...")
        return pd.read_csv(path, low_memory=False, encoding="utf-8-sig")
    except UnicodeDecodeError:
        print(f"⚠️ UTF-8 failed for {path}, retrying with latin1 ...")
        return pd.read_csv(path, low_memory=False, encoding="latin1")


In [None]:


# ---------------- Cleaning ----------------
def clean_dataframe(df, log):
    before = len(df)
    df = df.dropna().drop_duplicates()
    log.append(
        f"Dropna + duplicates: {before - len(df)} rows removed, now {len(df)} rows"
    )

    const_cols = df.columns[df.nunique() <= 1].tolist()
    if const_cols:
        df = df.drop(columns=const_cols)
        log.append(f"Dropped {len(const_cols)} constant cols: {const_cols}")

    drop_cols = [c for c in df.columns if "IP" in c or "Timestamp" in c]
    if drop_cols:
        df = df.drop(columns=drop_cols, errors="ignore")
        log.append(f"Dropped {len(drop_cols)} IP/Timestamp cols")

    if " Flow Duration" in df.columns:
        df = df[(df[" Flow Duration"] > 0) & (df[" Flow Duration"] < 3600)]
    if " Flow Bytes/s" in df.columns:
        df = df[df[" Flow Bytes/s"] < 1e9]

    return df



In [None]:

# ---------------- Feature Selection ----------------
def feature_selection(df, log, label_col=" Label"):
    if label_col in df.columns:
        X = df.drop(columns=[label_col], errors="ignore")
    else:
        X = df

    X = X.select_dtypes(include=np.number)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    selector = VarianceThreshold(threshold=0.01)
    X_var = selector.fit_transform(X_scaled)
    kept_cols = X.columns[selector.get_support()]
    log.append(f"Low variance removed: {X.shape[1] - len(kept_cols)} cols")

    X_df = pd.DataFrame(X_var, columns=kept_cols)
    corr = X_df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]
    if to_drop:
        X_df = X_df.drop(columns=to_drop)
        log.append(f"High correlation removed: {len(to_drop)} cols")

    if label_col in df.columns:
        X_df[label_col] = df[label_col].values

    return X_df



In [None]:

# ---------------- Numeric Optimization ----------------
def optimize_numeric(df, log, decimals=2):
    before_mem = df.memory_usage(deep=True).sum() / (1024 * 1024)
    for col in df.select_dtypes(include=[np.number]).columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast="integer")
        else:
            df[col] = df[col].round(decimals)
            df[col] = pd.to_numeric(df[col], downcast="float")
    after_mem = df.memory_usage(deep=True).sum() / (1024 * 1024)
    ratio = (before_mem - after_mem) / before_mem * 100
    log.append(
        f"Optimized numeric cols: {before_mem:.2f}MB →"
        f"{after_mem:.2f}MB (↓{ratio:.1f}%)"
    )
    return df



In [4]:

# ---------------- Main (MLflow Integrated) ----------------
log = []

# 1. Set MLflow Lab Name
mlflow.set_experiment("Team2_Feature_Engineering_Traffic_Data")

# 2. Run a new MLflow 
with mlflow.start_run() as run:
    # Log the type of operation/step
    mlflow.set_tag("step", "data_cleaning_and_feature_selection")

    # --- [Start my origin code logical] ---
    files = [
        os.path.join(DATA_FOLDER, f)
        for f in os.listdir(DATA_FOLDER)
        if f.endswith(".csv")
    ]
    dfs = [safe_read_csv(f) for f in files]
    df = pd.concat(dfs, ignore_index=True)
    log.append(f"Merged {len(files)} files: {df.shape}")

    df = clean_dataframe(df, log)
    df_final = feature_selection(df, log)
    df_final = optimize_numeric(df_final, log, decimals=2)

    # record key index
    mlflow.log_metric("final_rows", len(df_final))
    mlflow.log_metric("final_columns", df_final.shape[1])

    # save printout(original code)
    # Note: OUT_FILE & REPORT Patch should be capable to visit in Notebook
    df_final.to_csv(OUT_FILE, index=False, encoding="utf-8-sig")
    with open(REPORT, "w", encoding="utf-8") as f:
        f.write("\n".join(str(x) for x in log))

    # 3. Register MLflow Artifacts files
    mlflow.log_artifact(OUT_FILE, artifact_path="cleaned_data")
    mlflow.log_artifact(REPORT, artifact_path="reports")

    print("✅ Saved cleaned dataset:", OUT_FILE, df_final.shape)
    print("📊 Validation report written:", REPORT)
    print(f"MLflow Run ID: {run.info.run_id}")

2025/10/17 13:39:36 INFO mlflow.tracking.fluent: Experiment with name 'Team2_Feature_Engineering_Traffic_Data' does not exist. Creating a new experiment.


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\hi\\AI-CloudSec-System\\data\\traffic'