### MLflow setup & UI

In [None]:

# Install MLflow into the notebook's environment (-q keeps pip output quiet).
!pip install mlflow -q
# Start the MLflow UI on port 5000, backed by the SQLite file team5_mlflow.db
# (the same store URI the training cells set as their tracking URI).
# NOTE(review): `mlflow ui` presumably runs in the foreground, which would keep
# this cell busy until it is interrupted — confirm, or launch it from a
# separate terminal instead.
!mlflow ui --backend-store-uri sqlite:///team5_mlflow.db --port 5000


### Step 0: Load dataset and remove highly correlated features (user-defined preprocessing)

In [None]:

# Step 0: Load dataset and remove highly correlated features
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
# The CSV is sampled while it is read: pandas is given the row numbers to skip
# so that only `s` rows are parsed. Candidate row numbers start at 1, so row 0
# (presumably the header line — TODO confirm) is never skipped.
n = 2426574  # total rows in the file
s = 500000   # sample size
# A seeded generator is used so that the same rows are sampled on every run;
# without a fixed seed each execution would train on a different subset and
# the metrics logged to MLflow could not be reproduced.
skip = np.sort(
    np.random.default_rng(42).choice(np.arange(1, n + 1), size=n - s, replace=False)
).tolist()
df = pd.read_csv('../Step1-Datasets-Feature-Engineering/team11_BotNeTIoT-L01_label_NoDuplicates.csv', skiprows=skip)


# Drop highly correlated features
def drop_highly_correlated_features(df, threshold=0.95):
    """Drop columns that are highly correlated with an earlier column.

    The absolute pairwise correlation matrix of the numeric columns is
    computed, and only the part strictly above the diagonal is inspected, so
    each pair is considered once and the later column of the pair is the one
    that gets dropped.

    Non-numeric columns are ignored when computing correlations and are
    always kept in the returned frame.

    NOTE: every numeric column is treated alike. A label column is not
    protected and will be dropped too if it correlates above ``threshold``
    with a column that precedes it.

    Args:
        df: Input DataFrame. It is not modified; a copy is returned.
        threshold: Absolute correlation above which a column is dropped.

    Returns:
        A tuple ``(reduced_df, corr_matrix, to_drop)``: the copy of ``df``
        without the dropped columns, the absolute correlation matrix computed
        before dropping, and the list of dropped column names.
    """
    df_copy = df.copy()
    # numeric_only=True: on pandas >= 2.0 the default raises as soon as the
    # frame contains a non-numeric (e.g. string) column.
    corr_matrix = df_copy.corr(numeric_only=True).abs()

    # Boolean mask selecting the cells strictly above the main diagonal.
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_triangle = corr_matrix.where(above_diagonal)

    # Masked-out cells are NaN and never compare greater than the threshold.
    exceeds = (upper_triangle > threshold).any(axis=0)
    to_drop = exceeds.index[exceeds.to_numpy()].tolist()

    df_copy = df_copy.drop(columns=to_drop)
    return df_copy, corr_matrix, to_drop

# Replace df with the reduced frame; `corr` holds the correlation matrix
# computed before the drop and `dropped_features` the removed column names.
df, corr, dropped_features = drop_highly_correlated_features(df)

print("Dropped features:", dropped_features)
print("Remaining features:", df.columns.tolist())

# Visualize correlation heatmap after feature removal
# numeric_only=True: correlations are only defined for numeric columns, and on
# pandas >= 2.0 the default raises if any non-numeric column is present.
corr_after = df.corr(numeric_only=True).abs()
plt.figure(figsize=(30, 30))
sns.heatmap(corr_after, annot=False, cmap="coolwarm")
plt.title('Correlation Heatmap (After Feature Removal)')
plt.show()

# Print the correlation matrix
print(corr_after)


### Train a model **inside MLflow** using the dataset, log metrics, and register the model

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn

# MLflow configuration for this notebook.
TRACKING_URI = "sqlite:///team5_mlflow.db"  # SQLite database file used as the tracking store
EXPERIMENT_NAME = "team5-BoTNeTIoT"  # experiment that runs are grouped under
REGISTERED_MODEL_NAME = "team5_BoTNeTIoT_Model"  # name intended for the model registry entry

# Point the MLflow client at the store and make the experiment active for the
# runs started below.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Find a DataFrame
def _find_dataframe():
    """Search the module/notebook globals for a DataFrame to train on.

    A global qualifies when it is a ``pd.DataFrame`` with more than 10 rows.
    A fixed list of conventional names is checked first, in order; if none of
    them qualifies, the first qualifying global in namespace order is used.

    Returns:
        ``(name, dataframe)`` for the match, or ``(None, None)`` if nothing
        qualifies.
    """
    namespace = globals()

    def _usable(candidate):
        # Tiny frames (10 rows or fewer) are not considered datasets.
        return isinstance(candidate, pd.DataFrame) and len(candidate) > 10

    for preferred in ('df', 'data', 'dataset', 'bot_df', 'train_df', 'full_df'):
        if preferred in namespace and _usable(namespace[preferred]):
            return preferred, namespace[preferred]

    # No conventional name matched: fall back to any qualifying global.
    return next(
        ((key, value) for key, value in namespace.items() if _usable(value)),
        (None, None),
    )

# Pick up the working DataFrame from the notebook namespace; stop if there is none.
df_name, df = _find_dataframe()
if df is None:
    raise RuntimeError("No DataFrame found. Ensure your dataset is loaded into a variable like 'df'.")
print(f"Using DataFrame '{df_name}' with shape {df.shape}")

# Infer target
# 1) First candidate name that matches a column exactly.
cands = ['label','Label','attack','Attack','class','Class','target','Target','y','attack_type','Attack_type']
target = next((cand for cand in cands if cand in df.columns), None)

# 2) Otherwise retry case-insensitively against the lower-cased column names.
if target is None:
    lower = {col.lower(): col for col in df.columns}
    target = next(
        (lower[key] for key in ['label','attack','class','target','y','attack_type'] if key in lower),
        None,
    )

# 3) Last resort: assume the final column is the target and say so.
if target is None:
    target = df.columns[-1]
    print(f"[Warning] Falling back to last column as target: {target}")
print("Target column:", target)

# Separate the label column from the candidate feature columns.
y_all = df[target]
X_all = df.drop(columns=[target])

# Numeric-only baseline
num_cols = X_all.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    raise RuntimeError("No numeric features available. Please encode features earlier in the notebook.")
X_all = X_all[num_cols]

# Hold out 20% for testing; stratify on the label only when it has more than
# one distinct value.
if y_all.nunique() > 1:
    stratify_labels = y_all
else:
    stratify_labels = None
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Ensure DataFrames
# Features keep the numeric column order; labels become one-column frames
# named 'y' with a fresh 0..n-1 index.
X_train, X_test = (
    pd.DataFrame(part, columns=num_cols) for part in (X_train, X_test)
)
y_train, y_test = (
    pd.Series(part).reset_index(drop=True).to_frame('y') for part in (y_train, y_test)
)

# Baseline model: scaling (without centering) followed by a random forest.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=False)),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
])

# --------- Train & log to MLflow ---------
# One MLflow run covers: parameter logging, fitting, evaluation on the held-out
# split, logging the fitted pipeline as an artifact, and registering it.
with mlflow.start_run(run_name="baseline_rf_training") as run:
    # Log basic params. The forest's hyperparameters are read back from the
    # pipeline instead of being retyped, so the logged values cannot drift
    # from the estimator that is actually trained.
    rf_params = pipe.get_params()
    mlflow.log_params({
        "n_samples": int(len(df)),
        "n_features_numeric": int(len(num_cols)),
        "target": str(target),
        "train_size": int(len(X_train)),
        "test_size": int(len(X_test)),
        "model_type": type(pipe.named_steps['rf']).__name__,
        "rf_n_estimators": rf_params["rf__n_estimators"],
        "rf_max_depth": str(rf_params["rf__max_depth"]),
        "rf_random_state": rf_params["rf__random_state"],
    })

    # Fit and evaluate
    pipe.fit(X_train, y_train.values.ravel())
    y_true = y_test.values.ravel()  # flatten the one-column label frame once
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_true, y_pred)
    # Weighted averages account for class imbalance; zero_division=0 avoids
    # warnings for classes that are never predicted.
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    mlflow.log_metrics({
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    })

    # ROC-AUC is only computed for a binary target, using the probability of
    # the second class.
    if y_all.nunique() == 2 and hasattr(pipe, "predict_proba"):
        try:
            auc = roc_auc_score(y_true, pipe.predict_proba(X_test)[:, 1])
        except Exception as exc:
            # The metric stays optional, but a failure is reported instead of
            # being silently discarded.
            print(f"[Warning] roc_auc could not be computed and was not logged: {exc}")
        else:
            mlflow.log_metric("roc_auc", auc)

    # Step 1: Log model artifact (do not register yet)
    artifact_path = "model"
    mlflow.sklearn.log_model(sk_model=pipe, artifact_path=artifact_path)

    # Step 2: Explicitly register the model
    # Use the shared constant rather than repeating the registry name literal.
    model_uri = f"runs:/{run.info.run_id}/{artifact_path}"
    result = mlflow.register_model(model_uri=model_uri, name=REGISTERED_MODEL_NAME)

    print(f"✅ Model registered successfully under name: {result.name}, version: {result.version}")



### (Optional) Register an already-trained model object to MLflow

In [None]:

import mlflow
import mlflow.sklearn
# Re-declare the MLflow configuration so this cell can run on its own.
TRACKING_URI = "sqlite:///team5_mlflow.db"  # SQLite database file used as the tracking store
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("team5-BoTNeTIoT")  # experiment the registration run is logged under
REGISTERED_MODEL_NAME = "team5_BoTNeTIoT_Model"  # registry name passed to log_model below

def _find_trained_model():
    """Locate a model-like object among the module/notebook globals.

    A global qualifies when it is an instance (not a class) that exposes both
    ``fit`` and ``predict``. Classes are excluded because imported estimator
    classes carry those attributes as well, yet are not model objects that
    can be logged.

    Conventional variable names are preferred, in order; otherwise the last
    qualifying global in namespace order is returned.

    NOTE: whether the object has actually been fitted is not checked.

    Returns:
        ``(name, obj)`` for the chosen candidate, or ``(None, None)`` when no
        global qualifies.
    """
    # Work on a snapshot so the scan cannot be disturbed by the namespace
    # changing while it is being iterated.
    namespace = dict(globals())
    candidates = {
        name: obj
        for name, obj in namespace.items()
        if not isinstance(obj, type) and hasattr(obj, "fit") and hasattr(obj, "predict")
    }

    for preferred in ('model', 'clf', 'rf', 'pipeline', 'estimator'):
        if preferred in candidates:
            return preferred, candidates[preferred]

    if candidates:
        last_name = list(candidates)[-1]
        return last_name, candidates[last_name]
    return None, None

# Look for a model object in the notebook namespace and, when one exists, log
# it in a dedicated run and register it under REGISTERED_MODEL_NAME.
name, obj = _find_trained_model()
if obj is not None:
    with mlflow.start_run(run_name="register_existing_model"):
        # Passing registered_model_name makes log_model register the artifact as well.
        mlflow.sklearn.log_model(obj, "model", registered_model_name=REGISTERED_MODEL_NAME)
        print(f"Registered existing model under '{REGISTERED_MODEL_NAME}'.")
else:
    print("No trained model found to register.")
