### MLflow setup & UI

In [None]:

!pip install mlflow -q
!mlflow ui --backend-store-uri sqlite:///../Step2-MLops-Mlflow-to-register-your-model/team5_mlflow.db --port 5000


### Step 0: Load dataset

In [None]:

# Step 0: Load dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import os
# Dataset file name and its location relative to this notebook.
file_path = "team11_BotNeTIoT-L01_label_NoDuplicates.csv"
path = '../Step1-Datasets-Feature-Engineering/' + file_path

# Sampling configuration: keep `s` randomly chosen rows out of `n`.
n = 2426574  # total rows in the file
s = 500000   # sample size
SAMPLE_SEED = 42  # fixed seed so the same rows are sampled on every run

# Start from a known state so the next cell's `df is None` guard can fire
# instead of failing with a NameError when the file is missing.
df = None

if os.path.exists(path):
    # Build the skip list only when there is a file to read (it holds n - s
    # entries). Candidates start at 1, so line 0 (presumably the header row)
    # is never skipped.
    rng = np.random.default_rng(SAMPLE_SEED)
    skip = sorted(rng.choice(np.arange(1, n + 1), size=n - s, replace=False).tolist())
    df = pd.read_csv(path, skiprows=skip)
    print("✅ File found and loaded successfully!")
else:
    print("❌ File not found at:", os.path.abspath(path))
    

### Split `df` into `X_train`, `X_test`, `y_train`, `y_test`

In [9]:
 
# Fail fast with a clear message when the dataset was not loaded. Looking the
# name up in globals() also covers the case where `df` was never assigned,
# which would otherwise surface as a NameError.
if globals().get("df") is None:
    raise RuntimeError("No DataFrame found. Ensure your dataset is loaded into a variable like 'df'.")

# Name of the label column (fixed for this dataset, not inferred).
target = 'label'
if target not in df.columns:
    raise RuntimeError(f"Target column '{target}' not found in the DataFrame.")

X_all = df.drop(columns=[target])
y_all = df[target]

# Numeric-only baseline: non-numeric feature columns are dropped.
num_cols = X_all.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    raise RuntimeError("No numeric features available. Please encode features earlier in the notebook.")
X_all = X_all[num_cols]

# Stratify on the label only when there is more than one class.
stratify_labels = y_all if y_all.nunique() > 1 else None
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Ensure DataFrames, and reset the index on features *and* labels so that row i
# of X_* always corresponds to row i of y_* (previously only the labels were
# re-indexed, leaving the two misaligned by index).
X_train = pd.DataFrame(X_train, columns=num_cols).reset_index(drop=True)
X_test  = pd.DataFrame(X_test, columns=num_cols).reset_index(drop=True)
y_train = pd.Series(y_train).reset_index(drop=True).to_frame('y')
y_test  = pd.Series(y_test).reset_index(drop=True).to_frame('y')

### Train a model **inside MLflow** using the dataset, log metrics, and register the model

In [None]:

# Train the baseline pipeline inside an MLflow run, log its parameters and
# metrics, store the fitted model as a run artifact, and register it.

# Save MLflow DB inside your Step2 folder
db_folder = os.path.join("..", "Step2-MLops-MLflow-to-register-your-model")  # parent folder
os.makedirs(db_folder, exist_ok=True)

# Full path to save .db file
db_path = os.path.join(db_folder, "team5_mlflow.db")

# SQLite URI format (absolute path)
TRACKING_URI = f"sqlite:///{os.path.abspath(db_path)}"

EXPERIMENT_NAME = "team5-BoTNeTIoT"
REGISTERED_MODEL_NAME = "team5_BoTNeTIoT_Model"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Single source of truth for the forest's hyperparameters: the estimator is
# built from this dict and the same values are logged to MLflow, so the logged
# parameters cannot drift from what was actually trained.
RF_PARAMS = {"n_estimators": 200, "max_depth": None, "random_state": 42}

pipe = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('rf', RandomForestClassifier(n_jobs=-1, **RF_PARAMS))
])

# --------- Train & log to MLflow ---------
with mlflow.start_run(run_name="baseline_rf_training") as run:
    # Log basic params
    mlflow.log_params({
        "n_samples": int(len(df)),
        "n_features_numeric": int(len(num_cols)),
        "target": str(target),
        "train_size": int(len(X_train)),
        "test_size": int(len(X_test)),
        "model_type": "RandomForestClassifier",
        "rf_n_estimators": RF_PARAMS["n_estimators"],
        "rf_max_depth": str(RF_PARAMS["max_depth"]),
        "rf_random_state": RF_PARAMS["random_state"]
    })

    # Fit and evaluate. Labels are single-column frames; flatten them once.
    pipe.fit(X_train, y_train.values.ravel())
    y_true = y_test.values.ravel()
    y_pred = pipe.predict(X_test)

    # Weighted averages account for class imbalance; zero_division=0 avoids
    # warnings when a class receives no predictions.
    mlflow.log_metrics({
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "f1": f1_score(y_true, y_pred, average='weighted', zero_division=0)
    })

    # ROC AUC is logged for binary targets only, and stays best-effort: a
    # failure here must not abort the run, but it is reported rather than
    # silently swallowed.
    if y_all.nunique() == 2 and hasattr(pipe, "predict_proba"):
        try:
            # Column 1 holds the probability of the second class in pipe.classes_.
            positive_scores = pipe.predict_proba(X_test)[:, 1]
            mlflow.log_metric("roc_auc", roc_auc_score(y_true, positive_scores))
        except Exception as exc:
            print(f"⚠️ Skipping roc_auc metric: {exc}")

    # Step 1: Log model artifact (do not register yet)
    artifact_path = "model"
    mlflow.sklearn.log_model(sk_model=pipe, artifact_path=artifact_path)

    # Step 2: Explicitly register the model under the configured name
    model_uri = f"runs:/{run.info.run_id}/{artifact_path}"
    result = mlflow.register_model(model_uri=model_uri, name=REGISTERED_MODEL_NAME)

    print(f"✅ Model registered successfully under name: {result.name}, version: {result.version}")

