<a href="https://colab.research.google.com/github/Nicole0906/DLI_Group_Assignment/blob/main/Tai%20Wei%20Kent_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing and Connecting Drive

In [27]:
# Colab-only setup: attach Google Drive, then make it the working directory.
import os

from google.colab import drive

# Expose the user's Drive inside the Colab VM at this mount point.
drive.mount('/content/gdrive')

# Work from the Drive root so relative paths resolve against Drive storage.
os.chdir('/content/gdrive/MyDrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Training Model

In [29]:
# ==============================
# STEP 1: Install dependencies
# ==============================
!pip install -q lightgbm scikit-learn joblib tabulate gdown

# ==============================
# STEP 2: Imports & Reproducibility
# ==============================
import os, random, time, sys, subprocess, pathlib
import numpy as np
import pandas as pd
import lightgbm as lgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tabulate import tabulate

# ---- Reproducibility: one constant drives every seed used below ----
SEED = 42

# Seed Python's and NumPy's global random generators from the same value.
random.seed(SEED)
np.random.seed(SEED)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only reaches child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Multi-threaded LightGBM/BLAS runs may still differ between executions,
# so training is pinned to a single thread for strict reproducibility.
LGBM_N_JOBS = 1

# ==============================
# STEP 3: Fetch & Load dataset from GitHub
# ==============================
REPO_URL = "https://github.com/Nicole0906/DLI_Group_Assignment.git"
CLONE_DIR = "./DLI_Group_Assignment"
# CSV location inside the checkout; adjust if the file lives in a subfolder.
DATA_PATH = os.path.join(CLONE_DIR, "Phishing_Legitimate_full 3.csv")

# An existing checkout directory is reused as-is; git only runs when it is missing.
repo_already_cloned = os.path.exists(CLONE_DIR)
if not repo_already_cloned:
    print(f"⬇️  Cloning repository from: {REPO_URL}")
    clone_cmd = ["git", "clone", REPO_URL, CLONE_DIR]
    # check=True raises CalledProcessError if git exits with a non-zero status.
    subprocess.run(clone_cmd, check=True)

# Stop with an explicit message if the CSV is not where we expect it.
dataset_present = os.path.exists(DATA_PATH)
if not dataset_present:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)

# Quick sanity summary of what was loaded.
print("✅ Dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# ==============================
# STEP 4: Define features & target
# ==============================
TARGET_COL = "CLASS_LABEL"
# Columns excluded from the feature matrix; "id" is dropped only if present.
ID_COLS = [col for col in ("id",) if col in df.columns]

# Target vector and feature matrix (everything except id columns and target).
y = df[TARGET_COL]
X = df.drop(columns=[*ID_COLS, TARGET_COL])

print("\nFeature shape:", X.shape)
print("Target distribution:\n", y.value_counts())

# Human-readable class names. The defaults assume 0 = legitimate and
# 1 = phishing; edit the mapping if the dataset encodes labels differently.
unique_labels = sorted(y.unique())
label_map = {0: "Legitimate", 1: "Phishing"}
# Any label outside the default mapping gets a generic "Class <label>" name.
label_map.update(
    {lbl: f"Class {lbl}" for lbl in unique_labels if lbl not in label_map}
)

# ==============================
# STEP 5: Train-test split (seeded)
# ==============================
# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded with SEED so the same partition is produced on every run.
split_options = {"test_size": 0.2, "random_state": SEED, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nData Split:")
print("Train size:", X_train.shape, "Test size:", X_test.shape)

# ==============================
# STEP 6: Train LightGBM Model (deterministic settings)
# ==============================
# Hyper-parameters are collected in one place so they are easy to tune.
lgbm_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=100,
    random_state=SEED,
    n_jobs=LGBM_N_JOBS,
    # The two flags below are extra knobs intended to reduce run-to-run
    # nondeterminism (force_row_wise helps determinism across versions).
    deterministic=True,
    force_row_wise=True,
)
model = lgb.LGBMClassifier(**lgbm_params)

print("\n🚀 Training LightGBM model...")
model.fit(X_train, y_train)

# ==============================
# STEP 7: Evaluate the model
# ==============================
# Time one batched predict() call and report the average cost per sample.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and may be coarse or be
# adjusted while the measurement is running.
start = time.perf_counter()
y_pred = model.predict(X_test)
end = time.perf_counter()
inference_time = (end - start) / len(X_test) * 1000.0  # ms/sample

print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Metrics as a nested dict (output_dict=True) so later steps can look up
# per-class entries plus "accuracy", "macro avg" and "weighted avg".
# zero_division=0 reports 0 for a metric whose denominator is zero.
report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

# ==============================
# STEP 8: Build Custom Table (rounded to 3 decimals, with Support)
# ==============================
def _metrics_row(title, stats):
    """Build one table row from a classification-report entry.

    `stats` is a dict with "precision", "recall", "f1-score" and "support".
    Metrics are rounded to 3 decimals, support is cast to int, and the
    per-sample inference time (same for every row) is the last column.
    """
    return [
        title,
        round(stats["precision"], 3),
        round(stats["recall"], 3),
        round(stats["f1-score"], 3),
        int(stats["support"]),
        round(inference_time, 3),
    ]


classes = sorted(unique_labels)

# One row per class label; the report is keyed by the label's string form,
# and labels missing from the report are skipped.
table_rows = [
    _metrics_row(f"{cls} ({label_map.get(cls, f'Class {cls}')})", report[str(cls)])
    for cls in classes
    if str(cls) in report
]

# Accuracy is a single scalar, so it is repeated under all three metric
# columns and paired with the total number of test samples.
accuracy_3dp = round(report["accuracy"], 3)
table_rows.append(
    ["Accuracy", accuracy_3dp, accuracy_3dp, accuracy_3dp, len(y_test), round(inference_time, 3)]
)

# Macro and weighted averages use the same layout as the per-class rows.
for row_title, report_key in (("Macro avg", "macro avg"), ("Weighted avg", "weighted avg")):
    table_rows.append(_metrics_row(row_title, report[report_key]))

print("\n📑 Final Evaluation Table:\n")
table_headers = ["Class", "Precision", "Recall", "F1-score", "Support", "Inference Time (ms/sample)"]
print(tabulate(table_rows, headers=table_headers, tablefmt="grid"))

# ==============================
# STEP 9: Print Final F1 Score + One-line verdict
# ==============================
# Compare the UNROUNDED weighted F1 against the target. Rounding first would
# let a score such as 0.9796 display as 0.98 and wrongly pass a 0.98 target.
# The rounded value is kept for display only, so in that edge case the printed
# score can equal the target while the verdict is still "not met".
weighted_f1 = report["weighted avg"]["f1-score"]
final_f1 = round(weighted_f1, 3)
TARGET_F1 = 0.98  # <-- updated target here
verdict = "target met ✅" if weighted_f1 >= TARGET_F1 else "target not met ❌"
print(f"\n🎯 Final F1 Score: {final_f1}")
print(f"✅ Verdict: Achieved F1 = {final_f1}, target = {TARGET_F1} → {verdict}")

# ==============================
# STEP 10: Save the model
# ==============================
# Serialize the fitted classifier to a file relative to the current working
# directory so it can be reloaded later without retraining.
MODEL_PATH = "./lgbm_model.pkl"
joblib.dump(value=model, filename=MODEL_PATH)
print(f"\n✅ Model saved at: {MODEL_PATH}")

✅ Dataset loaded successfully!
Shape: (10000, 50)
Columns: ['id', 'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash', 'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress', 'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname', 'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath', 'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks', 'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms', 'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction', 'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch', 'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow', 'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle', 'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT', 'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT', 'PctExtNullSelfRedirectHyperlinksRT', 'CLASS_LABEL']

Feature shape: (1000