<a href="https://colab.research.google.com/github/Nicole0906/DLI_Group_Assignment/blob/main/Nicole_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

# Change directory to the desired location in Google Drive
import os
os.chdir('/content/gdrive/MyDrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Traning Model

In [30]:
import os
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix,
    precision_score, recall_score, roc_auc_score
)

# --------------------
# 1. Clone GitHub Repo
# --------------------
REPO_URL = "https://github.com/YOUR_USERNAME/YOUR_REPO.git"  # <-- change this
CLONE_DIR = "/content/DLI_Group_Assignment"

if not os.path.exists(CLONE_DIR):
    !git clone {REPO_URL} {CLONE_DIR}

# --------------------
# 2. Find CSV file automatically
# --------------------
csv_file = None
for root, dirs, files in os.walk(CLONE_DIR):
    for file in files:
        if file.lower().endswith(".csv"):
            csv_file = os.path.join(root, file)
            break
    if csv_file:
        break

if csv_file is None:
    raise FileNotFoundError("No CSV file found in the repository.")

print(f"✅ Found CSV file: {csv_file}")

# --------------------
# 3. Load dataset
# --------------------
df = pd.read_csv(csv_file)

if "CLASS_LABEL" not in df.columns:
    raise ValueError("Dataset must have 'CLASS_LABEL' column as target.")

X = df.drop(columns=["CLASS_LABEL"])
y = df["CLASS_LABEL"]

# --------------------
# 4. Split dataset
# --------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------
# 5. Pipeline: Scaling + Logistic Regression
# --------------------
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# --------------------
# 6. Train & Measure Inference Time
# --------------------
pipeline.fit(X_train, y_train)

start_time = time.time()
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
end_time = time.time()

inference_time_ms = (end_time - start_time) * 1000

inference_time_ms = (end_time - start_time) * 1000

# Metrics
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Number of parameters in logistic regression
num_params = np.prod(pipeline.named_steps['logreg'].coef_.shape) + pipeline.named_steps['logreg'].intercept_.shape[0]

# Results
print("\n📊 Logistic Regression Model Results")
print(f"Accuracy     : {acc:.6f}")
print(f"Precision    : {precision:.6f}")
print(f"Recall       : {recall:.6f}")
print(f"F1-score     : {f1:.6f}")
print(f"ROC-AUC      : {roc_auc:.6f}")
print(f"Parameters   : {num_params}")
print(f"Inference Time: {inference_time_ms:.3f} ms")

# Classification Report & Confusion Matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=6))

print("Confusion Matrix:")
print(cm)


✅ Found CSV file: /content/DLI_Group_Assignment/Phishing_Legitimate_full 3.csv

📊 Logistic Regression Model Results
Accuracy     : 0.999000
Precision    : 0.999000
Recall       : 0.999000
F1-score     : 0.999000
ROC-AUC      : 0.999978
Parameters   : 50
Inference Time: 7.621 ms

Classification Report:
              precision    recall  f1-score   support

           0   0.999000  0.999000  0.999000      1000
           1   0.999000  0.999000  0.999000      1000

    accuracy                       0.999000      2000
   macro avg   0.999000  0.999000  0.999000      2000
weighted avg   0.999000  0.999000  0.999000      2000

Confusion Matrix:
[[999   1]
 [  1 999]]


In [31]:
from tabulate import tabulate  # install with: pip install tabulate

# Create a table-like output
headers = ["Model", "Accuracy", "Precision", "Recall", "F1", "ROC-AUC", "Params", "Inference (ms)"]
table_data = [[
    "Logistic Regression (TF-IDF)",
    f"{acc:.3f}",
    f"{precision:.3f}",
    f"{recall:.3f}",
    f"{f1:.3f}",
    f"{roc_auc:.3f}",
    num_params,
    f"{inference_time_ms:.2f}"
]]

print("\nMODEL EVALUATION TABLE")
print(tabulate(table_data, headers=headers, tablefmt="grid"))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=3))

print("Confusion Matrix:")
print(cm)



MODEL EVALUATION TABLE
+------------------------------+------------+-------------+----------+-------+-----------+----------+------------------+
| Model                        |   Accuracy |   Precision |   Recall |    F1 |   ROC-AUC |   Params |   Inference (ms) |
| Logistic Regression (TF-IDF) |      0.999 |       0.999 |    0.999 | 0.999 |         1 |       50 |             7.62 |
+------------------------------+------------+-------------+----------+-------+-----------+----------+------------------+

Classification Report:
              precision    recall  f1-score   support

           0      0.999     0.999     0.999      1000
           1      0.999     0.999     0.999      1000

    accuracy                          0.999      2000
   macro avg      0.999     0.999     0.999      2000
weighted avg      0.999     0.999     0.999      2000

Confusion Matrix:
[[999   1]
 [  1 999]]


In [32]:
from sklearn.metrics import classification_report
import pandas as pd
from tabulate import tabulate

# Get classification report as dictionary
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Convert to DataFrame for nicer formatting
report_df = pd.DataFrame(report_dict).transpose()

# Round values to 3 decimal places
report_df = report_df.round(3)

# Print nicely using tabulate
print("\nCLASSIFICATION REPORT")
print(tabulate(report_df, headers='keys', tablefmt='grid'))



CLASSIFICATION REPORT
+--------------+-------------+----------+------------+-----------+
|              |   precision |   recall |   f1-score |   support |
| 0            |       0.999 |    0.999 |      0.999 |  1000     |
+--------------+-------------+----------+------------+-----------+
| 1            |       0.999 |    0.999 |      0.999 |  1000     |
+--------------+-------------+----------+------------+-----------+
| accuracy     |       0.999 |    0.999 |      0.999 |     0.999 |
+--------------+-------------+----------+------------+-----------+
| macro avg    |       0.999 |    0.999 |      0.999 |  2000     |
+--------------+-------------+----------+------------+-----------+
| weighted avg |       0.999 |    0.999 |      0.999 |  2000     |
+--------------+-------------+----------+------------+-----------+
