Importing and Connecting Drive

In [22]:
# Connect this Colab runtime to Google Drive.
import os
from google.colab import drive

drive.mount('/content/gdrive')

# Use the Drive root as the working directory from here on.
os.chdir('/content/gdrive/MyDrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Installation

Training Model

In [24]:
# ==============================
# STEP 1: Install dependencies
# ==============================
!pip install lightgbm scikit-learn joblib

# ==============================
# STEP 2: Import libraries
# ==============================
import pandas as pd
import lightgbm as lgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# ==============================
# STEP 3: Load dataset
# ==============================
# Read the CSV stored on the mounted Drive into a DataFrame.
csv_path = "/content/gdrive/MyDrive/Phishing_Legitimate_full 3.csv"
df = pd.read_csv(csv_path)

# Quick sanity summary: dimensions, column names and the first rows.
print("✅ Dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", list(df.columns))
print(df.head())

# ==============================
# STEP 4: Define features & target
# ==============================
# The label column becomes the target; every column except the row id and the
# label itself is kept as a feature.
target_col = "CLASS_LABEL"
y = df[target_col]
X = df.drop(columns=["id", target_col])

print("\nFeature shape:", X.shape)
print("Target distribution:\n", y.value_counts())

# ==============================
# STEP 5: Train-test split
# ==============================
# Hold out 20% of the rows for testing; the split is seeded and stratified on y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("\nData Split:")
print("Train size:", X_train.shape, "Test size:", X_test.shape)

# ==============================
# STEP 6: Train LightGBM Model
# ==============================
# Hyperparameters gathered in one mapping so they are easy to read and tune.
lgbm_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "num_leaves": 100,
    "random_state": 42,
    "n_jobs": -1,
}
model = lgb.LGBMClassifier(**lgbm_params)

# Fit on the training split only; the test split is kept for evaluation.
print("\n🚀 Training LightGBM model...")
model.fit(X_train, y_train)

# ==============================
# STEP 7: Evaluate the model
# ==============================
# Predict labels for the held-out split and compare them with the true labels.
y_pred = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\n📊 Confusion Matrix:\n", conf_matrix)

class_report = classification_report(y_test, y_pred)
print("\n📑 Classification Report:\n", class_report)

# f1_score is called with its default settings and reported as a percentage.
f1 = f1_score(y_test, y_pred)
print(f"\n🎯 Final F1 Score: {f1*100:.2f}%")

# ==============================
# STEP 8: Save the model
# ==============================
# Serialize the fitted classifier to a file on the mounted Drive.
model_path = "/content/gdrive/MyDrive/lgbm_model.pkl"
joblib.dump(value=model, filename=model_path)

print(f"\n✅ Model saved at: {model_path}")

✅ Dataset loaded successfully!
Shape: (10000, 50)
Columns: ['id', 'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore', 'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash', 'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress', 'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname', 'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath', 'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks', 'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms', 'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction', 'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch', 'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow', 'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle', 'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT', 'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT', 'PctExtNullSelfRedirectHyperlinksRT', 'CLASS_LABEL']
   id  NumDots  Subdo