In [3]:
# Cell 1: Imports

import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse


In [4]:
# Cell 2: Load Dataset

df_phi = pd.read_csv("../data/raw/PhiUSIIL_Phishing_URL_Dataset.csv")

# Keep only URL and label
df_phi = df_phi[["URL", "label"]]

df_phi.head()


Unnamed: 0,URL,label
0,https://www.southbankmosaics.com,1
1,https://www.uni-mainz.de,1
2,https://www.voicefmradio.co.uk,1
3,https://www.sfnmjournal.com,1
4,https://www.rewildingargentina.org,1


In [5]:
# Cell 3: Basic Info

print("Shape:", df_phi.shape)
print("\nClass Distribution:")
print(df_phi["label"].value_counts())

Shape: (235795, 2)

Class Distribution:
label
1    134850
0    100945
Name: count, dtype: int64


In [6]:
# Cell 4: Lexical Feature Extraction

def extract_lexical_features(url):
    features = {}
    
    parsed = urlparse(url)
    
    # Basic length features
    features["url_length"] = len(url)
    features["domain_length"] = len(parsed.netloc)
    features["path_length"] = len(parsed.path)
    
    # Special character counts
    features["dot_count"] = url.count(".")
    features["hyphen_count"] = url.count("-")
    features["at_count"] = url.count("@")
    features["question_count"] = url.count("?")
    features["ampersand_count"] = url.count("&")
    features["equal_count"] = url.count("=")
    
    # Digit features
    features["digit_count"] = sum(c.isdigit() for c in url)
    features["digit_ratio"] = features["digit_count"] / max(len(url), 1)
    
    # IP address presence
    features["has_ip"] = 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0
    
    # Suspicious tokens
    suspicious_words = ["login", "secure", "verify", "update", "account", "bank", "paypal"]
    features["suspicious_token_count"] = sum(word in url.lower() for word in suspicious_words)
    
    # HTTPS usage
    features["is_https"] = 1 if parsed.scheme == "https" else 0
    
    return features


In [7]:
# Cell 5: Apply Feature Extraction (Sample for speed)

df_phi_sample = df_phi.sample(50000, random_state=42)

features_list = df_phi_sample["URL"].apply(extract_lexical_features)

X_phi = pd.DataFrame(features_list.tolist())
y_phi = df_phi_sample["label"]

print("Feature shape:", X_phi.shape)
X_phi.head()


Feature shape: (50000, 14)


Unnamed: 0,url_length,domain_length,path_length,dot_count,hyphen_count,at_count,question_count,ampersand_count,equal_count,digit_count,digit_ratio,has_ip,suspicious_token_count,is_https
0,25,17,0,3,0,0,0,0,0,0,0.0,0,0,1
1,60,28,24,2,0,0,0,0,0,1,0.016667,0,0,1
2,385,27,350,4,8,0,0,0,0,51,0.132468,0,0,1
3,19,6,6,1,0,0,0,0,0,2,0.105263,0,0,0
4,36,28,0,3,0,0,0,0,0,0,0.0,0,0,1
