In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv('dataset_phishing.csv')

# Quickly preview data
print(df.columns)        # See available column names
print(df.head())         # Preview first few rows


Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 

In [2]:
import re
from urllib.parse import urlparse

def extract_url_features(url):
    uses_https = int(urlparse(url).scheme == 'https')
    url_length = len(url)
    num_dots = url.count('.')
    has_ip = int(bool(re.search(r'(\d{1,3}\.){3}\d{1,3}', urlparse(url).netloc)))
    parsed = urlparse(url)
    has_at = '@' in url
    double_slash_in_path = '//' in parsed.path
    has_suspicious_chars = int(has_at or double_slash_in_path)
    return [uses_https, url_length, num_dots, has_ip, has_suspicious_chars]

feature_names = ['uses_https', 'url_length', 'num_dots', 'has_ip', 'has_suspicious_chars']
X = df['url'].apply(extract_url_features)
X = pd.DataFrame(X.tolist(), columns=feature_names)

y = df['status'].map({'phishing': 1, 'legitimate': 0})  # Converts label to 1/0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [4]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6767279090113736
              precision    recall  f1-score   support

           0       0.66      0.73      0.69      1143
           1       0.70      0.62      0.66      1143

    accuracy                           0.68      2286
   macro avg       0.68      0.68      0.68      2286
weighted avg       0.68      0.68      0.68      2286



In [5]:
import joblib
joblib.dump(model, "phishing_rf_model.pkl")


['phishing_rf_model.pkl']

In [7]:
import os
print(os.listdir())


['Untitled Folder', 'prac9.py', 'Untitled10.ipynb', '.zshrc.save', 'Untitled7.ipynb', '.eclipse', 'sar3b.sh', '.config', 'Music', 'MEGABASTERD_UPLOAD_LOGS', 'u.item', '.zprofile.pysave', 'AAPL_stock_data.csv', '.condarc', '.docker', 'sar4c.sh', 'tempCodeRunnerFile', 'labels.csv', 'Untitled12.ipynb', '.codeium', 'Untitled5.ipynb', 'Clang', 'Untitled1.ipynb', '2aa.sh', 'index.html', 'shdd.sh', '.DS_Store', 'sar4b.sh', '.CFUserTextEncoding', 'phishing_rf_model.pkl', 'Apriori.ipynb', '.xonshrc', 'sar3c.sh', '.bash_profile.save', 'Untitled3.ipynb', 'Untitled14.ipynb', 'Untitled.ipynb', 'songdataset.csv', '.zshrc', '.packettracer', '.templateengine', 'Untitled4.ipynb', 'extracted_code.ipynb', '.streamlit', '.local', 'Untitled13.ipynb', 'Untitled6.ipynb', 'Pictures', 'Untitled11.ipynb', '.atom', '.zprofile', '.tabnine', 'MEGA downloads', 'sar4.sh', 'movie_by_name.py', '2a.sh', '.nuget', 'dog_breed_detection.ipynb', 'dataset_phishing.csv', '.zsh_history', 'Untitled2.ipynb', '.p2', '.ipython', 