In [1]:
import numpy as np
import pandas as pd
import re
import tldextract
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from urllib.parse import urlparse
import whois
import datetime

# Load dataset for feature scaling and model training
file_path = "/content/Dataset-2.xlsx"
# Open the workbook through a context manager so its file handle is closed
# as soon as the sheet has been read.
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name='Worksheet')

# Encode the target label: legitimate -> 0, phishing -> 1.
df['status'] = df['status'].map({'legitimate': 0, 'phishing': 1})
# .map() turns every label outside the mapping into NaN; stop here with a
# clear message instead of carrying NaN labels into training.
if df['status'].isna().any():
    raise ValueError("Column 'status' contains values other than 'legitimate' or 'phishing'")

# Drop the raw URL text, then remove rows that are exact duplicates.
# Non-inplace calls keep the cell easy to re-run and to read as a pipeline.
df = df.drop(columns=['url']).drop_duplicates()

# Selecting key features
# NOTE: extract_features() must produce its values in exactly this order.
selected_features = [
    'length_url', 'nb_dots', 'nb_hyphens', 'https_token', 'prefix_suffix',
    'random_domain', 'shortening_service', 'nb_redirection', 'web_traffic',
    'dns_record', 'google_index', 'page_rank', 'domain_age'
]
X = df[selected_features]
y = df['status']

# Normalize the features
# The scaler is fitted on the training columns and reused at inference time.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Train the Random Forest model (fixed random_state for repeatable results)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Feature extraction from URL
def extract_features(url):
    """Build the scaled feature row for a single URL.

    The 13 values are produced in the same order as ``selected_features`` so
    that the module-level ``scaler`` fitted on those columns can be applied.

    Parameters
    ----------
    url : str
        The URL to featurize.

    Returns
    -------
    The output of ``scaler.transform`` for the one-row feature matrix.

    Notes
    -----
    Performs a live WHOIS lookup. If the lookup fails, the DNS-record flag
    and the domain age are both reported as 0 instead of raising.
    """
    parsed_url = urlparse(url)
    domain_info = tldextract.extract(url)
    # Join only the non-empty parts so a host without a public suffix
    # (e.g. an IP address) is not looked up with a trailing dot.
    domain = '.'.join(part for part in (domain_info.domain, domain_info.suffix) if part)

    # whois_info stays None when the lookup fails, so the code below never
    # touches an unbound name.
    whois_info = None
    try:
        whois_info = whois.whois(domain)
    except Exception:
        # Best effort: treat any lookup failure as "no WHOIS data".
        pass

    if whois_info is not None:
        domain_age = _domain_age_years(whois_info.creation_date)
        has_dns_record = 1 if whois_info.domain_name else 0
    else:
        domain_age = 0
        has_dns_record = 0

    # NOTE(review): the three constant placeholders below are always 1;
    # confirm that this matches the encoding used in the training data.
    features = [
        len(url),  # URL length
        url.count('.'),  # Number of dots
        url.count('-'),  # Number of hyphens
        1 if parsed_url.scheme == 'https' else 0,  # HTTPS presence
        1 if '-' in domain_info.domain else 0,  # Prefix-Suffix
        1 if re.search(r'[^a-zA-Z0-9]', domain_info.domain) else 0,  # Random characters in domain
        1 if any(short in url for short in ['bit.ly', 'tinyurl', 'goo.gl']) else 0,  # Shortening service
        url.count('/'),  # Number of redirections
        1,  # Web traffic placeholder (API required for real data)
        has_dns_record,  # DNS record
        1,  # Google index placeholder (API required for real data)
        1,  # Page rank placeholder (API required for real data)
        domain_age,  # Domain age
    ]

    return scaler.transform([features])


def _domain_age_years(creation_date):
    """Return the age in years for a WHOIS creation date, or 0 if unusable.

    ``creation_date`` may be a single datetime, a list of datetimes (the
    first entry is used), or something else such as ``None``.
    """
    if isinstance(creation_date, (list, tuple)):
        creation_date = creation_date[0] if creation_date else None
    if not isinstance(creation_date, datetime.datetime):
        return 0
    # Use the same timezone-awareness as the creation date so the
    # subtraction works for both naive and aware datetimes.
    now = datetime.datetime.now(creation_date.tzinfo)
    return (now - creation_date).days / 365

ModuleNotFoundError: No module named 'tldextract'

In [2]:
!pip install tldextract


Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [5]:
import numpy as np
import pandas as pd
import re
import tldextract
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from urllib.parse import urlparse
import whois
import datetime

# Load dataset for feature scaling and model training
file_path = "/content/Dataset-2.xlsx"
# Open the workbook through a context manager so its file handle is closed
# as soon as the sheet has been read.
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name='Worksheet')

# Encode the target label: legitimate -> 0, phishing -> 1.
df['status'] = df['status'].map({'legitimate': 0, 'phishing': 1})
# .map() turns every label outside the mapping into NaN; stop here with a
# clear message instead of carrying NaN labels into training.
if df['status'].isna().any():
    raise ValueError("Column 'status' contains values other than 'legitimate' or 'phishing'")

# Drop the raw URL text, then remove rows that are exact duplicates.
# Non-inplace calls keep the cell easy to re-run and to read as a pipeline.
df = df.drop(columns=['url']).drop_duplicates()

# Selecting key features
# NOTE: extract_features() must produce its values in exactly this order.
selected_features = [
    'length_url', 'nb_dots', 'nb_hyphens', 'https_token', 'prefix_suffix',
    'random_domain', 'shortening_service', 'nb_redirection', 'web_traffic',
    'dns_record', 'google_index', 'page_rank', 'domain_age'
]
X = df[selected_features]
y = df['status']

# Normalize the features
# The scaler is fitted on the training columns and reused at inference time.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Train the Random Forest model (fixed random_state for repeatable results)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Feature extraction from URL
def extract_features(url):
    """Build the scaled feature row for a single URL.

    The 13 values are produced in the same order as ``selected_features`` so
    that the module-level ``scaler`` fitted on those columns can be applied.

    Parameters
    ----------
    url : str
        The URL to featurize.

    Returns
    -------
    The output of ``scaler.transform`` for the one-row feature matrix.

    Notes
    -----
    Performs a live WHOIS lookup. If the lookup fails, the DNS-record flag
    and the domain age are both reported as 0 instead of raising.
    """
    parsed_url = urlparse(url)
    domain_info = tldextract.extract(url)
    # Join only the non-empty parts so a host without a public suffix
    # (e.g. an IP address) is not looked up with a trailing dot.
    domain = '.'.join(part for part in (domain_info.domain, domain_info.suffix) if part)

    # whois_info stays None when the lookup fails, so the code below never
    # touches an unbound name.
    whois_info = None
    try:
        whois_info = whois.whois(domain)
    except Exception:
        # Best effort: treat any lookup failure as "no WHOIS data".
        pass

    if whois_info is not None:
        domain_age = _domain_age_years(whois_info.creation_date)
        has_dns_record = 1 if whois_info.domain_name else 0
    else:
        domain_age = 0
        has_dns_record = 0

    # NOTE(review): the three constant placeholders below are always 1;
    # confirm that this matches the encoding used in the training data.
    features = [
        len(url),  # URL length
        url.count('.'),  # Number of dots
        url.count('-'),  # Number of hyphens
        1 if parsed_url.scheme == 'https' else 0,  # HTTPS presence
        1 if '-' in domain_info.domain else 0,  # Prefix-Suffix
        1 if re.search(r'[^a-zA-Z0-9]', domain_info.domain) else 0,  # Random characters in domain
        1 if any(short in url for short in ['bit.ly', 'tinyurl', 'goo.gl']) else 0,  # Shortening service
        url.count('/'),  # Number of redirections
        1,  # Web traffic placeholder (API required for real data)
        has_dns_record,  # DNS record
        1,  # Google index placeholder (API required for real data)
        1,  # Page rank placeholder (API required for real data)
        domain_age,  # Domain age
    ]

    return scaler.transform([features])


def _domain_age_years(creation_date):
    """Return the age in years for a WHOIS creation date, or 0 if unusable.

    ``creation_date`` may be a single datetime, a list of datetimes (the
    first entry is used), or something else such as ``None``.
    """
    if isinstance(creation_date, (list, tuple)):
        creation_date = creation_date[0] if creation_date else None
    if not isinstance(creation_date, datetime.datetime):
        return 0
    # Use the same timezone-awareness as the creation date so the
    # subtraction works for both naive and aware datetimes.
    now = datetime.datetime.now(creation_date.tzinfo)
    return (now - creation_date).days / 365

KeyboardInterrupt: 

In [4]:
!pip install python-whois


Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5


In [6]:
import numpy as np
import pandas as pd
import re
import tldextract
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from urllib.parse import urlparse
import whois
import datetime

# Load dataset for feature scaling and model training
file_path = "/content/Dataset-2.xlsx"
# Open the workbook through a context manager so its file handle is closed
# as soon as the sheet has been read.
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name='Worksheet')

# Encode the target label: legitimate -> 0, phishing -> 1.
df['status'] = df['status'].map({'legitimate': 0, 'phishing': 1})
# .map() turns every label outside the mapping into NaN; stop here with a
# clear message instead of carrying NaN labels into training.
if df['status'].isna().any():
    raise ValueError("Column 'status' contains values other than 'legitimate' or 'phishing'")

# Drop the raw URL text, then remove rows that are exact duplicates.
# Non-inplace calls keep the cell easy to re-run and to read as a pipeline.
df = df.drop(columns=['url']).drop_duplicates()

# Selecting key features
# NOTE: extract_features() must produce its values in exactly this order.
selected_features = [
    'length_url', 'nb_dots', 'nb_hyphens', 'https_token', 'prefix_suffix',
    'random_domain', 'shortening_service', 'nb_redirection', 'web_traffic',
    'dns_record', 'google_index', 'page_rank', 'domain_age'
]
X = df[selected_features]
y = df['status']

# Normalize the features
# The scaler is fitted on the training columns and reused at inference time.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Train the Random Forest model (fixed random_state for repeatable results)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Feature extraction from URL
def extract_features(url):
    """Build the scaled feature row for a single URL.

    The 13 values are produced in the same order as ``selected_features`` so
    that the module-level ``scaler`` fitted on those columns can be applied.

    Parameters
    ----------
    url : str
        The URL to featurize.

    Returns
    -------
    The output of ``scaler.transform`` for the one-row feature matrix.

    Notes
    -----
    Performs a live WHOIS lookup. If the lookup fails, the DNS-record flag
    and the domain age are both reported as 0 instead of raising.
    """
    parsed_url = urlparse(url)
    domain_info = tldextract.extract(url)
    # Join only the non-empty parts so a host without a public suffix
    # (e.g. an IP address) is not looked up with a trailing dot.
    domain = '.'.join(part for part in (domain_info.domain, domain_info.suffix) if part)

    # whois_info stays None when the lookup fails, so the code below never
    # touches an unbound name.
    whois_info = None
    try:
        whois_info = whois.whois(domain)
    except Exception:
        # Best effort: treat any lookup failure as "no WHOIS data".
        pass

    if whois_info is not None:
        domain_age = _domain_age_years(whois_info.creation_date)
        has_dns_record = 1 if whois_info.domain_name else 0
    else:
        domain_age = 0
        has_dns_record = 0

    # NOTE(review): the three constant placeholders below are always 1;
    # confirm that this matches the encoding used in the training data.
    features = [
        len(url),  # URL length
        url.count('.'),  # Number of dots
        url.count('-'),  # Number of hyphens
        1 if parsed_url.scheme == 'https' else 0,  # HTTPS presence
        1 if '-' in domain_info.domain else 0,  # Prefix-Suffix
        1 if re.search(r'[^a-zA-Z0-9]', domain_info.domain) else 0,  # Random characters in domain
        1 if any(short in url for short in ['bit.ly', 'tinyurl', 'goo.gl']) else 0,  # Shortening service
        url.count('/'),  # Number of redirections
        1,  # Web traffic placeholder (API required for real data)
        has_dns_record,  # DNS record
        1,  # Google index placeholder (API required for real data)
        1,  # Page rank placeholder (API required for real data)
        domain_age,  # Domain age
    ]

    return scaler.transform([features])


def _domain_age_years(creation_date):
    """Return the age in years for a WHOIS creation date, or 0 if unusable.

    ``creation_date`` may be a single datetime, a list of datetimes (the
    first entry is used), or something else such as ``None``.
    """
    if isinstance(creation_date, (list, tuple)):
        creation_date = creation_date[0] if creation_date else None
    if not isinstance(creation_date, datetime.datetime):
        return 0
    # Use the same timezone-awareness as the creation date so the
    # subtraction works for both naive and aware datetimes.
    now = datetime.datetime.now(creation_date.tzinfo)
    return (now - creation_date).days / 365

In [7]:
import pickle

# Persist the fitted estimator and the fitted scaler side by side, one
# pickle file per object, written in this order.
artifacts = {
    "rf_model.pkl": rf_model,  # trained Random Forest classifier
    "scaler.pkl": scaler,      # MinMaxScaler fitted on the training features
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as file:
        pickle.dump(artifact, file)

print("✅ Model and Scaler saved successfully!")


✅ Model and Scaler saved successfully!
