In [1]:
import pandas as pd
import re
import requests
from urllib.parse import urlparse
import ipaddress
from datetime import datetime
import whois
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Load datasets
# phishing_data: rows read from "online-valid.csv"; its 'url' column is the one
# passed to feature extraction further down.
phishing_data=pd.read_csv("online-valid.csv")
# NOTE(review): read_csv treats the first row as a header by default. If the
# benign list has no header row, its first URL becomes the column name and is
# discarded by the rename below -- confirm against the file and pass
# header=None if so.
legitimate_data=pd.read_csv("Benign_list_big_final.csv")
# Name the single column 'URLs' (this assignment raises if the file has a
# different number of columns).
legitimate_data.columns=['URLs']

In [3]:
# Sampling 5000 rows from each dataset to create balanced subsets
# random_state=12 fixes the draw so reruns select the same rows;
# reset_index(drop=True) renumbers the sampled rows from 0 and discards the
# original index.
# NOTE(review): sample(n=5000) draws without replacement, so it raises
# ValueError if either dataset has fewer than 5000 rows.
phishing_sample=phishing_data.sample(n=5000,random_state=12).reset_index(drop=True)
legitimate_sample=legitimate_data.sample(n=5000,random_state=12).reset_index(drop=True)

# Feature extraction functions

In [4]:
def extract_domain(url):
    """Return the network location of ``url`` without a leading ``www.``.

    Only the leading ``www.`` is removed. Later occurrences inside the host
    are preserved (``www.awww.com`` -> ``awww.com``); the previous
    ``str.replace`` based version stripped every occurrence.

    Args:
        url: URL string to parse.

    Returns:
        The ``netloc`` component reported by ``urlparse`` minus a leading
        ``www.``. This is an empty string when the URL has no ``//``
        authority section (e.g. ``"example.com/path"``).
    """
    prefix = "www."
    domain = urlparse(url).netloc
    # Slice rather than replace so only the prefix is dropped.
    return domain[len(prefix):] if domain.startswith(prefix) else domain

In [5]:
# Check if a URL uses an IP address as its host
def contains_ip(url):
    """Return 1 when ``url`` is, or is hosted on, a literal IP address, else 0.

    The previous version passed the whole URL to ``ipaddress.ip_address``,
    which rejects anything carrying a scheme, port or path, so real URLs such
    as ``http://192.168.0.1/login`` were always reported as 0. The host part
    is now parsed out and tested as well. A bare IP string is still accepted
    for backward compatibility.

    Args:
        url: URL (or bare address) to inspect.

    Returns:
        1 if the raw value or its parsed hostname is a valid IPv4/IPv6
        address, otherwise 0.
    """
    candidates = [url]
    try:
        host = urlparse(url).hostname  # strips port, credentials and IPv6 brackets
    except (ValueError, TypeError, AttributeError):
        # Malformed URL or non-string input: fall back to the raw value only.
        host = None
    if host:
        candidates.append(host)

    for candidate in candidates:
        try:
            ipaddress.ip_address(candidate)
        except ValueError:
            continue
        return 1
    return 0

In [6]:
# Check if a URL contains an '@' symbol
def contains_at_sign(url):
    """Return 1 when an ``@`` character appears anywhere in ``url``, else 0."""
    has_at = "@" in url
    return int(has_at)

In [7]:
# Check if the URL length is greater than or equal to 54 characters
def get_url_length(url):
    """Flag long URLs: 1 when ``url`` has 54 or more characters, else 0."""
    if len(url) < 54:
        return 0
    return 1

In [8]:
# Calculate the depth of a URL (number of path segments)
def get_url_depth(url):
    """Count the non-empty ``/``-separated segments in the URL's path.

    Only the path component from ``urlparse`` is considered, so slashes in
    the query string or fragment do not add depth.
    """
    depth = 0
    for part in urlparse(url).path.split('/'):
        if part:
            depth += 1
    return depth

In [9]:
# Check if the URL has redirection ("//" appearing after the protocol)
def check_redirection(url):
    """Return 1 when the last ``//`` in ``url`` starts beyond index 7, else 0.

    The ``//`` that follows ``http:`` or ``https:`` sits at index 5 or 6, so a
    later occurrence indicates an extra ``//`` inside the URL.
    """
    last_double_slash = url.rfind('//')
    if last_double_slash > 7:
        return 1
    return 0

In [10]:
# Check if the domain part of the URL contains "https"
def check_https_in_domain(url):
    """Return 1 when the text ``https`` occurs inside the URL's netloc, else 0.

    The scheme itself is not part of the netloc, so a normal ``https://``
    URL does not trigger the flag.
    """
    netloc = urlparse(url).netloc
    return 0 if netloc.find('https') == -1 else 1

In [11]:
# Check if the URL uses a shortening service
def check_shortening_service(url):
    """Return 1 when the URL's host belongs to a known shortening service.

    The previous version searched the entire URL with an unanchored pattern,
    so ``t\\.co`` matched every host ending in ``t.com`` (``microsoft.com``,
    ``reddit.com``...) as well as arbitrary path text. The pattern is now
    applied to the parsed hostname only and must match whole trailing domain
    labels; subdomains of a shortener (``preview.tinyurl.com``) still count.
    ``tinyurl`` is matched as the ``tinyurl.com`` domain.

    Args:
        url: URL string; a scheme-less value such as ``bit.ly/abc`` is also
            accepted.

    Returns:
        1 if the host is (a subdomain of) a listed shortener, otherwise 0.
    """
    shortening_hosts = (
        r"(?:^|\.)(?:bit\.ly|goo\.gl|shorte\.st|t\.co|tinyurl\.com|ow\.ly)$"
    )
    try:
        parsed = urlparse(url)
        host = parsed.hostname
        if not host and not parsed.scheme:
            # No scheme: urlparse puts everything in the path, so re-parse the
            # value as a network-path reference to recover the host.
            host = urlparse("//" + url).hostname
    except ValueError:
        host = None
    if not host:
        return 0
    # hostname is already lower-cased by urlparse; drop a trailing root dot.
    return 1 if re.search(shortening_hosts, host.rstrip(".")) else 0

In [12]:
# Check if the domain contains a prefix or suffix separated by '-'
def check_prefix_suffix(url):
    """Return 1 when the URL's netloc contains at least one ``-``, else 0."""
    host = urlparse(url).netloc
    return int(host.count('-') > 0)

In [13]:
# Retrieve WHOIS information for a domain
def get_domain_info(url):
    """Look up WHOIS data for the netloc of ``url``.

    This is a best-effort lookup: any ordinary failure (network error, parse
    error, unknown domain, malformed URL) yields ``None`` so callers can
    treat the record as missing.

    Args:
        url: URL whose ``netloc`` is passed to ``whois.whois``.

    Returns:
        The object returned by ``whois.whois``, or ``None`` when the lookup
        raised an ``Exception``.
    """
    try:
        return whois.whois(urlparse(url).netloc)
    except Exception:
        # Deliberately not a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate and a long batch run can be interrupted.
        return None

In [14]:
# Flag domains whose WHOIS creation-to-expiration span is under 6 months
def calculate_domain_age(domain_info):
    """Return 1 when the domain's registration span is short or unknown.

    The span is measured between the WHOIS creation date and expiration date,
    in 30-day months.
    NOTE(review): despite the name, this is not "time since creation"; confirm
    that the creation-to-expiration span is the intended feature.

    Unusable dates (missing values, unparsed strings, a mix of timezone-aware
    and naive datetimes) are treated as unknown and return 1 instead of
    raising ``TypeError`` as before.

    Args:
        domain_info: WHOIS record exposing ``creation_date`` and
            ``expiration_date`` attributes (each a single value or a list of
            values), or a falsy value when no record exists.

    Returns:
        1 if the span is shorter than 6 months or cannot be determined,
        otherwise 0.
    """
    if not domain_info:
        return 1

    creation_date = getattr(domain_info, "creation_date", None)
    expiration_date = getattr(domain_info, "expiration_date", None)

    # Some WHOIS records carry several candidate dates; use the first entry.
    if isinstance(creation_date, list):
        creation_date = creation_date[0] if creation_date else None
    if isinstance(expiration_date, list):
        expiration_date = expiration_date[0] if expiration_date else None

    if not creation_date or not expiration_date:
        return 1

    try:
        age = (expiration_date - creation_date).days // 30
    except (TypeError, AttributeError):
        # Non-datetime values or aware/naive mismatch: treat as unknown.
        return 1
    return 1 if age < 6 else 0

In [15]:
# Check if the domain expires in less than 6 months
def calculate_domain_end(domain_info):
    """Encode the time remaining until the domain's WHOIS expiration date.

    Remaining time is measured from "now" in 30-day months.

    Returns 0 when the domain expires in fewer than 6 months (including
    already-expired domains) and 1 otherwise. A missing or unusable
    expiration date also returns 1.
    NOTE(review): the "unknown" default (1) shares its value with "expires in
    6+ months"; the polarity is kept as-is for compatibility with existing
    feature files -- confirm it is intended.

    Compared with the previous version, timezone-aware expiration dates and
    non-datetime values (e.g. unparsed strings, a list whose first entry is
    ``None``) no longer raise ``TypeError``.

    Args:
        domain_info: WHOIS record exposing an ``expiration_date`` attribute
            (a single value or a list of values), or a falsy value.

    Returns:
        0 if fewer than 6 months remain, otherwise 1.
    """
    if not domain_info:
        return 1

    expiration_date = getattr(domain_info, "expiration_date", None)
    # Some WHOIS records carry several candidate dates; use the first entry.
    if isinstance(expiration_date, list):
        expiration_date = expiration_date[0] if expiration_date else None
    if not expiration_date:
        return 1

    try:
        # Match "now" to the expiration date's awareness: tzinfo is None for
        # naive datetimes, which makes datetime.now() naive as well.
        now = datetime.now(expiration_date.tzinfo)
        end_time = (expiration_date - now).days // 30
    except (TypeError, AttributeError):
        # Not a datetime: treat like a missing expiration date.
        return 1
    return 0 if end_time < 6 else 1

In [16]:
# Number of feature columns (Domain .. Web_Forwards) that precede the label.
_FEATURE_COUNT = 16


def extract_features(url,label):
    """Build one feature row for ``url``.

    Row layout (17 values): Domain, Have_IP, Have_At, URL_Length, URL_Depth,
    Redirection, https_Domain, TinyURL, Prefix/Suffix, DNS_Record,
    Domain_Age, Domain_End, iFrame, Mouse_Over, Right_Click, Web_Forwards,
    Label.

    The function performs a WHOIS lookup and an HTTP GET (5 s timeout) for
    the URL. A failed HTTP request sets the four page-content flags to 1.

    Any other exception is logged with ``print`` and the features not yet
    collected are filled with 1. The previous version always appended twelve
    1s regardless of how many features were already present, which produced
    rows of the wrong width (13 or up to 28 values) and broke the later
    ``DataFrame`` construction. Rows are now always exactly 17 values long;
    if even the domain could not be extracted, the raw URL text is stored in
    the Domain slot.

    Args:
        url: URL to analyse.
        label: Class label appended as the last element of the row.

    Returns:
        A list of 17 values in the layout described above.
    """
    features=[]
    try:
        features.append(extract_domain(url)) # Domain name
        features.append(contains_ip(url)) # IP presence
        features.append(contains_at_sign(url)) # '@' symbol presence
        features.append(get_url_length(url)) # URL length
        features.append(get_url_depth(url)) # URL depth
        features.append(check_redirection(url)) # Redirection check
        features.append(check_https_in_domain(url)) # HTTPS in domain
        features.append(check_shortening_service(url)) # URL shortening service check
        features.append(check_prefix_suffix(url)) # Prefix/Suffix presence

        domain_info=get_domain_info(url)
        features.append(1 if domain_info is None else 0) # DNS record
        features.append(1 if domain_info is None else calculate_domain_age(domain_info))
        features.append(1 if domain_info is None else calculate_domain_end(domain_info))

        try:
            response=requests.get(url,timeout=5)
            html=response.text
            # Collect the four page flags in a temporary list so a failure
            # part-way through can never leave a partially filled group.
            page_flags=[
                1 if re.findall(r"<iframe>|<frameBorder>",html) else 0, # iFrame
                1 if re.findall("<script>.+onmouseover.+</script>",html) else 0, # Mouse over
                1 if re.findall(r"event.button ?== ?2",html) else 0, # Right click disabled
                1 if len(response.history) > 2 else 0, # More than two redirects
            ]
        except requests.exceptions.RequestException:
            page_flags=[1, 1, 1, 1] # Default values for failed requests
        features+=page_flags
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        if not features:
            # Domain extraction itself failed: keep the Domain column textual.
            features.append(str(url))
        # Fill only the slots that are still missing so the row width is fixed.
        features+=[1]*(_FEATURE_COUNT-len(features))

    features.append(label)
    return features

In [17]:
# Optimization: run feature extraction for many URLs on a thread pool
def process_urls(urls,label):
    """Extract features for every URL in ``urls`` using worker threads.

    Args:
        urls: Iterable of URL strings.
        label: Class label forwarded to ``extract_features`` for each URL.

    Returns:
        A list of feature rows in the same order as ``urls``.
    """
    def _extract(single_url):
        return extract_features(single_url, label)

    with ThreadPoolExecutor() as pool:
        rows = [row for row in pool.map(_extract, urls)]
    return rows

In [18]:
# Extract features for both legitimate and phishing datasets
# Labels passed here: 0 for the legitimate sample, 1 for the phishing sample.
# Each URL triggers a WHOIS lookup and an HTTP GET inside extract_features, so
# this cell needs network access and may take a long time to finish.
legitimate_features=process_urls(legitimate_sample['URLs'],0)
phishing_features=process_urls(phishing_sample['url'],1)

In [19]:
# Convert the extracted features into dataframes
# Both frames share one column layout; the order must match the order in which
# extract_features appends values to each row.
feature_columns=[
    'Domain','Have_IP','Have_At','URL_Length','URL_Depth','Redirection',
    'https_Domain','TinyURL','Prefix/Suffix','DNS_Record',
    'Domain_Age','Domain_End','iFrame','Mouse_Over','Right_Click','Web_Forwards','Label']
legitimate_df=pd.DataFrame(legitimate_features,columns=feature_columns)
phishing_df=pd.DataFrame(phishing_features,columns=feature_columns)

In [20]:
# Save the extracted features to CSV files
# index=False leaves the DataFrame index out of the files, so each CSV holds
# only the feature columns and the Label column.
legitimate_df.to_csv('legitimate.csv',index=False)
phishing_df.to_csv('phishing.csv',index=False)

In [21]:
# Combine both datasets for further analysis
# Legitimate rows come first, followed by phishing rows; reset_index(drop=True)
# gives the combined frame a fresh index starting at 0.
final_data=pd.concat([legitimate_df,phishing_df]).reset_index(drop=True)
final_data.to_csv('combined.csv',index=False)
# NOTE(review): this looks like a leftover placeholder message rather than a
# meaningful completion notice -- consider replacing or removing it.
print("Hello World")

Hello World
