In [None]:
import urllib.parse
import re

def extract_features(url):
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),

        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
    }
    
    return features

# Example usage
url = "https://www.example.com/page?param1=value1&param2=value2"
features = extract_features(url)
print(features)


In [3]:
import urllib.request
from bs4 import BeautifulSoup
import re

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(soup.find_all('a'))
        ext_hyperlinks = sum(1 for link in soup.find_all('a') if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in soup.find_all('form'))
        
        return pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms = 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,
    }
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)


{'NumDots': 2, 'SubdomainLevel': 3, 'PathLevel': 2, 'UrlLength': 22, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 14, 'PathLength': 0, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 1, 'PctExtHyperlinks': 70.0, 'PctExtResourceUrls': 0.0, 'ExtFavicon': 0, 'InsecureForms': False}


In [2]:
import urllib.request
from bs4 import BeautifulSoup

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect = 0, 0, 0, 0
    
    # Features
    features = {
        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect
    }
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)


{'RelativeFormAction': 100.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0}


In [5]:
import urllib.request
from bs4 import BeautifulSoup
import re

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms = 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,

        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect
    }
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)

{'NumDots': 2, 'SubdomainLevel': 3, 'PathLevel': 2, 'UrlLength': 22, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 14, 'PathLength': 0, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 1, 'PctExtHyperlinks': 70.0, 'PctExtResourceUrls': 0.0, 'ExtFavicon': 0, 'InsecureForms': False, 'RelativeFormAction': 100.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0}


In [7]:
import urllib.request
from bs4 import BeautifulSoup

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        return frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0
    
    # Features
    features = {
        "FrequentDomainNameMismatch": frequent_domain_name_mismatch,
        "FakeLinkInStatusBar": fake_link_in_status_bar,
        "RightClickDisabled": right_click_disabled,
        "PopUpWindow": pop_up_window
    }
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)


{'FrequentDomainNameMismatch': 1, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0}


In [8]:
import urllib.request
from bs4 import BeautifulSoup
import re

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,

        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect,

        "FrequentDomainNameMismatch": frequent_domain_name_mismatch,
        "FakeLinkInStatusBar": fake_link_in_status_bar,
        "RightClickDisabled": right_click_disabled,
        "PopUpWindow": pop_up_window,
        
    }
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)

{'NumDots': 2, 'SubdomainLevel': 3, 'PathLevel': 2, 'UrlLength': 22, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 14, 'PathLength': 0, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 1, 'PctExtHyperlinks': 70.0, 'PctExtResourceUrls': 0.0, 'ExtFavicon': 0, 'InsecureForms': False, 'RelativeFormAction': 100.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0, 'FrequentDomainNameMismatch': 1, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0}


In [None]:
import urllib.request
from bs4 import BeautifulSoup
import re

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "SubmitInfoToEmail": 1,  # You need to implement this feature extraction
        "IframeOrFrame": 1,       # You need to implement this feature extraction
        "MissingTitle": 1,        # You need to implement this feature extraction
        "ImagesOnlyInForm": 1,    # You need to implement this feature extraction
        "SubdomainLevelRT": len(parsed_url.hostname.split('.')),
        "UrlLengthRT": len(url),
        "PctExtResourceUrlsRT": pct_ext_resources,
        "AbnormalExtFormActionR": pct_abnormal_form_action,
        "ExtMetaScriptLinkRT": pct_ext_hyperlinks,
        "PctExtNullSelfRedirectHyperlinksRT": pct_null_self_redirect,
        "CLASS_LABEL": 1          # You need to decide how to determine this label
    }
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)


In [10]:
import requests
from bs4 import BeautifulSoup

def extract_features_from_url(url):
    features = {
        "SubmitInfoToEmail": False,
        "IframeOrFrame": False,
        "MissingTitle": False,
        "ImagesOnlyInForm": False
    }

    try:
        # Fetch the webpage content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if form submits to an email
        forms = soup.find_all('form')
        for form in forms:
            if 'mailto:' in form.get('action', ''):
                features["SubmitInfoToEmail"] = True
                break

        # Check for iframes or frames
        if soup.find_all('iframe') or soup.find_all('frame'):
            features["IframeOrFrame"] = True

        # Check if the webpage has a title
        if not soup.title:
            features["MissingTitle"] = True

        # Check if images are only within form tags
        images = soup.find_all('img')
        form_images = soup.find_all('form img')
        if len(images) == len(form_images):
            features["ImagesOnlyInForm"] = True

    except Exception as e:
        print("Error:", e)

    return features

# Example usage:
url = "https://google.com"
features = extract_features_from_url(url)
print("Features extracted from", url, ":", features)


Features extracted from https://google.com : {'SubmitInfoToEmail': False, 'IframeOrFrame': False, 'MissingTitle': False, 'ImagesOnlyInForm': False}


In [11]:
import urllib.request
from bs4 import BeautifulSoup
import re

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,

        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect,

        "FrequentDomainNameMismatch": frequent_domain_name_mismatch,
        "FakeLinkInStatusBar": fake_link_in_status_bar,
        "RightClickDisabled": right_click_disabled,
        "PopUpWindow": pop_up_window,
        
        "SubmitInfoToEmail": False,
        "IframeOrFrame": False,
        "MissingTitle": False,
        "ImagesOnlyInForm": False
    }

    try:
        # Fetch the webpage content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if form submits to an email
        forms = soup.find_all('form')
        for form in forms:
            if 'mailto:' in form.get('action', ''):
                features["SubmitInfoToEmail"] = True
                break

        # Check for iframes or frames
        if soup.find_all('iframe') or soup.find_all('frame'):
            features["IframeOrFrame"] = True

        # Check if the webpage has a title
        if not soup.title:
            features["MissingTitle"] = True

        # Check if images are only within form tags
        images = soup.find_all('img')
        form_images = soup.find_all('form img')
        if len(images) == len(form_images):
            features["ImagesOnlyInForm"] = True

    except Exception as e:
        print("Error:", e)
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)

{'NumDots': 2, 'SubdomainLevel': 3, 'PathLevel': 2, 'UrlLength': 22, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 14, 'PathLength': 0, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 1, 'PctExtHyperlinks': 70.0, 'PctExtResourceUrls': 0.0, 'ExtFavicon': 0, 'InsecureForms': False, 'RelativeFormAction': 100.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0, 'FrequentDomainNameMismatch': 1, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0, 'SubmitInfoToEmail': False, 'IframeOrFrame': False, 'MissingTitle': False, 'ImagesOnlyInForm': False}


In [12]:
import urllib.request
from bs4 import BeautifulSoup
import re

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Calculate SubdomainLevelRT
    subdomain_level_rt = len(parsed_url.hostname.split('.'))

    # Calculate UrlLengthRT
    url_length_rt = len(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,

        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect,

        "FrequentDomainNameMismatch": frequent_domain_name_mismatch,
        "FakeLinkInStatusBar": fake_link_in_status_bar,
        "RightClickDisabled": right_click_disabled,
        "PopUpWindow": pop_up_window,
        
        "SubmitInfoToEmail": False,
        "IframeOrFrame": False,
        "MissingTitle": False,
        "ImagesOnlyInForm": False,

        "SubdomainLevelRT": subdomain_level_rt,
        "UrlLengthRT": url_length_rt,
    }

    try:
        # Fetch the webpage content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if form submits to an email
        forms = soup.find_all('form')
        for form in forms:
            if 'mailto:' in form.get('action', ''):
                features["SubmitInfoToEmail"] = True
                break

        # Check for iframes or frames
        if soup.find_all('iframe') or soup.find_all('frame'):
            features["IframeOrFrame"] = True

        # Check if the webpage has a title
        if not soup.title:
            features["MissingTitle"] = True

        # Check if images are only within form tags
        images = soup.find_all('img')
        form_images = soup.find_all('form img')
        if len(images) == len(form_images):
            features["ImagesOnlyInForm"] = True

    except Exception as e:
        print("Error:", e)
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)

{'NumDots': 2, 'SubdomainLevel': 3, 'PathLevel': 2, 'UrlLength': 22, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 14, 'PathLength': 0, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 1, 'PctExtHyperlinks': 70.0, 'PctExtResourceUrls': 0.0, 'ExtFavicon': 0, 'InsecureForms': False, 'RelativeFormAction': 100.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0, 'FrequentDomainNameMismatch': 1, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0, 'SubmitInfoToEmail': False, 'IframeOrFrame': False, 'MissingTitle': False, 'ImagesOnlyInForm': False, 'SubdomainLevelRT': 3, 'UrlLengthRT': 22, 'PctExtResourceUrlsRT': None, 'AbnormalExt

In [13]:
import urllib.request
from bs4 import BeautifulSoup
import re
import requests

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)


    # Function to fetch the webpage content and extract external resource URLs
    def extract_external_resource_urls(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            ext_resource_urls = [res.get('src') for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '')]
            return ext_resource_urls
        except Exception as e:
            print("Error fetching webpage:", e)
            return []

    # Function to fetch the webpage content and extract form actions
    def extract_form_actions(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            form_actions = [form.get('action', '') for form in soup.find_all('form')]
            return form_actions
        except Exception as e:
            print("Error fetching webpage:", e)
            return []

    # Calculate PctExtResourceUrlsRT
    ext_resource_urls = extract_external_resource_urls(url)
    total_resources = len(ext_resource_urls)
    total_urls = len([res for res in ext_resource_urls if urllib.parse.urlparse(res).netloc != parsed_url.netloc])
    pct_ext_resource_urls_rt = (total_urls / total_resources) * 100 if total_resources > 0 else 0

    # Calculate AbnormalExtFormActionR
    form_actions = extract_form_actions(url)
    abnormal_ext_form_action_r = sum(1 for action in form_actions if urllib.parse.urlparse(action).netloc != parsed_url.netloc)

    # Calculate SubdomainLevelRT
    subdomain_level_rt = len(parsed_url.hostname.split('.'))

    # Calculate UrlLengthRT
    url_length_rt = len(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,

        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect,

        "FrequentDomainNameMismatch": frequent_domain_name_mismatch,
        "FakeLinkInStatusBar": fake_link_in_status_bar,
        "RightClickDisabled": right_click_disabled,
        "PopUpWindow": pop_up_window,
        
        "SubmitInfoToEmail": False,
        "IframeOrFrame": False,
        "MissingTitle": False,
        "ImagesOnlyInForm": False,

        "SubdomainLevelRT": subdomain_level_rt,
        "UrlLengthRT": url_length_rt,

        "PctExtResourceUrlsRT": pct_ext_resource_urls_rt,
        "AbnormalExtFormActionR": abnormal_ext_form_action_r,
    }

    try:
        # Fetch the webpage content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if form submits to an email
        forms = soup.find_all('form')
        for form in forms:
            if 'mailto:' in form.get('action', ''):
                features["SubmitInfoToEmail"] = True
                break

        # Check for iframes or frames
        if soup.find_all('iframe') or soup.find_all('frame'):
            features["IframeOrFrame"] = True

        # Check if the webpage has a title
        if not soup.title:
            features["MissingTitle"] = True

        # Check if images are only within form tags
        images = soup.find_all('img')
        form_images = soup.find_all('form img')
        if len(images) == len(form_images):
            features["ImagesOnlyInForm"] = True

    except Exception as e:
        print("Error:", e)
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)

{'NumDots': 2, 'SubdomainLevel': 3, 'PathLevel': 2, 'UrlLength': 22, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 14, 'PathLength': 0, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 1, 'PctExtHyperlinks': 70.0, 'PctExtResourceUrls': 0.0, 'ExtFavicon': 0, 'InsecureForms': False, 'RelativeFormAction': 100.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0, 'FrequentDomainNameMismatch': 1, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0, 'SubmitInfoToEmail': False, 'IframeOrFrame': False, 'MissingTitle': False, 'ImagesOnlyInForm': False, 'SubdomainLevelRT': 3, 'UrlLengthRT': 22, 'PctExtResourceUrlsRT': 0, 'AbnormalExtFor

In [1]:
import urllib.request
from bs4 import BeautifulSoup
import re
import requests

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Function to fetch the webpage content and extract external meta tags, script sources, and link hrefs
    def extract_external_content(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            ext_meta_tags = [meta['content'] for meta in soup.find_all('meta') if 'http' in meta.get('content', '')]
            script_sources = [script['src'] for script in soup.find_all('script') if 'http' in script.get('src', '')]
            link_hrefs = [link['href'] for link in soup.find_all('link') if 'http' in link.get('href', '')]
            return ext_meta_tags, script_sources, link_hrefs
        except Exception as e:
            print("Error fetching webpage:", e)
            return [], [], []

    # Function to calculate the percentage of external null self-redirect hyperlinks
    def calculate_pct_ext_null_self_redirect_hyperlinks(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            total_links = len(soup.find_all('a'))
            ext_null_self_redirect_links = sum(1 for link in soup.find_all('a') if link.get('href', '').lower() in ['null', 'void(0)'])
            pct_ext_null_self_redirect_hyperlinks_rt = (ext_null_self_redirect_links / total_links) * 100 if total_links > 0 else 0
            return pct_ext_null_self_redirect_hyperlinks_rt
        except Exception as e:
            print("Error calculating percentage of external null self-redirect hyperlinks:", e)
            return 0

    # Calculate ExtMetaScriptLinkRT
    ext_meta_tags, script_sources, link_hrefs = extract_external_content(url)
    ext_meta_script_link_rt = len(ext_meta_tags) + len(script_sources) + len(link_hrefs)

    # Calculate PctExtNullSelfRedirectHyperlinksRT
    pct_ext_null_self_redirect_hyperlinks_rt = calculate_pct_ext_null_self_redirect_hyperlinks(url)

    # Set the CLASS_LABEL
    # Here, you would define your logic to determine the class label based on the features extracted
    # For demonstration purposes, let's set it as 'phishing' if any external meta tags, scripts, or links are found, otherwise 'legitimate'
    class_label = 'phishing' if ext_meta_script_link_rt > 0 else 'legitimate'


    # Function to fetch the webpage content and extract external resource URLs
    def extract_external_resource_urls(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            ext_resource_urls = [res.get('src') for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '')]
            return ext_resource_urls
        except Exception as e:
            print("Error fetching webpage:", e)
            return []

    # Function to fetch the webpage content and extract form actions
    def extract_form_actions(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            form_actions = [form.get('action', '') for form in soup.find_all('form')]
            return form_actions
        except Exception as e:
            print("Error fetching webpage:", e)
            return []

    # Calculate PctExtResourceUrlsRT
    ext_resource_urls = extract_external_resource_urls(url)
    total_resources = len(ext_resource_urls)
    total_urls = len([res for res in ext_resource_urls if urllib.parse.urlparse(res).netloc != parsed_url.netloc])
    pct_ext_resource_urls_rt = (total_urls / total_resources) * 100 if total_resources > 0 else 0

    # Calculate AbnormalExtFormActionR
    form_actions = extract_form_actions(url)
    abnormal_ext_form_action_r = sum(1 for action in form_actions if urllib.parse.urlparse(action).netloc != parsed_url.netloc)

    # Calculate SubdomainLevelRT
    subdomain_level_rt = len(parsed_url.hostname.split('.'))

    # Calculate UrlLengthRT
    url_length_rt = len(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,

        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect,

        "FrequentDomainNameMismatch": frequent_domain_name_mismatch,
        "FakeLinkInStatusBar": fake_link_in_status_bar,
        "RightClickDisabled": right_click_disabled,
        "PopUpWindow": pop_up_window,
        
        "SubmitInfoToEmail": False,
        "IframeOrFrame": False,
        "MissingTitle": False,
        "ImagesOnlyInForm": False,

        "SubdomainLevelRT": subdomain_level_rt,
        "UrlLengthRT": url_length_rt,

        "PctExtResourceUrlsRT": pct_ext_resource_urls_rt,
        "AbnormalExtFormActionR": abnormal_ext_form_action_r,

        "ExtMetaScriptLinkRT": ext_meta_script_link_rt,
        "PctExtNullSelfRedirectHyperlinksRT": pct_ext_null_self_redirect_hyperlinks_rt,
        "CLASS_LABEL": class_label
    }

    try:
        # Fetch the webpage content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if form submits to an email
        forms = soup.find_all('form')
        for form in forms:
            if 'mailto:' in form.get('action', ''):
                features["SubmitInfoToEmail"] = True
                break

        # Check for iframes or frames
        if soup.find_all('iframe') or soup.find_all('frame'):
            features["IframeOrFrame"] = True

        # Check if the webpage has a title
        if not soup.title:
            features["MissingTitle"] = True

        # Check if images are only within form tags
        images = soup.find_all('img')
        form_images = soup.find_all('form img')
        if len(images) == len(form_images):
            features["ImagesOnlyInForm"] = True

    except Exception as e:
        print("Error:", e)
    
    return features

# Example usage
url = "https://www.google.com"
features = extract_features(url)
print(features)

{'NumDots': 2, 'SubdomainLevel': 3, 'PathLevel': 2, 'UrlLength': 22, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 14, 'PathLength': 0, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 1, 'PctExtHyperlinks': 70.0, 'PctExtResourceUrls': 0.0, 'ExtFavicon': 0, 'InsecureForms': False, 'RelativeFormAction': 100.0, 'ExtFormAction': 0.0, 'AbnormalFormAction': 0.0, 'PctNullSelfRedirectHyperlinks': 0.0, 'FrequentDomainNameMismatch': 1, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0, 'SubmitInfoToEmail': False, 'IframeOrFrame': False, 'MissingTitle': False, 'ImagesOnlyInForm': False, 'SubdomainLevelRT': 3, 'UrlLengthRT': 22, 'PctExtResourceUrlsRT': 0, 'AbnormalExtFor

In [4]:
import urllib.request
from bs4 import BeautifulSoup
import re
import requests

def extract_features(url):
    # Function to parse HTML content of the webpage and extract required features
    def parse_html_content(html_content):
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check for frequent domain name mismatch
        domain = urllib.parse.urlparse(url).netloc
        mismatch_count = sum(1 for link in soup.find_all('a') if domain not in link.get('href'))
        frequent_domain_name_mismatch = 1 if mismatch_count > 5 else 0
        
        # Check for fake links in the status bar
        fake_links = sum(1 for link in soup.find_all('a') if 'javascript' in link.get('href', '').lower())
        fake_link_in_status_bar = 1 if fake_links > 0 else 0
        
        # Check if right-click is disabled
        right_click_disabled = 1 if 'contextmenu' in html_content.decode().lower() else 0
        
        # Check for pop-up windows
        pop_up_windows = sum(1 for tag in soup.find_all() if tag.get('onload') and 'window.open' in tag.get('onload').lower())
        pop_up_window = 1 if pop_up_windows > 0 else 0
        
        forms = soup.find_all('form')
        links = soup.find_all('a')
        
        total_forms = len(forms)
        total_links = len(links)
        
        # Initialize feature counts
        relative_form_action_count = 0
        ext_form_action_count = 0
        abnormal_form_action_count = 0
        null_self_redirect_count = 0
        
        for form in forms:
            action = form.get('action', '')
            if action.startswith('/'):
                relative_form_action_count += 1
            elif '://' in action:
                ext_form_action_count += 1
            elif action and not action.startswith('#') and not action.startswith('javascript:'):
                abnormal_form_action_count += 1
        
        for link in links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.lower() in ['null', 'void(0)']:
                null_self_redirect_count += 1
        
        # Calculate percentages
        pct_relative_form_action = (relative_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_ext_form_action = (ext_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_abnormal_form_action = (abnormal_form_action_count / total_forms) * 100 if total_forms > 0 else 0
        pct_null_self_redirect = (null_self_redirect_count / total_links) * 100 if total_links > 0 else 0
        
        # Count total number of hyperlinks and external hyperlinks
        total_hyperlinks = len(links)
        ext_hyperlinks = sum(1 for link in links if 'http' in link.get('href'))
        pct_ext_hyperlinks = (ext_hyperlinks / total_hyperlinks) * 100 if total_hyperlinks > 0 else 0
        
        # Count total number of resource URLs and external resource URLs (like images, scripts)
        total_resources = len(soup.find_all(['img', 'script', 'link']))
        ext_resources = sum(1 for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '') or 'http' in res.get('href', ''))
        pct_ext_resources = (ext_resources / total_resources) * 100 if total_resources > 0 else 0
        
        # Check if the webpage has an external favicon
        favicon_url = soup.find('link', rel='shortcut icon')
        ext_favicon = 1 if favicon_url and 'http' in favicon_url.get('href', '') else 0
        
        # Check if the webpage contains insecure forms
        insecure_forms = any(form.get('action', '').startswith('http://') for form in forms)
        
        return pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Function to fetch the webpage content and extract external meta tags, script sources, and link hrefs
    def extract_external_content(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            ext_meta_tags = [meta['content'] for meta in soup.find_all('meta') if 'http' in meta.get('content', '')]
            script_sources = [script['src'] for script in soup.find_all('script') if 'http' in script.get('src', '')]
            link_hrefs = [link['href'] for link in soup.find_all('link') if 'http' in link.get('href', '')]
            return ext_meta_tags, script_sources, link_hrefs
        except Exception as e:
            print("Error fetching webpage:", e)
            return [], [], []

    # Function to calculate the percentage of external null self-redirect hyperlinks
    def calculate_pct_ext_null_self_redirect_hyperlinks(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            total_links = len(soup.find_all('a'))
            ext_null_self_redirect_links = sum(1 for link in soup.find_all('a') if link.get('href', '').lower() in ['null', 'void(0)'])
            pct_ext_null_self_redirect_hyperlinks_rt = (ext_null_self_redirect_links / total_links) * 100 if total_links > 0 else 0
            return pct_ext_null_self_redirect_hyperlinks_rt
        except Exception as e:
            print("Error calculating percentage of external null self-redirect hyperlinks:", e)
            return 0

    # Calculate ExtMetaScriptLinkRT
    ext_meta_tags, script_sources, link_hrefs = extract_external_content(url)
    ext_meta_script_link_rt = len(ext_meta_tags) + len(script_sources) + len(link_hrefs)

    # Calculate PctExtNullSelfRedirectHyperlinksRT
    pct_ext_null_self_redirect_hyperlinks_rt = calculate_pct_ext_null_self_redirect_hyperlinks(url)

    # Set the CLASS_LABEL
    # Here, you would define your logic to determine the class label based on the features extracted
    # For demonstration purposes, let's set it as 'phishing' if any external meta tags, scripts, or links are found, otherwise 'legitimate'
    class_label = 'phishing' if ext_meta_script_link_rt > 0 else 'legitimate'


    # Function to fetch the webpage content and extract external resource URLs
    def extract_external_resource_urls(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            ext_resource_urls = [res.get('src') for res in soup.find_all(['img', 'script', 'link']) if 'http' in res.get('src', '')]
            return ext_resource_urls
        except Exception as e:
            print("Error fetching webpage:", e)
            return []

    # Function to fetch the webpage content and extract form actions
    def extract_form_actions(url):
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            form_actions = [form.get('action', '') for form in soup.find_all('form')]
            return form_actions
        except Exception as e:
            print("Error fetching webpage:", e)
            return []

    # Calculate PctExtResourceUrlsRT
    ext_resource_urls = extract_external_resource_urls(url)
    total_resources = len(ext_resource_urls)
    total_urls = len([res for res in ext_resource_urls if urllib.parse.urlparse(res).netloc != parsed_url.netloc])
    pct_ext_resource_urls_rt = (total_urls / total_resources) * 100 if total_resources > 0 else 0

    # Calculate AbnormalExtFormActionR
    form_actions = extract_form_actions(url)
    abnormal_ext_form_action_r = sum(1 for action in form_actions if urllib.parse.urlparse(action).netloc != parsed_url.netloc)

    # Calculate SubdomainLevelRT
    subdomain_level_rt = len(parsed_url.hostname.split('.'))

    # Calculate UrlLengthRT
    url_length_rt = len(url)

    # Predefined lists of sensitive words and brand names
    sensitive_words = ["login", "password", "banking", "account", "verify", "secure"]
    brand_names = ["paypal", "google", "facebook", "amazon", "apple"]

    # Function to count occurrences of sensitive words and check for embedded brand names
    def count_sensitive_words(url):
        num_sensitive_words = sum(1 for word in sensitive_words if word in url.lower())
        embedded_brand_name = any(brand in url.lower() for brand in brand_names)
        return num_sensitive_words, embedded_brand_name
    
    # Extract the features
    num_sensitive_words, embedded_brand_name = count_sensitive_words(url)
    
    # Fetch the webpage content
    try:
        with urllib.request.urlopen(url) as response:
            html_content = response.read()
            pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = parse_html_content(html_content)
    except Exception as e:
        print("Error fetching webpage:", e)
        pct_relative_form_action, pct_ext_form_action, pct_abnormal_form_action, pct_null_self_redirect, pct_ext_hyperlinks, pct_ext_resources, ext_favicon, insecure_forms, frequent_domain_name_mismatch, fake_link_in_status_bar, right_click_disabled, pop_up_window = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    
    # Features
    features = {
        "NumDots": url.count('.'),
        "SubdomainLevel": len(parsed_url.hostname.split('.')),
        "PathLevel": url.count('/'),
        "UrlLength": len(url),
        "NumDash": url.count('-'),
        "NumDashInHostname": parsed_url.hostname.count('-'),
        "AtSymbol": 1 if '@' in url else 0,
        "TildeSymbol": 1 if '~' in url else 0,
        "NumUnderscore": url.count('_'),
        "NumPercent": url.count('%'),
        
        "NumQueryComponents": len(urllib.parse.parse_qs(parsed_url.query)),
        "NumAmpersand": url.count('&'),
        "NumHash": url.count('#'),
        "NumNumericChars": sum(c.isdigit() for c in url),
        "NoHttps": 1 if parsed_url.scheme != 'https' else 0,
        
        "RandomString": 1 if re.search(r'\b[0-9a-f]{10}\b', url) else 0, # Assuming random string contains 10 hex characters
        "IpAddress": 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', parsed_url.netloc) else 0,
        "DomainInSubdomains": 1 if parsed_url.netloc.count('.') > 2 else 0,
        "DomainInPaths": 1 if '.' in parsed_url.path else 0,
        "HttpsInHostname": 1 if 'https' in parsed_url.netloc else 0,

        "HostnameLength": len(parsed_url.hostname),
        "PathLength": len(parsed_url.path),
        "QueryLength": len(parsed_url.query),
        "DoubleSlashInPath": 1 if '//' in url else 0,
        
        "NumSensitiveWords": num_sensitive_words,
        "EmbeddedBrandName": 1 if embedded_brand_name else 0,

        "PctExtHyperlinks": pct_ext_hyperlinks,
        "PctExtResourceUrls": pct_ext_resources,
        "ExtFavicon": ext_favicon,
        "InsecureForms": insecure_forms,

        "RelativeFormAction": pct_relative_form_action,
        "ExtFormAction": pct_ext_form_action,
        "AbnormalFormAction": pct_abnormal_form_action,
        "PctNullSelfRedirectHyperlinks": pct_null_self_redirect,

        "FrequentDomainNameMismatch": frequent_domain_name_mismatch,
        "FakeLinkInStatusBar": fake_link_in_status_bar,
        "RightClickDisabled": right_click_disabled,
        "PopUpWindow": pop_up_window,
        
        "SubmitInfoToEmail": False,
        "IframeOrFrame": False,
        "MissingTitle": False,
        "ImagesOnlyInForm": False,

        "SubdomainLevelRT": subdomain_level_rt,
        "UrlLengthRT": url_length_rt,

        "PctExtResourceUrlsRT": pct_ext_resource_urls_rt,
        "AbnormalExtFormActionR": abnormal_ext_form_action_r,

        "ExtMetaScriptLinkRT": ext_meta_script_link_rt,
        "PctExtNullSelfRedirectHyperlinksRT": pct_ext_null_self_redirect_hyperlinks_rt,
        "CLASS_LABEL": class_label
    }

    try:
        # Fetch the webpage content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if form submits to an email
        forms = soup.find_all('form')
        for form in forms:
            if 'mailto:' in form.get('action', ''):
                features["SubmitInfoToEmail"] = True
                break

        # Check for iframes or frames
        if soup.find_all('iframe') or soup.find_all('frame'):
            features["IframeOrFrame"] = True

        # Check if the webpage has a title
        if not soup.title:
            features["MissingTitle"] = True

        # Check if images are only within form tags
        images = soup.find_all('img')
        form_images = soup.find_all('form img')
        if len(images) == len(form_images):
            features["ImagesOnlyInForm"] = True

    except Exception as e:
        print("Error:", e)
    
    return features

# Example usage
url = "https://cults3d.com/"
features = extract_features(url)
print(features)

Error fetching webpage: HTTP Error 403: Forbidden
{'NumDots': 1, 'SubdomainLevel': 2, 'PathLevel': 3, 'UrlLength': 20, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 0, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 1, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 11, 'PathLength': 1, 'QueryLength': 0, 'DoubleSlashInPath': 1, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 0, 'PctExtHyperlinks': 0, 'PctExtResourceUrls': 0, 'ExtFavicon': 0, 'InsecureForms': 0, 'RelativeFormAction': 0, 'ExtFormAction': 0, 'AbnormalFormAction': 0, 'PctNullSelfRedirectHyperlinks': 0, 'FrequentDomainNameMismatch': 0, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0, 'SubmitInfoToEmail': False, 'IframeOrFrame': False, 'MissingTitle': False, 'ImagesOnlyInForm': False, 'SubdomainLevelRT': 2, 'UrlLengthRT': 20, 'PctExtReso