# **URL Feature Extraction**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/phishing-dataset/Phishing_dataset_02.csv
/kaggle/input/phishing-dataset/phishing_site_urls.csv
/kaggle/input/phishing-dataset/Phishing_dataset_03.csv
/kaggle/input/phishing-dataset/Phishing_dataset_01.csv
/kaggle/input/phishing-dataset/malicious_phish.csv
/kaggle/input/phishing-dataset/Phishing_dataset_04.csv


In [2]:
# Load the dataset
data = pd.read_csv("/kaggle/input/phishing-dataset/Phishing_dataset_02.csv")

# drop unnecessary columns from the dataframe
data = data.drop(data.columns[1:66], axis=1)
data = data.drop(data.columns[2:3], axis=1)
data = data.drop(data.columns[4:6], axis=1)
data = data.drop(data.columns[10:13], axis=1)

# print number of phishing and legitimate urls
print(data['status'].value_counts())

# print the column names
print(data.columns)

data.head()

status
legitimate    5715
phishing      5715
Name: count, dtype: int64
Index(['url', 'login_form', 'links_in_tags', 'submit_email', 'sfh', 'iframe',
       'popup_window', 'safe_anchor', 'onmouseover', 'right_clic',
       'whois_registered_domain', 'domain_registration_length', 'domain_age',
       'web_traffic', 'dns_record', 'google_index', 'page_rank', 'status'],
      dtype='object')


Unnamed: 0,url,login_form,links_in_tags,submit_email,sfh,iframe,popup_window,safe_anchor,onmouseover,right_clic,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,0,80.0,0,0,0,0,0.0,0,0,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,0,100.0,0,0,0,0,100.0,0,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,0,100.0,0,0,0,0,100.0,0,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,0,100.0,0,0,0,0,62.5,0,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,1,76.470588,0,0,0,0,0.0,0,0,0,224,8175,8725,0,0,6,legitimate


## Feature Extraction Functions

In [3]:
# Import headers

import re
from urllib.parse import *

In [4]:
# Embedded Domain: Examines dot-separated domain/hostname patterns in the URL path.

def embedded_domain(url):
    # Extract the domain from the URL
    domain = urlparse(url).netloc
    # Split the domain into its components
    domain_parts = domain.split('.')
    
    # Checking whether the url is similar to a well-known domain
    # If it is, we return 1, else 0
    well_known_URLs = ['google', 'facebook', 'twitter', 'linkedin', 'youtube', 'instagram', 'pinterest', 'amazon', 'snapchat', 'reddit', 'flickr', 'whatsapp', 'quora', 'vimeo', 'periscope', 'vine', 'meetup', 'tagged', 'askfm', 'meetme', 'meetup', 'myspace', 'stumbleupon', 'delicious', 'digg', 'slashdot', 'fark', 'newsvine', 'foursquare', 'yelp', 'tripadvisor', 'zomato', 'opentable']
    
    for well_known_URL in well_known_URLs:
        for domain_part in domain_parts:
            if len(set(domain_part)&set(well_known_URL)) == len(well_known_URL) - 1:
                return 1
    return -1


# Example usage
url1 = "http://www.google.com"
url2 = "http://www.facehook.com"

print(embedded_domain(url1))
print(embedded_domain(url2))

-1
1


In [5]:
# IP Address: Attackers often employ IP address in the URL
# to disguise a webpage’s malicious nature, while legitimate
# websites almost always use domain names instead of IP
# addresses due to their easy memorability.

def having_ip_address(url):
    # Regular expression to match IP address pattern
    ip_address_pattern = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'

    # Search for the pattern in the URL
    match = re.search(ip_address_pattern, url)

    if match:
        # print match.group()
        return -1
    else:
        # print 'No matching pattern found'
        return 1
    
# Example usage
url1 = "https://www.google.com"
url2 = "172.0.0.1"

print(having_ip_address(url1))
print(having_ip_address(url2))

1
-1


In [6]:
# Number of dots in URL: Phishing pages tend to use more
# dots in their URLs than the legitimate sites.
def no_of_dots(url):
    return url.count('.')

# Example usage
url = "https://www.google.com"

print(no_of_dots(url))

2


In [7]:
# Lexical features: The URL string is broken down into
# multiple tokens. Each token constitutes a binary feature.
# The delimiters to obtain the tokens are ‘/’, ‘?’, ‘.’, ‘=’, ‘ ’,
# ‘&’, and ‘-’

def extract_lexical_features(url):
    # Parse the URL
    parsed_url = urlparse(url)

    # Define delimiters
    delimiters = ['/', '?', '.', '=', ' ', '&', '-']

    # Split the url into tokens based on the delimiters
    tokens = re.split('|'.join(map(re.escape, delimiters)), parsed_url.geturl())

    # Remove empty tokens
    tokens = list(filter(None, tokens))

    # Initialize a dictionary to store binary features for each token
    lexical_features = {}

    # Extract binary features for each token
    for token in tokens:
        lexical_features[token] = 1
    
    return lexical_features

# Example usage

url = 'https://www.google.com/search?q=feature+extraction+from+url&oq=feature+extraction+from+url&aqs=chrome..69i57j0l7.10257j0j7&sourceid=chrome&ie=UTF-8'

print(extract_lexical_features(url))

{'https:': 1, 'www': 1, 'google': 1, 'com': 1, 'search': 1, 'q': 1, 'feature+extraction+from+url': 1, 'oq': 1, 'aqs': 1, 'chrome': 1, '69i57j0l7': 1, '10257j0j7': 1, 'sourceid': 1, 'ie': 1, 'UTF': 1, '8': 1}


In [8]:
# Number of sensitive words in URL: In (Garera et al., 2007),
# Garera et al summarized a set of eight sensitive words that
# frequently appear in phishing URLs. This is a numeric feature with a range of 0 to 8.

def no_of_sensitive_words(url):
    sensitive_words = ['confirm', 'account', 'banking', 'secure', 'ebayisapi', 'webscr', 'login', 'signin']
    count = 0
    for word in sensitive_words:
        if word in url:
            count += 1
    return count

# Example usage
url1 = "https://www.google.com"
url2 = "http://www.abc.com/confirm"

print(no_of_sensitive_words(url1))
print(no_of_sensitive_words(url2))

0
1


In [9]:
# Out-of-Position Top Level Domain (TLD): Checks for
# unusual positioning of TLDs in the URL.

def out_of_position_tld(url):
    tld = ['com', 'org', 'net', 'edu', 'gov', 'in']
    
    # domain = urlparse(url).netloc
    # check if the TLD is in the middle of the domain
    tokens = url.split('.')
    for i in range(len(tokens) - 1):
        if tokens[i] in tld:
            return -1
    return 1

# Example usage
url1 = 'http://www.google.com'
url2 = 'http://www.google.com.in'
    
print(out_of_position_tld(url1))
print(out_of_position_tld(url2))

1
-1


In [10]:
# Check if the website is using HTTPS
def https_token(url):
    https_tokens = url.split('//')[0]
    if https_tokens == 'https:':
        return 1
    else:
        return -1

# Example usage
url1 = 'http://www.google.com'
url2 = 'https://www.google.com'

print(https_token(url1))
print(https_token(url2))

-1
1


In [11]:
# Get the length of the URL
def url_length(url):
    return len(url)

In [12]:
# If the URL is using Shortening Services, the value assigned to this feature is 1 (phishing) or else -1 (legitimate).

# listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return -1

# Example usage
url1 = 'http://www.google.com'
url2 = 'https://goo.gl'

print(tinyURL(url1))
print(tinyURL(url2))

-1
1


In [13]:
# Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return -1            # legitimate
    
# Example usage
url1 = 'http://www.google.com'
url2 = 'http://www.go-ogle.com'

print(prefixSuffix(url1))
print(prefixSuffix(url2))

-1
1


In [14]:
# Apply the feature extraction functions to the URL column of the filtered data
data.loc[:,'embedded_domain'] = data.loc[:,'url'].apply(embedded_domain)
data.loc[:,'having_ip_address'] = data.loc[:,'url'].apply(having_ip_address)
data.loc[:,'no_of_dots'] = data.loc[:,'url'].apply(no_of_dots)
data.loc[:,'lexical_features'] = data.loc[:,'url'].apply(extract_lexical_features)
data.loc[:,'no_of_sensitive_words'] = data.loc[:,'url'].apply(no_of_sensitive_words)
data.loc[:,'out_of_position_tld'] = data.loc[:,'url'].apply(out_of_position_tld)
data.loc[:,'https_token'] = data.loc[:,'url'].apply(https_token)
data.loc[:,'url_length'] = data.loc[:,'url'].apply(url_length)
data.loc[:,'tinyURL'] = data.loc[:,'url'].apply(tinyURL)
data.loc[:,'prefixSuffix'] = data.loc[:,'url'].apply(prefixSuffix)

data.head()

Unnamed: 0,url,login_form,links_in_tags,submit_email,sfh,iframe,popup_window,safe_anchor,onmouseover,right_clic,...,embedded_domain,having_ip_address,no_of_dots,lexical_features,no_of_sensitive_words,out_of_position_tld,https_token,url_length,tinyURL,prefixSuffix
0,http://www.crestonwood.com/router.php,0,80.0,0,0,0,0,0.0,0,0,...,-1,1,3,"{'http:': 1, 'www': 1, 'crestonwood': 1, 'com'...",0,1,-1,37,-1,-1
1,http://shadetreetechnology.com/V4/validation/a...,0,100.0,0,0,0,0,100.0,0,0,...,1,1,1,"{'http:': 1, 'shadetreetechnology': 1, 'com': ...",0,1,-1,77,-1,-1
2,https://support-appleld.com.secureupdate.duila...,0,100.0,0,0,0,0,100.0,0,0,...,1,1,4,"{'https:': 1, 'support': 1, 'appleld': 1, 'com...",1,-1,1,126,-1,1
3,http://rgipt.ac.in,0,100.0,0,0,0,0,62.5,0,0,...,-1,1,2,"{'http:': 1, 'rgipt': 1, 'ac': 1, 'in': 1}",0,1,-1,18,-1,-1
4,http://www.iracing.com/tracks/gateway-motorspo...,1,76.470588,0,0,0,0,0.0,0,0,...,-1,1,2,"{'http:': 1, 'www': 1, 'iracing': 1, 'com': 1,...",0,1,-1,55,-1,-1


In [15]:
# print the value counts of each feature
for col in data.columns[1:]:
    print(data[col].value_counts())

login_form
0    10703
1      727
Name: count, dtype: int64
links_in_tags
0.000000      3403
100.000000    2851
50.000000      453
66.666667      345
75.000000      195
              ... 
68.000000        1
16.000000        1
48.780488        1
13.043478        1
91.176471        1
Name: count, Length: 473, dtype: int64
submit_email
0    11430
Name: count, dtype: int64
sfh
0    11430
Name: count, dtype: int64
iframe
0    11415
1       15
Name: count, dtype: int64
popup_window
0    11361
1       69
Name: count, dtype: int64
safe_anchor
0.000000      4438
100.000000    1732
50.000000      337
25.000000      319
14.285714      225
              ... 
32.142857        1
75.806452        1
87.272727        1
11.956522        1
17.500000        1
Name: count, Length: 1083, dtype: int64
onmouseover
0    11417
1       13
Name: count, dtype: int64
right_clic
0    11414
1       16
Name: count, dtype: int64
whois_registered_domain
0    10597
1      833
Name: count, dtype: int64
domain_registration_

In [16]:
# save the data to a new csv file named 'Data_processed.csv'
data.to_csv('/kaggle/working/Data_processed.csv', index=False)