In [1]:
import pandas as pd
import math

#### Data chosen

In [2]:
df = pd.read_csv('url_phill_extra.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


In [3]:
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns')

The dataset has 551652 rows and 4 columns


+ 345738 --> Legitimate [0]
+ 206381 --> Phishing [1]

#### Data Cleaning

In [4]:
df['label'] = df['label'].replace({'benign': 'legitimate', 'malicious': 'phishing'})
df.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,legitimate,0
1,1,https://www.youtube.com,legitimate,0
2,2,https://www.facebook.com,legitimate,0
3,3,https://www.baidu.com,legitimate,0
4,4,https://www.wikipedia.org,legitimate,0


In [5]:
df.rename(columns = {'result': 'Target', 'url': 'URL', 'label': 'Label'}, inplace = True)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target
0,0,https://www.google.com,legitimate,0
1,1,https://www.youtube.com,legitimate,0
2,2,https://www.facebook.com,legitimate,0
3,3,https://www.baidu.com,legitimate,0
4,4,https://www.wikipedia.org,legitimate,0


#### Data Understanding

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551652 entries, 0 to 551651
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  551652 non-null  int64 
 1   URL         551652 non-null  object
 2   Label       551652 non-null  object
 3   Target      551652 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 16.8+ MB


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,551652.0,275854.748294,159294.088711,0.0,137912.75,275825.5,413738.25,552118.0
Target,551652.0,0.373268,0.483673,0.0,0.0,0.0,1.0,1.0


In [None]:
# checking missing values
df.isnull().sum()

Unnamed: 0    0
URL           0
Label         0
Target        0
dtype: int64

In [9]:
# checking duplicate values
print(f'Duplicates: {df.duplicated().sum()}')

Duplicates: 0


#### Feature extraction

##### 1. URL length

In [10]:
def get_url_len(url):
    return len(url)

url = 'https://www.google.com'
print(get_url_len(url))

22


In [11]:
df['URL Length'] = df['URL'].apply(get_url_len)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length
0,0,https://www.google.com,legitimate,0,22
1,1,https://www.youtube.com,legitimate,0,23
2,2,https://www.facebook.com,legitimate,0,24
3,3,https://www.baidu.com,legitimate,0,21
4,4,https://www.wikipedia.org,legitimate,0,25


##### 2. No of digits in URL

In [12]:
def count_digits(url):
    count = 0
    for i in url:
        if '0' <= i <= '9':
            count += 1
    return count

url = 'http://writeassociate.com/test/Portal/inicio/IO8Hc30w_Eq8DfVjyJGvwEO4GhAnH48CqLwGx-uH4XXCpAPCJlRkBsaGmGQ6QgAIyLKwQ/www.Bancasaleon.com.do/bhdi/'
print(count_digits(url))

9


In [13]:
df['Digits count'] = df['URL'].apply(count_digits)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count
0,0,https://www.google.com,legitimate,0,22,0
1,1,https://www.youtube.com,legitimate,0,23,0
2,2,https://www.facebook.com,legitimate,0,24,0
3,3,https://www.baidu.com,legitimate,0,21,0
4,4,https://www.wikipedia.org,legitimate,0,25,0


##### 3. URL depth

In [14]:
def count_url_depth(url):
    url1 = url.split("://")[-1]
    return url1.count('/')

url = 'https://www.google.com'
print(count_url_depth(url))

0


In [15]:
df['URL Depth'] = df['URL'].apply(count_url_depth)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth
0,0,https://www.google.com,legitimate,0,22,0,0
1,1,https://www.youtube.com,legitimate,0,23,0,0
2,2,https://www.facebook.com,legitimate,0,24,0,0
3,3,https://www.baidu.com,legitimate,0,21,0,0
4,4,https://www.wikipedia.org,legitimate,0,25,0,0


##### 4. URL Entropy

In [16]:
def entropy(url):
    frequency = {}
    for char in url:
        if char in frequency:
            frequency[char] += 1
        else:
            frequency[char] = 1

    entropy = 0.0
    length = len(url)
    for count in frequency.values():
        probability = count / length
        if probability > 0:
            entropy -= probability * math.log2(probability)
    return round(entropy, 1) 

url = 'https://www.wikipedia.org/wiki/Entropy'
print(entropy(url))

4.0


In [17]:
df['URL Entropy'] = df['URL'].apply(entropy)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy
0,0,https://www.google.com,legitimate,0,22,0,0,3.7
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8


##### 5. URL Popularity Score (higher for legitimate)

In [18]:
def popularity_score(url):
    popular_tlds = ['.com', '.org', '.net', '.gov', '.edu']
    
    tld_score = 5 if any(url.endswith(tld) for tld in popular_tlds) else 2

    if len(url) < 20:
        length_score = 5
    elif len(url) < 30:
        length_score = 3
    else:
        length_score = 1


    subdomain_count = url.count('.') - 1
    subdomain_score = max(1, 5 - subdomain_count)  

    score = tld_score + length_score + subdomain_score
    return score

url = 'http://ip-23-229-147-12.ip.secureserver.net/public/files/products/specsheet/bookmark/ii.php?rand=13InboxLightaspxn.1774256418&amp;fid.4.1252899642&amp;fid=1&amp;fid=4&amp;fav.1&amp;fav.1&amp;rand.13InboxLight.aspxn.1774256418&amp;fid.1252899642&amp;fid.1&amp;email&amp;.rand=13InboxLight.aspx?n=1774256418'
print(popularity_score(url))

4


In [19]:
df['URL Popularity Score'] = df['URL'].apply(popularity_score)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12


##### 6. Domain of URL

In [20]:
def get_domain(url):
    if '://' in url:
        url = url.split('://')[1]
    if 'www.' in url:
        url = url.split('www.')[1]
    domain = url.split('/')[0]
    return domain

url = 'https://www.google.com'
print(get_domain(url))

google.com


In [21]:
df['Domain'] = df['URL'].apply(get_domain)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org


##### 7. Number of Sub-Domains

In [22]:
def count_subdomain(url):
    if '://' in url:
        url = url.split('://')[1]
    domain_part = url.split('/')[0]
    parts = domain_part.split('.')
    if len(parts) < 2:
        return 0
    return len(parts) - 2

url = 'https://www.community.livejournal.com/lollipop_lyrics/17646.html,benign'
print(count_subdomain(url))

2


In [23]:
df['Sub-domain count'] = df['URL'].apply(count_subdomain)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1


##### 8. Entropy of Domain

In [24]:
def calculate_entropy(url):
    domain = get_domain(url)
    frequency = {}
    for char in domain:
        if char in frequency:
            frequency[char] += 1
        else:
            frequency[char] = 1
            
    total_chars = len(domain)
    
    entropy = 0.0
    for count in frequency.values():
        probability = count / total_chars
        entropy -= probability * (0 if probability == 0 else log2(probability))
    
    return round(entropy, 1) 

def log2(x):
    return 3.321928094887362 * (x**(1/3.321928094887362))

url = 'https://www.archiver.rootsweb.ancestry.com/th/read/RABURN/1997-10/0875834683'
print(calculate_entropy(url))

-1.5


In [25]:
df['Domain Entropy'] = df['URL'].apply(calculate_entropy)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1,-1.9
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1,-1.8
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1,-1.8
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1,-1.7
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1,-1.7


##### 9. Path Length

In [26]:
def path_length(url):
    path_start = url.find('/', url.find('//') + 2)
    
    if path_start == -1:
        return 0
    
    path_end = url.find('?') if '?' in url else len(url)
    path = url[path_start:path_end]
    
    return len(path)


url = 'https://www.archiver.rootsweb.ancestry.com/th/read/RABURN/1997-10/0875834683'
print(path_length(url))

34


In [27]:
df['Path length'] = df['URL'].apply(path_length)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1,-1.9,0
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1,-1.8,0
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1,-1.8,0
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1,-1.7,0
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1,-1.7,0


##### 10. Count of Special characters

In [28]:
def count_special_char(url):
    special_char = ['-', '_', '.', '=', '?', '&', '@', '#', '%', ':', '~']
    return sum(url.count(char) for char in special_char)

url = 'http://writeassociate.com/test/Portal/inicio/IO8Hc30w_Eq8DfVjyJGvwEO4GhAnH48CqLwGx-uH4XXCpAPCJlRkBsaGmGQ6QgAIyLKwQ/www.Bancasaleon.com.do/bhdi/'
print(count_special_char(url))

7


In [29]:
df['Special Characters count'] = df['URL'].apply(count_special_char)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length,Special Characters count
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1,-1.9,0,3
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1,-1.8,0,3
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1,-1.8,0,3
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1,-1.7,0,3
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1,-1.7,0,3


##### 11. HTTPS

In [30]:
def has_https(url):
    if url.startswith('https://'):
        return 'Yes'
    else:
        return 'No'
    
url = 'http://writeassociate.com/test/Portal/inicio/IO8Hc30w_Eq8DfVjyJGvwEO4GhAnH48CqLwGx-uH4XXCpAPCJlRkBsaGmGQ6QgAIyLKwQ/www.Bancasaleon.com.do/bhdi/'
print(has_https(url))

No


In [31]:
df['Has https'] = df['URL'].apply(has_https)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length,Special Characters count,Has https
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1,-1.9,0,3,Yes
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1,-1.8,0,3,Yes
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1,-1.8,0,3,Yes
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1,-1.7,0,3,Yes
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1,-1.7,0,3,Yes


##### 12. Has IP in URL

In [32]:
def has_ip(url):
    main_part = url.split('/')[2] if '//' in url else url.split('/')[0]
    parts = main_part.split('.')
    
    if len(parts) == 4:
        for part in parts:
            if not part.isdigit() or not (0 <= int(part) <= 255):
                return 'No'
        return 'Yes'
    return 'No'

url = 'http://256.100.50.25/test'
print(has_ip(url))

No


In [33]:
df['Has IP'] = df['URL'].apply(has_ip)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length,Special Characters count,Has https,Has IP
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1,-1.9,0,3,Yes,No
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1,-1.8,0,3,Yes,No
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1,-1.8,0,3,Yes,No
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1,-1.7,0,3,Yes,No
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1,-1.7,0,3,Yes,No


##### 13. Has Suspicious Keywords

In [34]:
def has_suspicious_keywords(url):
    suspicious_keywords = [
        'login', 'verify', 'secure', 'account', 'pay',
        'bank', 'password', 'credential', 'bitcoin', 'confirm'
    ]
    url_lower = url.lower()
    
    for keyword in suspicious_keywords:
        if keyword in url_lower:
            return 'Yes'
    return 'No'

url = 'https://www.wcv2.com/1/login.php'
print(has_suspicious_keywords(url))

Yes


In [35]:
df['Has Suspicious keywords'] = df['URL'].apply(has_suspicious_keywords)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length,Special Characters count,Has https,Has IP,Has Suspicious keywords
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1,-1.9,0,3,Yes,No,No
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1,-1.8,0,3,Yes,No,No
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1,-1.8,0,3,Yes,No,No
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1,-1.7,0,3,Yes,No,No
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1,-1.7,0,3,Yes,No,No


##### 14. TLD Value

In [36]:
def get_tld(url):
    if '://' in url:
        url = url.split('://')[1]
    
    if url.startswith('www.'):
        url = url[4:]

    parts = url.split("/")[0].split(".")
    if len(parts) > 1:
        return parts[-1]
    return None

url = 'https://www.wikipedia.org/wiki/Entropy'
print(get_tld(url))

org


In [37]:
df['TLD'] = df['URL'].apply(get_tld)
df.head()

Unnamed: 0.1,Unnamed: 0,URL,Label,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length,Special Characters count,Has https,Has IP,Has Suspicious keywords,TLD
0,0,https://www.google.com,legitimate,0,22,0,0,3.7,12,google.com,1,-1.9,0,3,Yes,No,No,com
1,1,https://www.youtube.com,legitimate,0,23,0,0,3.8,12,youtube.com,1,-1.8,0,3,Yes,No,No,com
2,2,https://www.facebook.com,legitimate,0,24,0,0,3.9,12,facebook.com,1,-1.8,0,3,Yes,No,No,com
3,3,https://www.baidu.com,legitimate,0,21,0,0,3.9,12,baidu.com,1,-1.7,0,3,Yes,No,No,com
4,4,https://www.wikipedia.org,legitimate,0,25,0,0,3.8,12,wikipedia.org,1,-1.7,0,3,Yes,No,No,org


##### 15. Contains Suspicious TLD

In [38]:
def has_suspicious_tld(url):
    suspicious_tlds = [
        "xyz", "top", "club", "online", "site", "win", "click", "link",
        "bid", "ga", "download", "buzz", "loan", "review",
        "men", "party", "top", "fun", "space",
        "pw", "cn", "tk", "work"
    ]

    tld = get_tld(url)

    if tld and tld in suspicious_tlds:
        return 'Yes'
    return 'No'

url = 'https://thedroneshop.xyz/wp-content/l/'
print(has_suspicious_tld(url))

Yes


In [41]:
df['Has Suspicious TLD'] = df['URL'].apply(has_suspicious_tld)
df.head()

Unnamed: 0.1,Unnamed: 0,Target,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length,Special Characters count,Has https,Has IP,Has Suspicious keywords,TLD,Has Suspicious TLD,URL,Label
0,0,0,22,0,0,3.7,12,google.com,1,-1.9,0,3,Yes,No,No,com,No,https://www.google.com,legitimate
1,1,0,23,0,0,3.8,12,youtube.com,1,-1.8,0,3,Yes,No,No,com,No,https://www.youtube.com,legitimate
2,2,0,24,0,0,3.9,12,facebook.com,1,-1.8,0,3,Yes,No,No,com,No,https://www.facebook.com,legitimate
3,3,0,21,0,0,3.9,12,baidu.com,1,-1.7,0,3,Yes,No,No,com,No,https://www.baidu.com,legitimate
4,4,0,25,0,0,3.8,12,wikipedia.org,1,-1.7,0,3,Yes,No,No,org,No,https://www.wikipedia.org,legitimate


#### Re-arranging columns

In [42]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [44]:
df.columns

Index(['Target', 'URL Length', 'Digits count', 'URL Depth', 'URL Entropy',
       'URL Popularity Score', 'Domain', 'Sub-domain count', 'Domain Entropy',
       'Path length', 'Special Characters count', 'Has https', 'Has IP',
       'Has Suspicious keywords', 'TLD', 'Has Suspicious TLD', 'URL', 'Label'],
      dtype='object')

In [45]:
df = df[['URL', 'URL Length', 'Digits count', 'URL Depth', 'URL Entropy',
       'URL Popularity Score', 'Domain', 'Sub-domain count', 'Domain Entropy',
       'Path length', 'Special Characters count', 'Has https', 'Has IP',
       'Has Suspicious keywords', 'TLD', 'Has Suspicious TLD', 'Label','Target']]

In [46]:
df

Unnamed: 0,URL,URL Length,Digits count,URL Depth,URL Entropy,URL Popularity Score,Domain,Sub-domain count,Domain Entropy,Path length,Special Characters count,Has https,Has IP,Has Suspicious keywords,TLD,Has Suspicious TLD,Label,Target
0,https://www.google.com,22,0,0,3.7,12,google.com,1,-1.9,0,3,Yes,No,No,com,No,legitimate,0
1,https://www.youtube.com,23,0,0,3.8,12,youtube.com,1,-1.8,0,3,Yes,No,No,com,No,legitimate,0
2,https://www.facebook.com,24,0,0,3.9,12,facebook.com,1,-1.8,0,3,Yes,No,No,com,No,legitimate,0
3,https://www.baidu.com,21,0,0,3.9,12,baidu.com,1,-1.7,0,3,Yes,No,No,com,No,legitimate,0
4,https://www.wikipedia.org,25,0,0,3.8,12,wikipedia.org,1,-1.7,0,3,Yes,No,No,org,No,legitimate,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551647,https://ww.prestamo.enlinea.pe.vpphoangha.vn/,45,0,1,3.9,4,ww.prestamo.enlinea.pe.vpphoangha.vn,4,-1.5,1,6,Yes,No,No,vn,No,phishing,1
551648,http://goldenrod-motley-texture.glitch.me/hvwa...,51,0,1,4.2,6,goldenrod-motley-texture.glitch.me,1,-1.5,10,6,No,No,No,me,No,phishing,1
551649,https://bancolombia.com1home0892.repl.co/?2,43,6,1,4.3,6,bancolombia.com1home0892.repl.co,2,-1.5,1,5,Yes,No,No,co,No,phishing,1
551650,https://aol-108318.weeblysite.com/,34,6,1,4.3,7,aol-108318.weeblysite.com,1,-1.4,1,4,Yes,No,No,com,No,phishing,1


In [47]:
df.to_csv('FINAL.csv')