In [6]:
import numpy as np 
import pandas as pd
from urllib.parse import urlparse
from tld import get_tld

In [7]:
urldata1 = pd.read_csv('urldata1.csv', on_bad_lines='skip')
urldata1.head()

Unnamed: 0,url,result
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [8]:
#Removing the unnamed columns as it is not necesary.
#urldata1 = urldata1.drop(['Unnamed: 0','label'],axis=1)
urldata1 = urldata1.rename(columns = {"result":"label"})
urldata1.head()

Unnamed: 0,url,label
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [9]:
urls_data = pd.read_csv("urldata.csv")
urls_data.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [10]:
def convert_to_int(word):
    word_dict = {'bad':1, 'good':0}
    return word_dict[word]

urls_data['label'] = urls_data['label'].apply(lambda x : convert_to_int(x))
urls_data.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,1
1,espdesign.com.au,1
2,iamagameaddict.com,1
3,kalantzis.net,1
4,slightlyoffcenter.net,1


In [11]:
urldata = pd.concat([urldata1, urls_data], ignore_index = True)
urldata.head()

Unnamed: 0,url,label
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [12]:
from urllib.parse import urlparse

In [13]:
urldata['url_length'] = urldata['url'].apply(lambda i: len(str(i)))

In [14]:
#Hostname Length
urldata['hostname_length'] = urldata['url'].apply(lambda i: len(urlparse(i).netloc))

In [15]:
urlparse('https://www.google.com').netloc

'www.google.com'

In [16]:
#Path Length
urldata['path_length'] = urldata['url'].apply(lambda i: len(urlparse(i).path))

In [17]:
urlparse('https://www.youtube.com/watch?v=megsXkCLzVo&list=RDmegsXkCLzVo&start_radio=1').path

'/watch'

In [18]:
#First Directory Length
def fd_length(url):
    urlpath= urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except:
        return 0

urldata['fd_length'] = urldata['url'].apply(lambda i: fd_length(i))

In [19]:
#Length of Top Level Domain
urldata['tld'] = urldata['url'].apply(lambda i: get_tld(i,fail_silently=True))
def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1

urldata['tld_length'] = urldata['tld'].apply(lambda i: tld_length(i))
    



In [20]:
get_tld('https://www.youtube.com/watch?v=megsXkCLzVo&list=RDmegsXkCLzVo&start_radio=1')

'youtube.com'

In [21]:
urldata.head()

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length,tld,tld_length
0,https://www.google.com,0,22,14,0,0,google.com,10
1,https://www.youtube.com,0,23,15,0,0,youtube.com,11
2,https://www.facebook.com,0,24,16,0,0,facebook.com,12
3,https://www.baidu.com,0,21,13,0,0,baidu.com,9
4,https://www.wikipedia.org,0,25,17,0,0,wikipedia.org,13


In [22]:
urldata = urldata.drop("tld",axis=1)

In [23]:
urldata.head()

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length,tld_length
0,https://www.google.com,0,22,14,0,0,10
1,https://www.youtube.com,0,23,15,0,0,11
2,https://www.facebook.com,0,24,16,0,0,12
3,https://www.baidu.com,0,21,13,0,0,9
4,https://www.wikipedia.org,0,25,17,0,0,13


In [24]:
urldata['count-'] = urldata['url'].apply(lambda i: i.count('-'))
urldata['count@'] = urldata['url'].apply(lambda i: i.count('@'))
urldata['count?'] = urldata['url'].apply(lambda i: i.count('?'))
urldata['count.'] = urldata['url'].apply(lambda i: i.count('.'))
urldata['count='] = urldata['url'].apply(lambda i: i.count('='))
urldata['count-http'] = urldata['url'].apply(lambda i : i.count('http'))
urldata['count-https'] = urldata['url'].apply(lambda i : i.count('https'))
urldata['count-www'] = urldata['url'].apply(lambda i: i.count('www'))

In [25]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"
     

# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

In [26]:
# importing required packages for this section
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [27]:
# importing required packages for this section
import requests

In [28]:
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0

In [29]:
urldata['red'] = urldata['url'].apply(lambda i: redirection(i))
urldata['tiny_url'] = urldata['url'].apply(lambda i: tinyURL(i))

In [30]:
def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits
urldata['count-digits']= urldata['url'].apply(lambda i: digit_count(i))
def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters
urldata['count-letters']= urldata['url'].apply(lambda i: letter_count(i))
def no_of_dir(url):
    urldir = urlparse(url).path
    return urldir.count('/')
urldata['count_dir'] = urldata['url'].apply(lambda i: no_of_dir(i))

In [31]:
urldata.head()

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length,tld_length,count-,count@,count?,count.,count=,count-http,count-https,count-www,red,tiny_url,count-digits,count-letters,count_dir
0,https://www.google.com,0,22,14,0,0,10,0,0,0,2,0,1,1,1,0,0,0,17,0
1,https://www.youtube.com,0,23,15,0,0,11,0,0,0,2,0,1,1,1,0,0,0,18,0
2,https://www.facebook.com,0,24,16,0,0,12,0,0,0,2,0,1,1,1,0,0,0,19,0
3,https://www.baidu.com,0,21,13,0,0,9,0,0,0,2,0,1,1,1,0,0,0,16,0
4,https://www.wikipedia.org,0,25,17,0,0,13,0,0,0,2,0,1,1,1,0,0,0,20,0


In [32]:
urldata.tail()

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length,tld_length,count-,count@,count?,count.,count=,count-http,count-https,count-www,red,tiny_url,count-digits,count-letters,count_dir
793795,udemy.com,0,9,0,9,0,-1,0,0,0,1,0,0,0,0,0,0,0,8,0
793796,google,0,6,0,6,0,-1,0,0,0,0,0,0,0,0,0,0,0,6,0
793797,facebook,0,8,0,8,0,-1,0,0,0,0,0,0,0,0,0,0,0,8,0
793798,coursera,0,8,0,8,0,-1,0,0,0,0,0,0,0,0,0,0,0,8,0
793799,udemy,0,5,0,5,0,-1,0,0,0,0,0,0,0,0,0,0,0,5,0


In [33]:
from urllib.parse import urlparse

def has_sensitive_keywords(url):
    # Extract path from URL
    path = urlparse(url).path
    
    # List of sensitive keywords to check for
    keywords = ['login', 'banking', 'password', 'account',"password","account","verify","security","update","payment","confirmation","session","signin","logon","authenticate","validate","admin","billing","support"]
    
    for keyword in keywords:
        if keyword in path:
            return 1
    
    return 0




In [38]:
urldata['sensitive_key'] = urls_data['url'].apply(lambda i: has_sensitive_keywords(i))

In [39]:
import tldextract

def has_multiple_subdomains(url):
    # Extract domain and subdomains from URL
    domain = tldextract.extract(url).domain
    subdomains = tldextract.extract(url).subdomain.split('.')
    
    # Count number of subdomains
    num_subdomains = len(subdomains)
    return num_subdomains


In [40]:
urldata['mul_sub_domains'] = urls_data['url'].apply(lambda i: has_multiple_subdomains(i))

In [41]:
urldata.head()

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length,tld_length,count-,count@,count?,...,count-http,count-https,count-www,red,tiny_url,count-digits,count-letters,count_dir,sensitive_key,mul_sub_domains
0,https://www.google.com,0,22,14,0,0,10,0,0,0,...,1,1,1,0,0,0,17,0,0.0,1.0
1,https://www.youtube.com,0,23,15,0,0,11,0,0,0,...,1,1,1,0,0,0,18,0,0.0,1.0
2,https://www.facebook.com,0,24,16,0,0,12,0,0,0,...,1,1,1,0,0,0,19,0,0.0,1.0
3,https://www.baidu.com,0,21,13,0,0,9,0,0,0,...,1,1,1,0,0,0,16,0,0.0,1.0
4,https://www.wikipedia.org,0,25,17,0,0,13,0,0,0,...,1,1,1,0,0,0,20,0,0.0,1.0


In [42]:
urldata.to_csv('urls_data.csv', index=False)