# Phishing Website Detection Feature Extraction


# Objective:

1-Identify phishing websites that mimic trusted URLs and webpages.

2-Collect data and extract selective features from URLs.



# Collecting the Data:

# 1--Phishing URLs:

Source: PhishTank (https://www.phishtank.com/developer_info.php)
Provides a set of phishing URLs in formats like CSV and JSON, updated hourly.

# 2--Legitimate URLs:

Source: University of New Brunswick (https://www.unb.ca/cic/datasets/url-2016.html)
File: 'Benign_list_big_final.csv'
Contains 35,300 legitimate URLs.

In [None]:
import pandas as pd

In [3]:
df1=pd.read_csv("online_valid.csv")
##loading the phishing URLs data to dataframe

In [3]:
df1.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6557033,http://u1047531.cp.regruhosting.ru/acces-inges...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:43+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
1,6557032,http://hoysalacreations.com/wp-content/plugins...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:37+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
2,6557011,http://www.accsystemprblemhelp.site/checkpoint...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:54:31+00:00,yes,2020-05-09T21:55:38+00:00,yes,Facebook
3,6557010,http://www.accsystemprblemhelp.site/login_atte...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:53:48+00:00,yes,2020-05-09T21:54:34+00:00,yes,Facebook
4,6557009,https://firebasestorage.googleapis.com/v0/b/so...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:49:27+00:00,yes,2020-05-09T21:51:24+00:00,yes,Microsoft


In [38]:
df1.shape

(14858, 8)

In [40]:
phisurl=df1.sample(n=5000,random_state=12).copy()
phisurl=phisurl.reset_index(drop=True)
#Collecting 5,000 Phishing URLs randomly

In [41]:
phisurl.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6514946,http://confirmprofileaccount.com/,http://www.phishtank.com/phish_detail.php?phis...,2020-04-19T11:06:55+00:00,yes,2020-04-19T13:42:41+00:00,yes,Other
1,4927651,http://www.marreme.com/MasterAdmin/04mop.html,http://www.phishtank.com/phish_detail.php?phis...,2017-04-04T19:35:54+00:00,yes,2017-05-03T23:00:42+00:00,yes,Other
2,5116976,http://modsecpaststudents.com/review/,http://www.phishtank.com/phish_detail.php?phis...,2017-07-25T18:48:30+00:00,yes,2017-07-28T16:01:36+00:00,yes,Other
3,6356131,https://docs.google.com/forms/d/e/1FAIpQLScL6L...,http://www.phishtank.com/phish_detail.php?phis...,2020-01-13T20:13:37+00:00,yes,2020-01-17T01:55:38+00:00,yes,Other
4,6535965,https://oportunidadedasemana.com/americanas//?...,http://www.phishtank.com/phish_detail.php?phis...,2020-04-29T00:01:03+00:00,yes,2020-05-01T10:55:35+00:00,yes,Other


# As of now we collected 5000 phishing URLs. Now, we need to collect the legitimate URLs.

In [5]:
##Loading legitimate files 
df2=pd.read_csv("Benign_list_big_final.csv")
df2.columns = ['URLs']

In [8]:
df2.head()

Unnamed: 0,URLs
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...


In [9]:
df2.shape

(35377, 1)

In [6]:
legiurls=df2.sample(n=5000,random_state=11).copy()
legiurls=legiurls.reset_index(drop=True)
legiurls.head()

Unnamed: 0,URLs
0,https://ahrefs.com/content-explorer/overview/v...
1,http://espn.go.com/college-football/story/_/id...
2,http://indianexpress.com/article/world/neighbo...
3,http://thenextweb.com/apps/2011/07/24/the-comp...
4,http://twitter.com/home?status=%E3%83%8C%E3%81...


In [11]:
df2['URLs'][3]

'http://1337x.to/torrent/1145504/Avengers-Age-of-Ultron-2015-CAM-New-Audio-x264-CPG/'

# Feature Extraction:
In this step, features are extracted from the URLs dataset.

The extracted features are categorized into

1)Address Bar based Features

2)HTML & Javascript based Features


# 3.1. Address Bar Based Features:
Many features can be extracted that can be consided as address bar base features. Out of them, below mentioned were considered for this project.

*-Domain of URL

*-IP Address in URL

*-"@" Symbol in URL

*-Length of URL

*-Depth of URL

*-Redirection "//" in URL

*-"http/https" in Domain name

*-Using URL Shortening Services “TinyURL”

*-Prefix or Suffix "-" in Domain

In [7]:
#importing required packages for this section
from urllib.parse import urlparse,urlencode
import ipaddress
import re

In [8]:
# 1.Domain of the URL (Domain) 
def getDomain(url):  
  domain = urlparse(url).netloc
  if re.match(r"^www.",domain):
    domain = domain.replace("www.","")
  return domain

In [9]:
getDomain("http://1337x.to/torrent/1145504/Avengers-Age-of-Ultron-2015-CAM-New-Audio-x264-CPG/")

'1337x.to'

In [10]:
# 2.Checks for IP address in URL (Have_IP)
def havingIP(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip

In [16]:
havingIP('http://1337x.to/torrent/1145504/Avengers-Age-of-Ultron-2015-CAM-New-Audio-x264-CPG/')

0

In [11]:
# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
  if "@" in url:
    at = 1    
  else:
    at = 0    
  return at

In [12]:
# 4.Finding the length of URL and categorizing (URL_Length)
def getLength(url):
  if len(url) < 54:
    length = 0            
  else:
    length = 1            
  return length

In [13]:
# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(url):
  s = urlparse(url).path.split('/')
  depth = 0
  for j in range(len(s)):
    if len(s[j]) != 0:
      depth = depth+1
  return depth

In [14]:
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0

In [15]:
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
  domain = urlparse(url).netloc
  if 'https' in domain:
    return 1
  else:
    return 0

In [16]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

In [17]:
# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0
     

In [18]:
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate

# Domain Based Features:
Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.

*-DNS Record

*-Website Traffic

*-Age of Domain

*-End Period of Domain

In [19]:
# importing required packages for this section
import requests

In [20]:
# 15. IFrame Redirection (iFrame)
def iframe(response):
  if response == "":
      return 1
  else:
      if re.findall(r"[|]", response.text):
          return 0
      else:
          return 1

In [21]:
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response): 
  if response == "" :
    return 1
  else:
    if re.findall("", response.text):
      return 1
    else:
      return 0

In [22]:
# 17.Checks the status of the right click attribute (Right_Click)
def rightClick(response):
  if response == "":
    return 1
  else:
    if re.findall(r"event.button ?== ?2", response.text):
      return 0
    else:
      return 1

In [23]:
# 18.Checks the number of forwardings (Web_Forwards)    
def forwarding(response):
  if response == "":
    return 1
  else:
    if len(response.history) <= 2:
      return 0
    else:
      return 1
     

In [24]:
#Function to extract features
def featureExtraction(url,label):

  features = []
  #Address bar based features (10)
  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))

  # HTML & Javascript based features (4)
  try:
    response = requests.get(url)
  except:
    response = ""
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(rightClick(response))
  features.append(forwarding(response))
  features.append(label)

  return features

In [28]:
#Extracting the feautres & storing them in a list
legi_features = []
label = 0
for i in range(0, 1000):
  url = legiurls['URLs'][i]
  legi_features.append(featureExtraction(url,label))

In [31]:
len(legi_features)


1000

In [32]:
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

legitimate = pd.DataFrame(legi_features, columns= feature_names)


Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,ahrefs.com,0,0,1,8,0,0,0,0,0,1,1,0,0
1,espn.go.com,0,0,1,6,0,0,0,0,1,1,1,0,0
2,indianexpress.com,0,0,1,5,0,0,0,0,0,1,1,0,0
3,thenextweb.com,0,0,1,7,0,0,0,0,0,1,1,0,0
4,twitter.com,0,0,1,1,0,0,0,0,1,1,1,1,0


In [None]:
# Storing the extracted legitimate URLs fatures to csv file
legitimate.to_csv('legitimate.csv', index= False)

In [6]:
pd.read_csv('legitimate.csv').head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,graphicriver.net,0,0,1,1,0,0,0,0,0,0,1,0,0
1,ecnavi.jp,0,0,1,1,1,0,0,0,0,0,1,0,0
2,hubpages.com,0,0,1,1,0,0,0,0,0,0,1,0,0
3,extratorrent.cc,0,0,1,3,0,0,0,0,0,0,1,0,0
4,icicibank.com,0,0,1,3,0,0,0,0,0,0,1,0,0


In [None]:
#Extracting the feautres & storing them in a list
phish_features = []
label = 1
for i in range(0, 5000):
  url = phisurl['url'][i]
  phish_features.append(featureExtraction(url,label))

In [None]:
#converting the list to dataframe
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

phishing = pd.DataFrame(phish_features, columns= feature_names)


In [None]:
# Storing the extracted legitimate URLs fatures to csv file
phishing.to_csv('phishing.csv', index= False)
     

In [7]:
pd.read_csv('phishing.csv').head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,Tiny_URL,Prefix/Suffix,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,eevee.tv,0,0,0,4,0,0,0,0,0,0,1,0,1
1,appleid.apple.com-sa.pm,0,0,0,1,0,0,0,1,0,0,1,0,1
2,grandcup.xyz,0,0,0,0,0,0,0,0,1,1,1,1,1
3,villa-azzurro.com,0,0,0,1,0,0,0,1,0,0,1,0,1
4,mygpstrip.net,0,0,0,2,0,0,0,0,0,0,1,0,1


In [None]:
#Concatenating the dataframes into one 
urldata = pd.concat([legitimate, phishing]).reset_index(drop=True)

In [None]:
urldata.to_csv('urldata.csv', index=False)
     

In [8]:
pd.read_csv('urldata.csv').head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,graphicriver.net,0,0,1,1,0,0,0,0,0,1,1,1,0,0,1,0,0
1,ecnavi.jp,0,0,1,1,1,0,0,0,0,1,1,1,0,0,1,0,0
2,hubpages.com,0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0
3,extratorrent.cc,0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0,0
4,icicibank.com,0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0,0
