# Preprocessing


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the URL list into a DataFrame; later cells read its `websites` column.
# NOTE(review): "C:100-legitimate-art.txt" looks like a drive-relative Windows path, so it
# depends on the machine and its current directory — confirm the intended location.
raw_data = pd.read_csv("C:100-legitimate-art.txt")

In [3]:
raw_data.head()
#Dataset

Unnamed: 0,websites
0,http://www.emuck.com:3000/archive/egan.html
1,http://danoday.com/summit.shtml
2,http://groups.yahoo.com/group/voice_actor_appr...
3,http://voice-international.com/
4,http://www.livinglegendsltd.com/


In [3]:
raw_data['websites'].str.split("://").head()
#Splitting Protocol

0         [http, www.emuck.com:3000/archive/egan.html]
1                     [http, danoday.com/summit.shtml]
2    [http, groups.yahoo.com/group/voice_actor_appr...
3                     [http, voice-international.com/]
4                    [http, www.livinglegendsltd.com/]
Name: websites, dtype: object

In [4]:
# Split each URL at "://" into separate columns: column 0 holds the protocol,
# column 1 holds everything that follows it.
seperation_of_protocol = raw_data['websites'].str.split("://",expand = True) 

In [5]:
seperation_of_protocol.head()

Unnamed: 0,0,1
0,http,www.emuck.com:3000/archive/egan.html
1,http,danoday.com/summit.shtml
2,http,groups.yahoo.com/group/voice_actor_appreciatio...
3,http,voice-international.com/
4,http,www.livinglegendsltd.com/


In [6]:
# Split the scheme-less URL at the first "/" only, giving the domain name
# (column 0) and the remaining address (column 1).
# `n` is passed by keyword: pandas 2.x rejects it as a positional argument.
seperation_domain_name = seperation_of_protocol[1].str.split("/", n=1, expand=True)

In [7]:
seperation_domain_name.columns=["domain_name","address"] 

In [8]:
seperation_domain_name.head()

Unnamed: 0,domain_name,address
0,www.emuck.com:3000,archive/egan.html
1,danoday.com,summit.shtml
2,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,voice-international.com,
4,www.livinglegendsltd.com,


In [9]:
#Concatenating data frames
splitted_data = pd.concat([seperation_of_protocol[0],seperation_domain_name],axis=1)


In [10]:
splitted_data.columns = ['protocol','domain_name','address']

In [11]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address
0,http,www.emuck.com:3000,archive/egan.html
1,http,danoday.com,summit.shtml
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,http,voice-international.com,
4,http,www.livinglegendsltd.com,


## Features Extraction

1.Long URL to Hide the Suspicious Part

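If the URL is shorter than 54 characters it is classified as legitimate; if it is between 54 and 75 characters long (inclusive) it is classified as suspicious; if it is longer than 75 characters it is classified as phishing.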


0 --- indicates legitimate

1 --- indicates Phishing

2 --- indicates Suspicious

In [12]:
def long_url(l):
    """Classify a URL by its character count.

    Returns 0 (legitimate) for fewer than 54 characters, 2 (suspicious)
    for 54 to 75 characters inclusive, and 1 (phishing) for anything longer.
    """
    length = len(l)
    if length > 75:
        return 1
    return 2 if length >= 54 else 0

In [13]:
#Add above result to dataset
splitted_data['long_url'] = raw_data['websites'].apply(long_url) 


In [14]:
splitted_data[splitted_data.long_url == 0].head()

Unnamed: 0,protocol,domain_name,address,long_url
0,http,www.emuck.com:3000,archive/egan.html,0
1,http,danoday.com,summit.shtml,0
3,http,voice-international.com,,0
4,http,www.livinglegendsltd.com,,0
5,http,voicechasers.com,forum/viewforum.php?f=8,0


Feature-2

2.URL’s having “@” Symbol

Using “@” symbol in the URL leads the browser to ignore everything preceding the “@” symbol and the real address often follows the “@” symbol.

0 --- indicates legitimate

1 --- indicates Phishing


In [15]:
def have_at_symbol(l):
    """Return 1 (phishing) when the URL contains "@", otherwise 0 (legitimate)."""
    return int("@" in l)
    

In [16]:
splitted_data['having_@_symbol'] = raw_data['websites'].apply(have_at_symbol)

In [17]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol
0,http,www.emuck.com:3000,archive/egan.html,0,0
1,http,danoday.com,summit.shtml,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0
3,http,voice-international.com,,0,0
4,http,www.livinglegendsltd.com,,0,0


Feature-3

3.Redirecting using “//”

0 --- indicates legitimate

1 --- indicates Phishing


In [18]:
def redirection(l):
    """Return 1 (phishing) when the text contains "//", otherwise 0 (legitimate).

    The notebook applies this to the URL with its protocol prefix already
    split off, so the "//" of "http://" itself is not counted.
    """
    has_double_slash = "//" in l
    return 1 if has_double_slash else 0

In [19]:
splitted_data['redirection_//_symbol'] = seperation_of_protocol[1].apply(redirection)

In [20]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol
0,http,www.emuck.com:3000,archive/egan.html,0,0,0
1,http,danoday.com,summit.shtml,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0
3,http,voice-international.com,,0,0,0
4,http,www.livinglegendsltd.com,,0,0,0


Feature-4

4.Adding Prefix or Suffix Separated by (-) to the Domain
    
1 --> indicates phishing

0 --> indicates legitimate
    

In [21]:
def prefix_suffix_seperation(l):
    """Return 1 (phishing) when the domain name contains "-", otherwise 0 (legitimate)."""
    return 0 if '-' not in l else 1

In [22]:
splitted_data['prefix_suffix_seperation'] = seperation_domain_name['domain_name'].apply(prefix_suffix_seperation)

In [23]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0
3,http,voice-international.com,,0,0,0,1
4,http,www.livinglegendsltd.com,,0,0,0,0


Feature - 5

5. Sub-Domain and Multi Sub-Domains

0 --- indicates legitimate

1 --- indicates Phishing

2 --- indicates Suspicious


In [24]:
def sub_domains(l):
    """Classify a domain name by the number of dots it contains.

    Returns 0 (legitimate) for fewer than three dots, 2 (suspicious) for
    exactly three, and 1 (phishing) for more than three.
    """
    dot_count = l.count('.')
    if dot_count == 3:
        return 2
    return 0 if dot_count < 3 else 1

In [25]:
splitted_data['sub_domains'] = splitted_data['domain_name'].apply(sub_domains)

In [26]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0


Feature-6

6.Using the IP Address

1 --> indicates phishing

0 --> indicates legitimate

In [27]:
import re
def having_ip_address(url):
    """Return 1 (phishing) when the URL contains an IP address, otherwise 0.

    Three forms are recognised anywhere in the string:
      * dotted-decimal IPv4 followed by "/"      (e.g. 125.98.3.123/)
      * dotted-hexadecimal IPv4 followed by "/"  (e.g. 0x58.0xCC.0xCA.0x62/)
      * full eight-group IPv6                    (e.g. 2001:0db8:...:7334)

    The three alternatives are joined with "|". Previously the hexadecimal
    and IPv6 patterns were concatenated with no separator, so a URL only
    matched if a hex IPv4 address was immediately followed by an IPv6
    address; neither form was ever detected on its own.
    """
    decimal_octet = r'(?:[01]?\d\d?|2[0-4]\d|25[0-5])'
    hex_octet = r'(?:0x[0-9a-fA-F]{1,2})'
    ip_pattern = '|'.join((
        r'(?:' + decimal_octet + r'\.){3}' + decimal_octet + r'/',  # IPv4
        r'(?:' + hex_octet + r'\.){3}' + hex_octet + r'/',          # IPv4 in hexadecimal
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}',                # IPv6
    ))
    return 1 if re.search(ip_pattern, url) else 0


In [28]:
splitted_data['having_ip_address'] = raw_data['websites'].apply(having_ip_address)

In [29]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0


Feature-7

7.Using URL Shortening Services
         
1 --> indicates phishing

0 --> indicates legitimate

In [30]:
def shortening_service(url):
    """Return 1 (phishing) when the URL's host is a known shortening service, else 0.

    Only the host part of the URL is examined, and it must equal a listed
    shortener domain or be a sub-domain of one. The earlier implementation
    ran an unanchored substring search over the whole URL, so ordinary
    hosts such as "microsoft.com" (which contains "t.co") and any path
    mentioning a shortener were flagged. Host comparison is case-insensitive.
    """
    shortener_domains = frozenset((
        'bit.ly', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co',
        'tinyurl.com', 'tr.im', 'is.gd', 'cli.gs', 'yfrog.com', 'migre.me',
        'ff.im', 'tiny.cc', 'url4.eu', 'twit.ac', 'su.pr', 'twurl.nl',
        'snipurl.com', 'short.to', 'budurl.com', 'ping.fm', 'post.ly',
        'just.as', 'bkite.com', 'snipr.com', 'fic.kr', 'loopt.us', 'doiop.com',
        'short.ie', 'kl.am', 'wp.me', 'rubyurl.com', 'om.ly', 'to.ly', 'bit.do',
        'lnkd.in', 'db.tt', 'qr.ae', 'adf.ly', 'bitly.com', 'cur.lv', 'ity.im',
        'q.gs', 'po.st', 'bc.vc', 'twitthis.com', 'u.to', 'j.mp', 'buzurl.com',
        'cutt.us', 'u.bb', 'yourls.org', 'prettylinkpro.com', 'scrnch.me',
        'filoops.info', 'vzturl.com', 'qr.net', '1url.com', 'tweez.me', 'v.gd',
        'link.zip.net',
    ))
    # Optional scheme, optional "user@" part, then the host up to the first
    # "/", "?", "#" or ":" (port). Every piece is optional, so this always matches.
    host_match = re.match(r'(?:[a-zA-Z][a-zA-Z0-9+.\-]*://)?(?:[^/?#]*@)?([^/?#:]*)', url)
    labels = host_match.group(1).lower().rstrip('.').split('.')
    # Test the host and each of its parent domains (a.b.c -> a.b.c, b.c, c).
    for start in range(len(labels)):
        if '.'.join(labels[start:]) in shortener_domains:
            return 1
    return 0



In [31]:
splitted_data['shortening_service'] = raw_data['websites'].apply(shortening_service)

In [32]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address,shortening_service
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0,0,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0,0


Feature - 8 

8.The Existence of “HTTPS” Token in the Domain Part of the URL

In [33]:
def https_token(url):
    """Return 1 (phishing) when the domain part of the URL contains "http", else 0.

    A leading "http://" or "https://" scheme is removed first, so only a
    token embedded in the host itself (e.g. "https-paypal.example.com")
    is flagged. Searching for "http" also covers "https".

    Changes from the earlier implementation:
      * a URL without any scheme no longer raises AttributeError;
      * only the domain part (text before the first "/", "?" or "#") is
        searched, as the feature is defined, so an "http" in the path or
        query string is no longer counted.
    """
    scheme = re.match(r'https?://', url, re.IGNORECASE)
    remainder = url[scheme.end():] if scheme else url
    domain = re.split(r'[/?#]', remainder, maxsplit=1)[0]
    return 1 if 'http' in domain.lower() else 0


In [34]:
splitted_data['https_token'] = raw_data['websites'].apply(https_token)

In [35]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address,shortening_service,https_token
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0,0,0,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0,0,0


Feature - 9

9.Website Traffic

Phishing sites may not be recognized by the Alexa database (Alexa the Web Information Company).

    IF Website Rank < 100,000 → Legitimate (0)
       Website Rank ≥ 100,000 → Suspicious (2)
       Otherwise (no rank found) → Phishing (1)

In [36]:
from bs4 import BeautifulSoup
import urllib.request
def web_traffic(url, timeout=10):
    """Classify a URL by the rank reported by the Alexa data endpoint.

    Parameters:
        url: the URL to look up.
        timeout: seconds to wait for the HTTP request (default 10).

    Returns:
        0 (legitimate) when the rank is below 100,000,
        2 (suspicious) when the rank is 100,000 or higher,
        1 (phishing) when no rank can be obtained. That covers a response
        with no REACH/RANK entry, and also a request that fails or times
        out, so one unreachable lookup does not abort a whole `.apply` run.

    NOTE(review): this depends on data.alexa.com still answering; Alexa's
    ranking service is believed to have been retired in 2022, in which case
    every URL would score 1 here. Confirm before relying on this feature.
    """
    from urllib.parse import quote

    # Percent-encode the URL so characters such as "&", "#" or spaces in it
    # cannot break or truncate the query string.
    lookup = "http://data.alexa.com/data?cli=10&dat=s&url=" + quote(url, safe="")
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(lookup, timeout=timeout) as response:
            payload = response.read()
        rank = int(BeautifulSoup(payload, "xml").find("REACH")['RANK'])
    except (TypeError, KeyError, ValueError, OSError):
        # TypeError: no REACH element; KeyError: no RANK attribute;
        # ValueError: rank is not an integer; OSError: network failure,
        # HTTP error or timeout.
        return 1
    return 0 if rank < 100000 else 2

In [37]:
splitted_data['web_traffic'] = raw_data['websites'].apply(web_traffic)

In [38]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address,shortening_service,https_token,web_traffic
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0,0,0,1
1,http,danoday.com,summit.shtml,0,0,0,0,0,0,0,0,2
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0,0,0,0,1
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0,0,0,1


Feature - 10

10.Domain Registration Length

IF the domain expires in ≤ 1 year → Phishing (1)

         Otherwise → Legitimate (0)

(The code also returns 2 when the WHOIS record lists several expiration dates.)

In [47]:
import whois
from datetime import datetime
import time
def domain_registration_length_sub(domain):
    """Classify a WHOIS record by how long remains until the domain expires.

    Parameters:
        domain: an object exposing an `expiration_date` attribute.

    Returns:
        1 (phishing) when there is no expiration date, or the domain has
          already expired or expires within 365 days;
        2 when `expiration_date` is a list of dates rather than one date;
        0 (legitimate) when more than 365 days remain.

    The remaining time is deliberately signed. The earlier code took its
    absolute value, so a domain that expired years ago was counted as
    having a long registration and scored as legitimate.
    """
    expiration_date = domain.expiration_date
    if expiration_date is None:
        return 1
    if isinstance(expiration_date, list):
        return 2
    # Today's date at midnight, so the comparison works in whole days.
    today = datetime.strptime(time.strftime('%Y-%m-%d'), '%Y-%m-%d')
    days_remaining = (expiration_date - today).days
    return 1 if days_remaining <= 365 else 0

    
    
    

In [51]:
def domain_registration_length_main(domain):
    """Look up a domain's WHOIS record and score its registration length.

    Returns 1 (phishing) when the WHOIS lookup raises, otherwise whatever
    `domain_registration_length_sub` returns for the record.

    Only `Exception` is caught. The earlier bare `except:` also swallowed
    KeyboardInterrupt and SystemExit, so a long `.apply` run over many
    domains could not be interrupted.
    """
    try:
        whois_record = whois.whois(domain)
    except Exception:
        # Any lookup failure is scored as phishing, as before.
        return 1
    return domain_registration_length_sub(whois_record)
    

In [52]:
splitted_data['domain_registration_length'] = splitted_data['domain_name'].apply(domain_registration_length_main)

In [53]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address,shortening_service,https_token,web_traffic,domain_registration_length
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0,0,0,1,1
1,http,danoday.com,summit.shtml,0,0,0,0,0,0,0,0,2,1
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0,0,0,0,2
3,http,voice-international.com,,0,0,0,1,0,0,0,0,1,1
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0,0,0,1,1
