# Feature Extraction

In [1]:
import ipaddress
import re
import urllib.request
from bs4 import BeautifulSoup
import socket
import requests
from googlesearch import search
import whois
from datetime import datetime
import time
from dateutil.parser import parse as date_parse
import dns
import OpenSSL
import ssl

def IPv4NumCheck(s):
	try:
		return str(int(s)) == s and 0 <= int(s) <= 255
	except: 
		return False

def IPv6NumCheck(s):
	if len(s) > 4:
		return False
	try : 
		return int(s, 16) >= 0 and s[0] != '-'
	except:
		return False
def is_registered(domain_name):
	"""
	A function that returns a boolean indicating 
	whether a `domain_name` is registered
	"""
	try:
		w = whois.whois(domain_name)
	except Exception:
		return False
	else:
		return bool(w.domain_name)

def get_certificate(host, port=443, timeout=10):
	context = ssl.create_default_context()
	conn = socket.create_connection((host, port))
	sock = context.wrap_socket(conn, server_hostname=host)
	sock.settimeout(timeout)
	try:
		der_cert = sock.getpeercert(True)
	finally:
		sock.close()
	return ssl.DER_cert_to_PEM_cert(der_cert)

def features(url):
	url_feature=[]
	#Converting to http
	if not re.match(r"^https?", url):
		url = "http://" + url
	#extracting domain 
	domain = re.findall(r"://([^/]+)/?", url)[0]
	if re.match(r"^www.",domain ):
		domain = domain.replace("www.","")   
	domain_n=domain   
	if domain_n.find('.com')!=-1:
		domain_n=domain_n.replace(".com","")

	#Getting response from URL
	try:
		response = requests.get(url)
		soup = BeautifulSoup(response.text, 'html.parser')
	except:
		response = ""
		soup = -999

	#1st feature:Contains Url
	
	if((domain_n.count(".") == 3 and all(IPv4NumCheck(i) for i in domain_n.split("."))) or (domain_n.count(":") == 7 and all(IPv6NumCheck(i) for i in domain_n.split(":")))):
		url_feature.append(-1)
	else:
		url_feature.append(1)
	
    
	#2nd feature:Length of URL
	
	if(len(url)<54):
		url_feature.append(1)
	elif(len(url)>=54 and len(url)<=75):
		url_feature.append(0)
	else:
		url_feature.append(-1)
	
    
	#3rd feature:Tinyurl Check
	
	tiny=re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
					'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
					'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
					'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
					'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
					'q\.gs|is\.gd|po\.st|bc\.vc',url)
	if tiny:
		url_feature.append(-1)
	else:
		url_feature.append(1)
	
    
	#4th feature:@ in url
	
	if(url.find('@')!=-1):
		url_feature.append(-1)
	else:
		url_feature.append(1)
	
    
	#5th feature:Redirecting i.e. '//' after posn 7
	
	if(url.find('//',7)!=-1):
		url_feature.append(-1)
	else:
		url_feature.append(1)
	
    
	#6th feature:Prefix/Suffix seperated by '-'
	
	if(domain.find('-')!=-1):
		url_feature.append(-1)
	else:
		url_feature.append(1)
	
    
	#7th feature:Dots in domain
	
	if(domain.count('.')==1):
		url_feature.append(1)
	elif(domain.count('.')==2):
		url_feature.append(0)
	else:
		url_feature.append(-1)
	
	#8th Feature:Https
	"""
	certificate = get_certificate('domain')
	x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, certificate)
	b=x509.get_notBefore()
	z=b.decode('UTF-8')
	cert_creation=datetime.strptime(z, '%Y%m%d%H%M%SZ')
	try:
		today = time.strftime('%Y-%m-%d')
		current_time=datetime.strptime(today, '%Y-%m-%d')	
		ssl_age=abs((current_time-cert_creation).days)
		if(ssl_age>365):
			url_feature.append(1)
		elif(ssl_age<365 and response==""):
			url_feature.append(0)
		else:
			url_feature.append(-1)
	except:
		url_feature.append(-1)
	"""
	
	try:
		if response.text:
			url_feature.append(1)
	except:
		url_feature.append(-1)
	
	#9th Feature:domain registration period
	
	domain_info = whois.whois(domain)
	expiration_date = domain_info.expiration_date
	try:
		expiration_date = min(expiration_date)
		today = time.strftime('%Y-%m-%d')
		current_time=datetime.strptime(today, '%Y-%m-%d')	
		regis_length=abs((expiration_date-current_time).days)
		if regis_length <=365 :
			url_feature.append(-1)
		else:
			url_feature.append(1)
	except:
		url_feature.append(-1)
	
    
	#10th Feature:Favicon
	
	if soup == -999:
		url_feature.append(-1)
	else:
		try:
			for head in soup.find_all('head'):
				for head.link in soup.find_all('link', href=True):
					dots = [x.start(0) for x in re.finditer('\.', head.link['href'])]
					if url in head.link['href'] or len(dots) == 1 or domain in head.link['href']:
						url_feature.append(1)
						raise StopIteration
					else:
						url_feature.append(-1)
						raise StopIteration
		except StopIteration:
			pass
      
	if(len(url_feature)==9):
		url_feature.append(-1)
    
	#11th Feature:Check for open ports
	
	try:
		port = domain.split(":")[1]
		if port:
			url_feature.append(-1)
		else:
			url_feature.append(1)
	except:
		url_feature.append(1)
	

	#12th feature:Check if website has https token
	
	if(url.find('https',7)!=-1):
		url_feature.append(-1)
	else:
		url_feature.append(1)
	

	#13th feature : Check url
	
	i = 0
	success = 0
	if soup == -999:
		url_feature.append(-1)
	else:
		for img in soup.find_all('img', src= True):
			dots= [x.start(0) for x in re.finditer('\.', img['src'])]
			if url in img['src'] or domain in img['src'] or len(dots)==1:
				success = success + 1
			i=i+1

		for audio in soup.find_all('audio', src= True):
			dots = [x.start(0) for x in re.finditer('\.', audio['src'])]
			if url in audio['src'] or domain in audio['src'] or len(dots)==1:
				success = success + 1
			i=i+1

		for embed in soup.find_all('embed', src= True):
			dots=[x.start(0) for x in re.finditer('\.',embed['src'])]
			if url in embed['src'] or domain in embed['src'] or len(dots)==1:
				success = success + 1
			i=i+1

		for iframe in soup.find_all('iframe', src= True):
			dots=[x.start(0) for x in re.finditer('\.',iframe['src'])]
			if url in iframe['src'] or domain in iframe['src'] or len(dots)==1:
				success = success + 1
			i=i+1

		try:
			percentage = success/float(i) * 100
			if percentage < 22.0 :
				url_feature.append(1)
			elif((percentage >= 22.0) and (percentage < 61.0)) :
				url_feature.append(0)
			else :
				url_feature.append(-1)
		except:
			url_feature.append(1)
		percentage = 0
	
	#14th feature:Check links in anchor
	
	i = 0
	unsafe=0
	if soup == -999:
		url_feature.append(-1)
	else:
		for a in soup.find_all('a', href=True):
		# 2nd condition was 'JavaScript ::void(0)' but we put JavaScript because the space between javascript and :: might not be
				# there in the actual a['href']
			if "#" in a['href'] or "javascript" in a['href'].lower() or "mailto" in a['href'].lower() or not (url in a['href'] or domain in a['href']):
				unsafe = unsafe + 1
			i = i + 1


		try:
			percentage = unsafe / float(i) * 100
		except:
			url_feature.append(1)

		if percentage < 31.0:
			url_feature.append(1)
		elif ((percentage >= 31.0) and (percentage < 67.0)):
			url_feature.append(0)
		else:
			url_feature.append(-1)
	
    
	#15th feature:Check links in tags
	
	i=0
	success =0
	if soup == -999:
		url_feature.append(-1)
	else:
		for link in soup.find_all('link', href= True):
			dots=[x.start(0) for x in re.finditer('\.',link['href'])]
			if url in link['href'] or domain in link['href'] or len(dots)==1:
				success = success + 1
			i=i+1

		for script in soup.find_all('script', src= True):
			dots=[x.start(0) for x in re.finditer('\.',script['src'])]
			if url in script['src'] or domain in script['src'] or len(dots)==1 :
				success = success + 1
			i=i+1
		try:
			percentage = success / float(i) * 100
		except:
			url_feature.append(1)

		if percentage < 17.0 :
			url_feature.append(1)
		elif((percentage >= 17.0) and (percentage < 81.0)) :
			url_feature.append(0)
		else:
			url_feature.append(-1)
	
	#16th feature:Server Form Handler
	try:
		for form in soup.find_all('form', action= True):
			if form['action'] =="" or form['action'] == "about:blank" :
				url_feature.append(-1)
				break
			elif url not in form['action'] and domain not in form['action']:
				url_feature.append(0)
				break
			else:
				url_feature.append(1)
				break
	except:
		url_feature.append(-1)
    
	#17th Feature:Check if submitting to email
	
	if response == "":
		url_feature.append(-1)
	else:
		#if open('response.txt','r').read().find('mailto:' or 'mail(') != -1:
		if re.findall(r"[mail\(\)|mailto:?]", response.text):
			url_feature.append(-1)
		else:
			url_feature.append(1)
	

	#18th Feature:Check for abnormal url
	
	if(response == ""):
		url_feature.append(-1)
	else:
		if(is_registered(domain)):
			url_feature.append(1)
		else:
			url_feature.append(-1)
	

	#19th Feature:Check for number of responses/redirecting page
	
	if(response ==""):
		url_feature.append(-1)
	else:
		if(len(response.history) <=1):
			url_feature.append(1)
		elif(len(response.history)>1 and len(response.history)<=4):
			url_feature.append(0)
		else:
			url_feature.append(-1)
	

	#20th Feature:If fake url is displayed in Status bas
	
	if response == "":
		url_feature.append(-1)
	else:
		if(re.findall("<script>.+On_mouseover.+</script>",response.text)):
			url_feature.append(-1)
		else:
			url_feature.append(1)
	
    
	#21st Feature:Check if right click is disabled
	
	if response == "":
		url_feature.append(-1)
	else:
		if re.findall(r"event.button ?== ?2", response.text):
			url_feature.append(-1)
		else:
			url_feature.append(1)
	
    
	#22 Feature: Check pop up widnow contains text
	
	if response == "":
		url_feature.append(-1)
	else:
		if re.findall(r"alert\(", response.text):
			url_feature.append(-1)
		else:
			url_feature.append(1)
	

	#23 Feature: Check for use of frameBorder
	
	if response == "":
		url_feature.append(-1)
	else:
		if re.findall(r"[<iframe>|<frameBorder>]", response.text):
			url_feature.append(1)
		else:
			url_feature.append(-1)
	
    
	#24th Feature : Check for age of domain
	
	domain_info = whois.whois(domain)
	creation_date = domain_info.creation_date
	regis_length=0
	try:
		creation_date = min(creation_date)
		today = time.strftime('%Y-%m-%d')
		current_time=datetime.strptime(today, '%Y-%m-%d')	
		regis_length=abs((current_time-creation_date).days)
		if regis_length < 183:
			url_feature.append(-1)
		else:
			url_feature.append(1)
	except:
		url_feature.append(-1)
	
	#25th Feature:Dns Records
	"""
    dns_records = []
	for query_type in dns.rdatatype.RdataType:
		try:
			dns_records.extend(list(dns.resolver.resolve(domain, query_type)))
		except dns.exception.DNSException:
			continue
	if(dns_records==""):
		url_feature.append(-1)
	else:
		url_feature.append(1)
    """
	
	dns = 1
	try:
		d = whois.whois(domain)
	except:
		dns=-1
	if dns == -1:
		url_feature.append(-1)
	else:
		if regis_length / 365 <= 1:
			url_feature.append(-1)
		else:
			url_feature.append(1)
	

	#26th Feature:Check in alexa database
	
	try:
		rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find("REACH")['RANK']
		rank= int(rank)
		if (rank<100000):
			url_feature.append(1)
		else:
			url_feature.append(0)
	except TypeError:
		url_feature.append(-1)
	
    
	#27th Feature:Check Page rank
	
	rank_checker_response = requests.post("https://www.checkpagerank.net/index.php", {
		"name": domain
	})

	# Extracts global rank of the website
	try:
		global_rank = int(re.findall(r"Global Rank: ([0-9]+)", rank_checker_response.text)[0])
	except:
		global_rank = -1

	try:
		if global_rank > 0 and global_rank < 100000:
			url_feature.append(-1)
		else:
			url_feature.append(1)
	except:
		url_feature.append(1)
	
    
	#28th Feature: Google Index
	
	site=search(url, 5)
	if site:
		url_feature.append(1)
	else:
		url_feature.append(-1)
	
    
	#29th Feature:Links pointing to website
	
	if response == "":
		url_feature.append(-1)
	else:
		number_of_links = len(re.findall(r"<a href=", response.text))
		if number_of_links == 0:
			url_feature.append(-1)
		elif number_of_links <= 2:
			url_feature.append(0)
		else:
			url_feature.append(1)
	
    
	#30th Feature Statistical repor check
	
	url_match=re.search('at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|sweddy\.com|myjino\.ru|96\.lt|ow\.ly',url)
	try:
		ip_address=socket.gethostbyname(domain)
		ip_match=re.search('146\.112\.61\.108|213\.174\.157\.151|121\.50\.168\.88|192\.185\.217\.116|78\.46\.211\.158|181\.174\.165\.13|46\.242\.145\.103|121\.50\.168\.40|83\.125\.22\.219|46\.242\.145\.98|'
						   '107\.151\.148\.44|107\.151\.148\.107|64\.70\.19\.203|199\.184\.144\.27|107\.151\.148\.108|107\.151\.148\.109|119\.28\.52\.61|54\.83\.43\.69|52\.69\.166\.231|216\.58\.192\.225|'
						   '118\.184\.25\.86|67\.208\.74\.71|23\.253\.126\.58|104\.239\.157\.210|175\.126\.123\.219|141\.8\.224\.221|10\.10\.10\.10|43\.229\.108\.32|103\.232\.215\.140|69\.172\.201\.153|'
						   '216\.218\.185\.162|54\.225\.104\.146|103\.243\.24\.98|199\.59\.243\.120|31\.170\.160\.61|213\.19\.128\.77|62\.113\.226\.131|208\.100\.26\.234|195\.16\.127\.102|195\.16\.127\.157|'
						   '34\.196\.13\.28|103\.224\.212\.222|172\.217\.4\.225|54\.72\.9\.51|192\.64\.147\.141|198\.200\.56\.183|23\.253\.164\.103|52\.48\.191\.26|52\.214\.197\.72|87\.98\.255\.18|209\.99\.17\.27|'
						   '216\.38\.62\.18|104\.130\.124\.96|47\.89\.58\.141|78\.46\.211\.158|54\.86\.225\.156|54\.82\.156\.19|37\.157\.192\.102|204\.11\.56\.48|110\.34\.231\.42',ip_address)
		if url_match:
			url_feature.append(-1)
		elif ip_match:
			url_feature.append(-1)
		else:
			url_feature.append(1)
	except:
		print ('Connection problem. Please check your internet connection!')
      
	if(len(url_feature)<=29):
		url_feature.append(-1)
        
	print(url_feature)
	url_feature=np.reshape(url_feature,[1,-1])
	return url_feature    

In [2]:
import pickle
import numpy as np
filename = 'finalized_model.sav'
loaded_model=pickle.load(open(filename,'rb'))
def result(input):
	prediction=loaded_model.predict(input)
	if(prediction==1):
		return "Url is Legitimate"
	else:
		return "Url is Phishing"
		



In [None]:
from flask import Flask, render_template, request      

app = Flask(__name__)

@app.route("/")
def home():
    return render_template("index.html")
    
@app.route("/predict",methods=['POST','GET'])
def predict():
	if request.method=="POST":
		inp=request.form.get("link")
		ans=result(features(inp))
		return render_template("index.html",pred=ans)
    
if __name__ == "__main__":
    app.run(host='0.0.0.0',debug=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [02/May/2021 12:53:42] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/May/2021 12:53:42] "[37mGET /static/css/main.css HTTP/1.1[0m" 200 -


Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket


[2021-05-02 12:53:57,350] ERROR in app: Exception on /predict [POST]
Traceback (most recent call last):
  File "/Users/HP/opt/anaconda3/lib/python3.8/site-packages/urllib3/connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "/Users/HP/opt/anaconda3/lib/python3.8/site-packages/urllib3/util/connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/Users/HP/opt/anaconda3/lib/python3.8/socket.py", line 918, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/HP/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "/Users/HP/opt/anaconda3/lib/python3.8/site-packages/urllib