In [30]:
import os
import datetime
import math
import pandas as pd
import numpy as np
import whois
from pyquery import PyQuery
from requests import get

In [6]:
# Move from the notebook's directory to the URL dataset folder, one hop at a
# time (parent directory first, then down into FinalDataset/URL).
for _step in ('../', "FinalDataset/URL"):
    os.chdir(_step)

In [49]:
class UrlFeaturizer(object):
    """Extract lexical, WHOIS and page-content features from a single URL.

    Constructing an instance performs network I/O: a WHOIS query for the
    host and an HTTP GET of the URL. Any failure is recorded as "no data"
    (``None``) instead of being raised, so an instance can always be built
    and :meth:`run` always returns a complete feature dict.

    NOTE(review): ``whois.query`` (used in ``__init__``) and ``whois.whois``
    (used in ``perform_whois``) look like the APIs of two different packages
    that both import as ``whois``. Whichever one is not installed will raise
    and be treated as "no data" -- confirm which package is intended.

    Attributes:
        url: The URL exactly as passed in.
        domain: Authority part of the URL (host, plus port/userinfo if any).
        today: Naive local timestamp taken at construction time.
        whois: ``__dict__`` of the ``whois.query`` result, or ``None``.
        response: Response object returned by ``get``, or ``None``.
        pq: ``PyQuery`` document for the page, or ``None``.
    """

    # Seconds to wait for the HTTP request; without a timeout a single
    # unresponsive host would block the whole featurization loop.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        self.url = url
        self.domain = self._netloc(url)
        self.today = datetime.datetime.now()

        try:
            self.whois = whois.query(self.domain).__dict__
        except Exception:
            # Best effort: lookup failed or returned nothing usable.
            self.whois = None

        try:
            self.response = get(self.url, timeout=self.REQUEST_TIMEOUT)
        except Exception:
            self.response = None

        # Parsed separately so that a page that cannot be parsed does not
        # also discard an otherwise valid HTTP response.
        try:
            self.pq = PyQuery(self.url)
        except Exception:
            self.pq = None

    ## Internal helpers
    @staticmethod
    def _netloc(url):
        """Return the authority part of ``url`` (text between scheme and path)."""
        rest = url.strip()
        scheme, sep, tail = rest.partition('://')
        # Only treat '://' as the scheme separator when it precedes the path;
        # otherwise it belongs to a URL embedded in the path or query.
        if sep and not any(c in scheme for c in '/?#'):
            rest = tail
        elif rest.startswith('//'):
            rest = rest[2:]
        for delimiter in '/?#':
            rest = rest.split(delimiter, 1)[0]
        return rest

    @staticmethod
    def _signed_days(value, now, until=False):
        """Return whole days between ``value`` and ``now``, or ``None``.

        Args:
            value: A ``datetime`` or a list/tuple whose first item is one.
            now: Naive reference ``datetime``.
            until: If true return ``value - now``, otherwise ``now - value``.

        Returns:
            The ``.days`` of the difference, or ``None`` when ``value`` is
            missing, empty, or not a ``datetime``.
        """
        if isinstance(value, (list, tuple)):
            value = value[0] if value else None
        if not isinstance(value, datetime.datetime):
            return None
        if value.utcoffset() is not None:
            # Aware and naive datetimes cannot be subtracted; interpret the
            # naive reference as local time and convert it.
            now = now.astimezone(value.tzinfo)
        delta = (value - now) if until else (now - value)
        return delta.days

    def _whois_days(self, field, until=False):
        """Days for ``field`` of ``self.whois_result``; -1 when unavailable."""
        # perform_whois() may not have been called yet.
        result = getattr(self, 'whois_result', False)
        if result is None or result is False:
            return -1
        days = self._signed_days(
            getattr(result, field, None), datetime.datetime.now(), until
        )
        return -1 if days is None else days

    def _count(self, selector):
        """Number of elements matching ``selector``; 0 without a page."""
        if self.pq is None:
            return 0
        return sum(1 for _ in self.pq(selector).items())

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0 for a zero denominator."""
        return numerator / denominator if denominator else 0

    ## WHOIS features (based on the result stored by perform_whois)
    def perform_whois(self):
        """Run ``whois.whois`` on the URL and store it in ``self.whois_result``.

        Returns:
            The lookup result, or ``False`` if the lookup raised.
        """
        try:
            self.whois_result = whois.whois(self.url)
        except Exception:
            self.whois_result = False
        return self.whois_result

    def get_registered_date_in_days(self):
        """Days since ``creation_date``; -1 if unknown."""
        return self._whois_days('creation_date')

    def get_expiration_date_in_days(self):
        """Days until ``expiration_date`` (negative if passed); -1 if unknown."""
        return self._whois_days('expiration_date', until=True)

    def get_updated_date_in_days(self):
        """Days since ``updated_date``; -1 if unknown."""
        return self._whois_days('updated_date')

    ## URL string Features
    def entropy(self):
        """Shannon entropy, in bits per character, of the stripped URL."""
        string = self.url.strip()
        if not string:
            return 0.0
        total = len(string)
        counts = {}
        for ch in string:
            counts[ch] = counts.get(ch, 0) + 1
        # H = -sum(p * log2(p)); the minus sign keeps the result >= 0.
        return sum(-(c / total) * math.log2(c / total) for c in counts.values())

    def numDigits(self):
        """Number of digit characters in the URL."""
        return sum(1 for ch in self.url if ch.isdigit())

    def urlLength(self):
        """Length of the URL in characters."""
        return len(self.url)

    def numParameters(self):
        """Number of '&' separators in the URL."""
        return self.url.count('&')

    def numFragments(self):
        """Number of '#' characters in the URL."""
        return self.url.count('#')

    def numSubDomains(self):
        """Number of '/'-separated pieces after the scheme, minus one.

        NOTE(review): despite the name, this counts path separators rather
        than dots in the host name -- behaviour kept as-is; confirm intent.
        """
        pieces = self.url.split('http')[-1].split('//')[-1].split('/')
        return len(pieces) - 1

    def domainExtension(self):
        """Text after the last '.' in the URL, cut at the next '/'."""
        return self.url.split('.')[-1].split('/')[0]

    ## URL domain features
    def hasHttp(self):
        """True if 'http:' occurs anywhere in the URL."""
        return 'http:' in self.url

    def hasHttps(self):
        """True if 'https:' occurs anywhere in the URL."""
        return 'https:' in self.url

    def urlIsLive(self):
        """True if the HTTP request succeeded with status code 200."""
        return getattr(self.response, 'status_code', None) == 200

    def daysSinceRegistration(self):
        """Whole days since ``self.whois['creation_date']``; 0 if unknown."""
        if not self.whois:
            return 0
        days = self._signed_days(self.whois.get('creation_date'), self.today)
        return 0 if days is None else days

    def daysSinceExpiration(self):
        """Whole days until ``self.whois['expiration_date']``; 0 if unknown."""
        if not self.whois:
            return 0
        days = self._signed_days(
            self.whois.get('expiration_date'), self.today, until=True
        )
        return 0 if days is None else days

    ## URL Page Features
    def bodyLength(self):
        """Length of the page text; 0 without a page or if the URL is not live."""
        if self.pq is None or not self.urlIsLive():
            return 0
        return len(self.pq('html').text())

    def numTitles(self):
        """Number of heading elements (h1 through h6) on the page."""
        return sum(self._count('h{}'.format(level)) for level in range(1, 7))

    def numImages(self):
        """Number of <img> elements on the page."""
        return self._count('img')

    def numLinks(self):
        """Number of <a> elements on the page."""
        return self._count('a')

    def scriptLength(self):
        """Total length of the text inside <script> elements."""
        if self.pq is None:
            return 0
        return len(self.pq('script').text())

    def specialCharacters(self):
        """Number of characters in the page text that are neither digit nor letter."""
        if self.pq is None:
            return 0
        body_text = self.pq('html').text()
        return sum(1 for ch in body_text if not ch.isdigit() and not ch.isalpha())

    def scriptToSpecialCharsRatio(self):
        """scriptLength / specialCharacters; 0 when undefined."""
        if self.pq is None:
            return 0
        return self._ratio(self.scriptLength(), self.specialCharacters())

    def scriptTobodyRatio(self):
        """scriptLength / bodyLength; 0 when undefined."""
        if self.pq is None:
            return 0
        return self._ratio(self.scriptLength(), self.bodyLength())

    def bodyToSpecialCharRatio(self):
        """specialCharacters / bodyLength; 0 when undefined."""
        if self.pq is None:
            return 0
        return self._ratio(self.specialCharacters(), self.bodyLength())

    def ip(self):
        """Return 1 if the URL's host is a dotted-quad IPv4 address, else 0."""
        # Drop userinfo ("user@") and port (":8080") from the authority.
        host = self._netloc(self.url).split('@')[-1].split(':')[0]
        parts = host.split('.')
        is_ipv4 = len(parts) == 4 and all(
            # isascii() guards int() against non-ASCII digit characters.
            p.isascii() and p.isdigit() and int(p) <= 255 for p in parts
        )
        return 1 if is_ipv4 else 0

    def run(self):
        """Compute every feature and return them as a dict keyed by name."""
        data = {
            'entropy': self.entropy(),
            'numDigits': self.numDigits(),
            'urlLength': self.urlLength(),
            'numParams': self.numParameters(),
            'hasHttp': self.hasHttp(),
            'hasHttps': self.hasHttps(),
            'urlIsLive': self.urlIsLive(),
            'bodyLength': self.bodyLength(),
            'numTitles': self.numTitles(),
            'numImages': self.numImages(),
            'numLinks': self.numLinks(),
            'scriptLength': self.scriptLength(),
            'specialChars': self.specialCharacters(),
            'ext': self.domainExtension(),
            'dsr': self.daysSinceRegistration(),
            'dse': self.daysSinceExpiration(),
            'sscr': self.scriptToSpecialCharsRatio(),
            'sbr': self.scriptTobodyRatio(),
            'bscr': self.bodyToSpecialCharRatio(),
            'num_%20': self.url.count("%20"),
            'has_ip': self.ip(),
        }
        # The whois_* features read the result stored by perform_whois().
        self.perform_whois()
        data['whois_regDate'] = self.get_registered_date_in_days()
        data['whois_expDate'] = self.get_expiration_date_in_days()
        data['whois_updatedDate'] = self.get_updated_date_in_days()
        return data

In [15]:
# Names of all entries in the current working directory.
l = os.listdir('.')

In [16]:
# Show the directory listing collected above.
print(l)

['Benign_list_big_final.csv', '._Benign_list_big_final.csv', 'DefacementSitesURLFiltered.csv', '._DefacementSitesURLFiltered.csv', 'Malware_dataset.csv', '._Malware_dataset.csv', 'phishing_dataset.csv', '._phishing_dataset.csv', 'spam_dataset.csv', '._spam_dataset.csv', 'URL_feature.csv']


In [50]:
# Dataset files to featurize. The "._*" entries and URL_feature.csv that
# appear in the directory listing are left out.
l = [
    'Benign_list_big_final.csv',
    'DefacementSitesURLFiltered.csv',
    'Malware_dataset.csv',
    'phishing_dataset.csv',
    'spam_dataset.csv',
]

In [None]:
# Build one feature row per URL across every dataset file in `l`.
# Featurizing the empty URL is only used to obtain the feature names, so the
# table has stable columns even if no URL is processed.
emp = UrlFeaturizer("").run().keys()
t = []
for j in l:
    # The CSVs have no header row; every cell is treated as a URL.
    d = pd.read_csv(j, header=None)
    dd = d.to_numpy().flatten()
    for i in dd:
        # Empty cells come back as NaN (float) and cannot be featurized.
        if not isinstance(i, str):
            continue
        temp = UrlFeaturizer(i).run()
        # Tag each row with its source dataset (file name up to the first dot).
        temp["File"] = j.split(".")[0]
        t.append(temp)
# DataFrame.append was removed in pandas 2.0; build the frame in one step.
A = pd.DataFrame(t, columns=[*emp, "File"])

Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket
Error trying to connect to socket: closing socket


In [None]:
# Preview the first rows of the feature table.
A.head()

In [124]:
# Write the feature table to URL_feature.csv in the current working directory.
A.to_csv("URL_feature.csv")
    