In [50]:
import os
import datetime
import math
import pandas as pd
import numpy as np

In [9]:
os.getcwd()

'C:\\Users\\Anirudh Bhaskar\\url_classification_dl\\FinalDataset\\URL'

In [45]:
os.chdir('../')
os.chdir("FinalDataset\\URL")


In [46]:
os.getcwd()

'C:\\Users\\Anirudh Bhaskar\\url_classification_dl\\FinalDataset\\URL'

In [28]:
class UrlFeaturizer(object):
    def __init__(self, url):
        self.url = url
        self.domain = url.split('//')[-1].split('/')[0]
        self.today = datetime.datetime.now()

        try:
            self.whois = whois.query(self.domain).__dict__
        except:
            self.whois = None

        try:
            self.response = get(self.url)
            self.pq = PyQuery(self.response.text)
        except:
            self.response = None
            self.pq = None

    ## URL string Features
    def entropy(self):
        string = self.url.strip()
        prob = [float(string.count(c)) / len(string) for c in dict.fromkeys(list(string))]
        entropy = sum([(p * math.log(p) / math.log(2.0)) for p in prob])
        return entropy

    def numDigits(self):
        digits = [i for i in self.url if i.isdigit()]
        return len(digits)

    def urlLength(self):
        return len(self.url)

    def numParameters(self):
        params = self.url.split('&')
        return len(params) - 1

    def numFragments(self):
        fragments = self.url.split('#')
        return len(fragments) - 1

    def numSubDomains(self):
        subdomains = self.url.split('http')[-1].split('//')[-1].split('/')
        return len(subdomains)-1

    def domainExtension(self):
        ext = self.url.split('.')[-1].split('/')[0]
        return ext

    ## URL domain features
    def hasHttp(self):
        return 'http:' in self.url

    def hasHttps(self):
        return 'https:' in self.url

    def urlIsLive(self):
        return self.response == 200

    def daysSinceRegistration(self):
        if self.whois and self.whois['creation_date']:
            diff = self.today - self.whois['creation_date']
            diff = str(diff).split(' days')[0]
            return diff
        else:
            return 0

    def daysSinceExpiration(self):
        if self.whois and self.whois['expiration_date']:
            diff = self.whois['expiration_date'] - self.today
            diff = str(diff).split(' days')[0]
            return diff
        else:
            return 0

    ## URL Page Features
    def bodyLength(self):
        if self.pq is not None:
            return len(self.pq('html').text()) if self.urlIsLive else 0
        else:
            return 0

    def numTitles(self):
        if self.pq is not None:
            titles = ['h{}'.format(i) for i in range(7)]
            titles = [self.pq(i).items() for i in titles]
            return len([item for s in titles for item in s])
        else:
            return 0

    def numImages(self):
        if self.pq is not None:
            return len([i for i in self.pq('img').items()])
        else:
            return 0

    def numLinks(self):
        if self.pq is not None:
            return len([i for i in self.pq('a').items()])
        else:
            return 0

    def scriptLength(self):
        if self.pq is not None:
            return len(self.pq('script').text())
        else:
            return 0

    def specialCharacters(self):
        if self.pq is not None:
            bodyText = self.pq('html').text()
            schars = [i for i in bodyText if not i.isdigit() and not i.isalpha()]
            return len(schars)
        else:
            return 0

    def scriptToSpecialCharsRatio(self):
        if self.pq is not None:
            sscr = self.scriptLength()/self.specialCharacters
        else:
            sscr = 0
        return sscr

    def scriptTobodyRatio(self):
        if self.pq is not None:
            sbr = self.scriptLength()/self.bodyLength
        else:
            sbr = 0
        return sbr

    def bodyToSpecialCharRatio(self):
        if self.pq is not None:
            bscr = self.specialCharacters()/self.bodyLength
        else:
            bscr = 0
        return bscr

    def run(self):
        data = {}
        data['entropy'] = self.entropy()
        data['numDigits'] = self.numDigits()
        data['urlLength'] = self.urlLength()
        data['numParams'] = self.numParameters()
        data['hasHttp'] = self.hasHttp()
        data['hasHttps'] = self.hasHttps()
        data['urlIsLive'] = self.urlIsLive()
        data['bodyLength'] = self.bodyLength()
        data['numTitles'] = self.numTitles()
        data['numImages'] = self.numImages()
        data['numLinks'] = self.numLinks()
        data['scriptLength'] = self.scriptLength()
        data['specialChars'] = self.specialCharacters()
        data['ext'] = self.domainExtension()
        data['dsr'] = self.daysSinceRegistration()
        data['dse'] = self.daysSinceExpiration()
        data['sscr'] = self.scriptToSpecialCharsRatio()
        data['sbr'] = self.scriptTobodyRatio()
        data['bscr'] = self.bodyToSpecialCharRatio()
        return data

In [40]:
def mergeDict(dict1, dict2):

    dict3 = {**dict1, **dict2}
    for key, value in dict3.items():
        if key in dict1 and key in dict2:
            dict3[key] = [value , dict1[key]]
    return dict3

In [47]:
l =os.listdir()

In [48]:
print(l)

['Benign_list_big_final.csv', 'DefacementSitesURLFiltered.csv', 'Malware_dataset.csv', 'phishing_dataset.csv', 'spam_dataset.csv']


In [120]:
emp =UrlFeaturizer("").run().keys()
A = pd.DataFrame(columns =emp)
t=[]
for j in l:

    d=pd.read_csv(j,header=None)
    dd=d.to_numpy().flatten()
  
    for i in dd:
        #print(type(i))
        temp=UrlFeaturizer(i).run()
        temp["File"]=j.split(".")[0]
        t.append(temp)
A=A.append(t)
    


In [123]:
A.head()

Unnamed: 0,File,bodyLength,bscr,dse,dsr,entropy,ext,hasHttp,hasHttps,numDigits,numImages,numLinks,numParams,numTitles,sbr,scriptLength,specialChars,sscr,urlIsLive,urlLength
0,Benign_list_big_final,0,0,0,0,-4.876201,to,True,False,18,0,0,0,0,0,0,0,0,False,83
1,Benign_list_big_final,0,0,0,0,-4.9207,to,True,False,23,0,0,0,0,0,0,0,0,False,83
2,Benign_list_big_final,0,0,0,0,-4.82163,to,True,False,22,0,0,0,0,0,0,0,0,False,83
3,Benign_list_big_final,0,0,0,0,-4.88904,to,True,False,18,0,0,0,0,0,0,0,0,False,83
4,Benign_list_big_final,0,0,0,0,-4.772234,to,True,False,18,0,0,0,0,0,0,0,0,False,83


In [124]:
A.to_csv("URL_feature.csv")
    