In [None]:
!pip install tld

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import re
import os
import urllib.parse
import tld
from tld import get_tld
import time
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Colab-only step: upload input file(s) into the runtime through google.colab's
# upload helper (presumably the mendeley_test.csv read in the next cell).
# `uploaded` keeps whatever files.upload() returns.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the labelled URL dataset from the working directory and preview it.
# NOTE(review): the preview shows an "Unnamed: 0" column, i.e. the CSV already
# contains a saved index; consider index_col=0 if that column is not wanted.
data = pd.read_csv(r"mendeley_test.csv")
data.head()

Unnamed: 0,urls,label
0,http://www.dutchthewiz.com/freeware/,0
1,http://www.collectiblejewels.com,0
2,http://www.deadlinedata.com,0
3,http://www.mil.fi/maavoimat/kalustoesittely/00...,0
4,http://www.avclub.com/content/node/24539,0


In [None]:
# Series of raw URL strings; the feature-extraction loop iterates over it.
urls = data["urls"]

In [None]:
# Display the URL series as a sanity check (pandas truncates the long output).
urls

0                    http://www.dutchthewiz.com/freeware/
1                        http://www.collectiblejewels.com
2                             http://www.deadlinedata.com
3       http://www.mil.fi/maavoimat/kalustoesittely/00...
4                http://www.avclub.com/content/node/24539
                              ...                        
9995                http://www.freeadult2000.com/nonnude/
9996               http://www.malestrippersunlimited.com/
9997    http://www.troppobelle.net/disegni_a_matita/di...
9998    http://www.mature-nude-woman.com/older-women-p...
9999                http://www.anzwers.net/hot/maryblond/
Name: urls, Length: 10000, dtype: object

## **Hostname-based features**

In [None]:
#Get hostname of a URL
def get_hostname(url):
  """Return the hostname component of ``url``.

  The value comes from ``urllib.parse.urlsplit(url).hostname``, which is
  lower-cased and excludes any user-info and port.

  Args:
    url: URL string to parse.

  Returns:
    The hostname, or an empty string when the URL has no network location
    (for example a URL written without a scheme). An empty string is returned
    instead of ``None`` so the string-based feature helpers can be applied to
    the result without raising ``TypeError``.
  """
  hostname = urllib.parse.urlsplit(url).hostname
  return hostname if hostname is not None else ""

In [None]:
#Digits count in hostname:
def digitcount_hostname(hname):
  """Count the ASCII digit characters (0-9) that occur in ``hname``."""
  return sum(1 for _ in re.finditer("[0-9]", hname))

In [None]:
#@ in hostname
def Checkatrate_hostname(hname):
  """Return 1 when ``hname`` contains an "@" character, otherwise 0."""
  return 0 if re.search("@", hname) is None else 1

In [None]:
#average word length in hostname
def avgWordLenHN(hname):
  """Return the mean length of the words in the hostname ``hname``.

  A word is a maximal run of ``\\w`` characters. Only real words are counted:
  splitting on ``\\W+`` yields empty strings when the text starts or ends with
  a separator (e.g. a trailing dot), and counting those as words would pull
  the average down.

  Args:
    hname: Hostname string.

  Returns:
    The average word length as a float, or 0.0 when there are no words.
  """
  words = re.findall(r"\w+", hname)
  if not words:
    return 0.0
  return sum(len(word) for word in words) / len(words)

In [None]:
#Longest Word length in hostname
def longWordLenHN(hname):
  """Return the length of the longest word in ``hname``.

  Words are the pieces obtained by splitting on runs of non-word characters;
  the result is 0 when no piece contains any character.
  """
  return max(len(token) for token in re.split(r"\W+", hname))

In [None]:
#hostname length
def hnlength(hname):
  """Return the number of characters in the hostname ``hname``."""
  char_count = len(hname)
  return char_count

In [None]:
#Presence of phish word
def checkPhishWordHost(hname):
  """Return 1 if the hostname contains any phishing-hinted keyword, else 0.

  Matching is by substring and is case-insensitive. The comparison has to
  ignore case because hostnames produced by ``urlsplit(...).hostname`` are
  lower-cased, so capitalised keywords such as "Admin" or "Dropbox" could
  never match under an exact-case comparison.

  Args:
    hname: Hostname string.

  Returns:
    1 when at least one keyword occurs in ``hname``, otherwise 0.
  """
  phish_words = ("wp", "login", "includes", "admin", "content", "site",
                 "images", "js", "alibaba", "css", "myaccount", "dropbox",
                 "themes", "plugins", "signin", "view")
  lowered = hname.lower()
  return int(any(word in lowered for word in phish_words))

In [None]:
#Get domain name
def getDomain(url):
  """Return the network-location (``netloc``) part of ``url``.

  This is the raw ``netloc`` from ``urllib.parse.urlsplit`` and is an empty
  string when the URL has none.
  """
  return urllib.parse.urlsplit(url).netloc

In [None]:
#Length of domain
def domainLength(domain):
  """Return the number of characters in ``domain``."""
  size = len(domain)
  return size

In [None]:
#Digits count in domain name
def digitcount_domain(domain):
  """Count the ASCII digit characters (0-9) that occur in ``domain``."""
  return sum(1 for _ in re.finditer("[0-9]", domain))

In [None]:
#Number of dots in the hostname
def numberDotsHN(hname):
  """Return how many "." characters occur in the hostname ``hname``.

  Uses ``str.count`` instead of a regex: the previous pattern was written as
  a non-raw literal containing the invalid escape sequence ``\\.``, which
  newer Python versions report with a warning.
  """
  return hname.count(".")

In [None]:
#Number of hyphens in the hostname
def numberHypensHN(hname):
  """Return how many "-" characters occur in the hostname ``hname``.

  Uses ``str.count`` instead of a regex: the previous pattern was written as
  a non-raw literal containing the invalid escape sequence ``\\-``, which
  newer Python versions report with a warning.
  """
  return hname.count("-")

In [None]:
#Number of underscores in the hostname
def numberUnderscoreHN(hname):
  """Return how many "_" characters occur in the hostname ``hname``.

  Uses ``str.count`` instead of a regex: the previous pattern was written as
  a non-raw literal containing the invalid escape sequence ``\\_``, which
  newer Python versions report with a warning.
  """
  return hname.count("_")

In [None]:
#Validate TLD
def validateTLD(hname):
  """Return 1 if ``get_tld`` recognises a top-level domain in ``hname``, else 0.

  ``hname`` is a bare hostname without a scheme. ``get_tld`` rejects
  scheme-less input unless ``fix_protocol=True`` is given; without that flag
  every hostname raised and was reported as 0 by the exception handler.

  Args:
    hname: Hostname string (no scheme).

  Returns:
    1 when a TLD is found, otherwise 0.
  """
  try:
    # fail_silently=True makes get_tld return None rather than raise when the
    # domain is unknown or cannot be parsed.
    found_tld = get_tld(hname, fix_protocol=True, fail_silently=True)
  except Exception:
    # Best effort: input get_tld cannot process at all counts as "no TLD".
    return 0
  return 1 if found_tld is not None else 0

In [None]:
#Presence of underscores in the hostname
def presenceUnderscore(hname):
  """Return 1 if the hostname ``hname`` contains an underscore, otherwise 0.

  Uses a plain membership test instead of a regex: the previous pattern was a
  non-raw literal containing the invalid escape sequence ``\\_``, which newer
  Python versions report with a warning.
  """
  return int("_" in hname)

## **Full-URL Based Features**

In [None]:
#Presence of @ in URL
def presenceAtrate(url):
  """Return 1 when the full URL contains an "@" character, otherwise 0."""
  return int(re.search("@", url) is not None)

In [None]:
#Get path in url
def getPathURL(url):
  """Return the path component of ``url`` as parsed by ``urlsplit``."""
  return urllib.parse.urlsplit(url).path

In [None]:
#Digits count in the path
def digitcount_url(path):
  """Count the ASCII digit characters (0-9) that occur in ``path``."""
  return sum(1 for _ in re.finditer("[0-9]", path))

In [None]:
#Average word length in the path
def avgWordLenPath(path):
  """Return the mean length of the words in the URL path ``path``.

  A word is a maximal run of ``\\w`` characters. Only real words are counted:
  a path normally starts with "/", so splitting on ``\\W+`` always produced a
  leading empty string (and a trailing one for paths ending in "/"), and
  counting those as words understated the average.

  Args:
    path: Path component of a URL.

  Returns:
    The average word length as a float, or 0.0 when the path has no words.
  """
  words = re.findall(r"\w+", path)
  if not words:
    return 0.0
  return sum(len(word) for word in words) / len(words)

In [None]:
#Longest word length in the path
def longWordLenPath(path):
  """Return the length of the longest word in the URL path ``path``.

  Words are the pieces obtained by splitting on runs of non-word characters;
  the result is 0 when no piece contains any character.
  """
  return max(map(len, re.split(r"\W+", path)))

In [None]:
#Getting baseurl
def baseURL(url):
  """Return the "directory" part of ``url``: everything before its last "/".

  The URL is treated like a filesystem path and the head of ``os.path.split``
  is returned, which is what ``os.path.dirname`` yields.
  """
  head, _tail = os.path.split(url)
  return head

In [None]:
#BaseURL length
def baseURLlen(burl):
  """Return the number of characters in the base URL ``burl``."""
  length = len(burl)
  return length

In [None]:
#Presence of Phish-Hinted words in the path
def CheckPhishWordPath(path):
  """Return 1 if the URL path contains any phishing-hinted keyword, else 0.

  Matching is by substring and is case-insensitive, so "/Login/", "/login/"
  and "/LOGIN/" are all detected. An exact-case comparison only matched the
  one spelling stored in the keyword list.

  Args:
    path: Path component of a URL.

  Returns:
    1 when at least one keyword occurs in ``path``, otherwise 0.
  """
  phish_words = ("wp", "login", "includes", "admin", "content", "site",
                 "images", "js", "alibaba", "css", "myaccount", "dropbox",
                 "themes", "plugins", "signin", "view")
  lowered = path.lower()
  return int(any(word in lowered for word in phish_words))

In [None]:
#Question mark count in the URL
def questionCount(url):
  """Return how many "?" characters occur in the full URL.

  Uses ``str.count`` instead of a regex: the previous pattern was written as
  a non-raw literal containing the invalid escape sequence ``\\?``, which
  newer Python versions report with a warning.
  """
  return url.count("?")

In [None]:
#Slash count in the path
def slashCountPath(path):
  """Return how many "/" characters occur in the URL path ``path``.

  Uses ``str.count`` instead of a regex: the previous pattern was written as
  a non-raw literal containing the invalid escape sequence ``\\/``, which
  newer Python versions report with a warning.
  """
  return path.count("/")

In [None]:
#HTTPS count in the path
def httpsCountPath(path):
  """Count the non-overlapping occurrences of "https" in ``path``."""
  return sum(1 for _ in re.finditer("https", path))

In [None]:
#Presence of $ in the base URL
def presentDollar(burl):
  """Return 1 if the base URL ``burl`` contains a "$" character, otherwise 0.

  Uses a plain membership test instead of a regex: the previous pattern was a
  non-raw literal containing the invalid escape sequence ``\\$``, which newer
  Python versions report with a warning.
  """
  return int("$" in burl)

In [None]:
# Presence of comma in the base URL
def presentComma(burl):
  """Return 1 if the base URL ``burl`` contains a comma, otherwise 0.

  Uses a plain membership test instead of a regex: the previous pattern was a
  non-raw literal containing the invalid escape sequence ``\\,``, which newer
  Python versions report with a warning.
  """
  return int("," in burl)

In [None]:
#Presence of * in the base URL
def presentAsterisk(burl):
  """Return 1 if the base URL ``burl`` contains a "*" character, otherwise 0.

  Uses a plain membership test instead of a regex: the previous pattern was a
  non-raw literal containing the escape ``\\*``, which Python does not
  recognise as a string escape and newer versions report with a warning.
  """
  return int("*" in burl)

In [None]:
# Presence of the OR symbol (|) in the base URL
def presentOR(burl):
  """Return 1 if the base URL ``burl`` contains a "|" character, otherwise 0.

  Uses a plain membership test instead of a regex: the previous pattern was a
  non-raw literal containing the escape ``\\|``, which Python does not
  recognise as a string escape and newer versions report with a warning.
  """
  return int("|" in burl)

In [None]:
#Presence of semicolon in the base URL
def presentSemiColon(burl):
  """Return 1 if the base URL ``burl`` contains a semicolon, otherwise 0.

  Uses a plain membership test instead of a regex: the previous pattern was a
  non-raw literal containing the invalid escape sequence ``\\;``, which newer
  Python versions report with a warning.
  """
  return int(";" in burl)

In [None]:
#Presence of white space in the base URL
def presentWhiteSpace(burl):
  """Return 1 if the base URL ``burl`` contains a space character, else 0.

  Only the plain space " " is checked, not tabs or other whitespace.
  """
  return 1 if re.search(" ", burl) else 0

In [None]:
#Presence of HTTPS in the base URL
def presentHttps(burl):
  """Return 1 if the substring "https" occurs anywhere in ``burl``, else 0."""
  return int(re.search("https", burl) is not None)


In [None]:
#Tokens in the path
def tokensPath(path):
  """Count selected delimiter characters in the URL path ``path``.

  Uses ``str.count`` instead of regexes: the previous patterns were non-raw
  literals with invalid escape sequences (``\\.``, ``\\,``, ``\\$`` and so
  on), which newer Python versions report with a warning.

  Args:
    path: Path component of a URL.

  Returns:
    A list of seven counts, in this order: dots, commas, dollar signs,
    semicolons, colons, slashes, underscores.
  """
  delimiters = (".", ",", "$", ";", ":", "/", "_")
  return [path.count(symbol) for symbol in delimiters]

In [None]:
# df = pd.DataFrame(columns=["digitsHostname", "PresentAtrateHost", "AvgWordLenHost", "LongWordLenHost", "HostNameLen", "domainLen", "digitsDomain", "dotsHost", "HypensHost", "UnderScoreHost", "hasTLD", "hasUnderScore", "hasAtRate", "digitsCountPath", "AvgWordLenPath", "LongWordLenPath", "baseurlLen", "hasPhishPath", "hasQuestionMark", "hasSlashPath", "httpsCountpath", "hasDollarBase", "hasCommaBase", "hasAsteriskBase", "hasOrBase", "hasSemiColonBase", "hasWhiteSpaceBase", "hasHttpsBase", "hasDotsPath", "hascommaPath", "hasDollarPath", "hasSemiColonPath", "hasColonPath", "hasSlashCount", "hasUnderScorePath"])
# print(df)

In [None]:
# Work on an independent copy: a plain `df = data` only creates a second name
# for the same object, so adding feature columns would also alter `data`.
df = data.copy()

In [None]:
#Main Function



def _lexical_features(url):
  """Compute every hostname- and URL-based lexical feature for one URL.

  Args:
    url: Raw URL string.

  Returns:
    A dict mapping output column name to feature value. The key order is the
    column order added to ``df``.
  """
  # Host name based inputs. get_hostname may yield None for a URL without a
  # network location; fall back to "" so the string helpers do not raise.
  hostname = get_hostname(url) or ""
  domainname = getDomain(url)

  # Full-URL based inputs.
  path = getPathURL(url)
  baseurl = baseURL(url)
  (dots_path, comma_path, dollar_path, semicolon_path,
   colon_path, slash_count, underscore_path) = tokensPath(path)

  # NOTE(review): checkPhishWordHost is defined in this notebook but was never
  # written to an output column; it is left out here to keep the schema as is.
  return {
    # Host name based features
    "digitsHostname": digitcount_hostname(hostname),
    "PresentAtrateHost": Checkatrate_hostname(hostname),
    "AvgWordLenHost": avgWordLenHN(hostname),
    "LongWordLenHost": longWordLenHN(hostname),
    "HostNameLen": hnlength(hostname),
    "domainLen": domainLength(domainname),
    "digitsDomain": digitcount_domain(domainname),
    "dotsHost": numberDotsHN(hostname),
    "HypensHost": numberHypensHN(hostname),
    "UnderScoreHost": numberUnderscoreHN(hostname),
    "hasTLD": validateTLD(hostname),
    "hasUnderScore": presenceUnderscore(hostname),
    # Full URL based features
    "hasAtRate": presenceAtrate(url),
    "digitsCountPath": digitcount_url(path),
    "AvgWordLenPath": avgWordLenPath(path),
    "LongWordLenPath": longWordLenPath(path),
    "baseurlLen": baseURLlen(baseurl),
    "hasPhishPath": CheckPhishWordPath(path),
    "hasQuestionMark": questionCount(url),
    "hasSlashPath": slashCountPath(path),
    "httpsCountpath": httpsCountPath(path),
    "hasDollarBase": presentDollar(baseurl),
    "hasCommaBase": presentComma(baseurl),
    "hasAsteriskBase": presentAsterisk(baseurl),
    "hasOrBase": presentOR(baseurl),
    "hasSemiColonBase": presentSemiColon(baseurl),
    "hasWhiteSpaceBase": presentWhiteSpace(baseurl),
    "hasHttpsBase": presentHttps(baseurl),
    "hasDotsPath": dots_path,
    "hascommaPath": comma_path,
    "hasDollarPath": dollar_path,
    "hasSemiColonPath": semicolon_path,
    "hasColonPath": colon_path,
    "hasSlashCount": slash_count,
    "hasUnderScorePath": underscore_path,
  }


start = time.time()

# Compute all rows first and attach each column with a single assignment.
# Writing cell by cell through df[col][index] is chained assignment: it is
# slow, and it does not reliably write back to df (under pandas Copy-on-Write
# the write goes to a temporary object and df is left unchanged).
feature_rows = [_lexical_features(url) for url in urls]

# Reuse the index of `urls` so rows line up with df whatever its index labels
# are, instead of assuming labels 0..n-1.
features = pd.DataFrame(feature_rows, index=urls.index)
for column in features.columns:
  df[column] = features[column]

end = time.time()
totalTime = end - start

print("Time taken to execute this program is: ", totalTime)


Time taken to execute this program is:  136.5597722530365


In [None]:
# Write the dataset plus the lexical feature columns to the working directory.
# NOTE(review): to_csv also writes the index by default, and the loaded data
# already shows an "Unnamed: 0" column; confirm both are wanted in the output.
df.to_csv(r"mendeley_test_lexical.csv")