# Testing Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from re import compile
from urllib.parse import urlparse
from socket import gethostbyname
from datetime import datetime, timezone
import math

In [3]:
class UrlFeaturizer(object):
    def __init__(self, url):
        self.url = url
        self.domain = url.split('//')[-1].split('/')[0]
        self.urlparse = urlparse(url)

    def entropy(self):
        string = self.url.lower().strip()
        prob = [float(string.count(c)) / len(string) for c in dict.fromkeys(list(string))]
        entropy = sum([(p * math.log(p) / math.log(2.0)) for p in prob])
        return entropy

    def ip(self):
        string = self.url
        flag = False
        if ("." in string):
            elements_array = string.strip().split(".")
            if(len(elements_array) == 4):
                for i in elements_array:
                    if (i.isnumeric() and int(i)>=0 and int(i)<=255):
                        flag=True
                    else:
                        flag=False
                        break
        if flag:
            return 1 
        else:
            return 0

    def numDigits(self):
        digits = [i for i in self.url if i.isdigit()]
        return len(digits)

    def urlLength(self):
        return len(self.url)

    def numParameters(self):
        params = self.url.split('&')
        return len(params) - 1

    def numFragments(self):
        fragments = self.url.split('#')
        return len(fragments) - 1

    def numSubDomains(self):
        subdomains = self.url.split('http')[-1].split('//')[-1].split('/')
        return len(subdomains)-1

    def domainExtension(self):
        ext = self.url.split('.')[-1].split('/')[0]
        return ext

    def hasHttp(self):
        return 'http:' in self.url

    def hasHttps(self):
        return 'https:' in self.url

    def url_host_is_ip(self):
        host = self.urlparse.netloc
        pattern = compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
        match = pattern.match(host)
        return match is not None

    def get_ip(self):
        try:
            ip = self.urlparse.netloc if self.url_host_is_ip() else gethostbyname(self.urlparse.netloc)
            return ip
        except:
            return None

    def url_path_length(self):
        return len(self.urlparse.path)

    def url_host_length(self):
        return len(self.urlparse.netloc)

    def url_has_port_in_string(self):
        has_port = self.urlparse.netloc.split(':')
        return len(has_port) > 1 and has_port[-1].isdigit()

    def is_encoded(self):
        return '%' in self.url.lower()

    def num_encoded_char(self):
        encs = [i for i in self.url if i == '%']
        return len(encs)

    def number_of_subdirectories(self):
        d = self.urlparse.path.split('/')
        return len(d)

    def number_of_periods(self):
        periods = [i for i in self.url if i == '.']
        return len(periods)

    def has_client_in_string(self):
        return 'client' in self.url.lower()

    def has_admin_in_string(self):
        return 'admin' in self.url.lower()

    def has_server_in_string(self):
        return 'server' in self.url.lower()

    def has_login_in_string(self):
        return 'login' in self.url.lower()
        
    def get_tld(self):
        #top-level domain
        return self.urlparse.netloc.split('.')[-1].split(':')[0]

    def count_arate(self):
        arates = [i for i in self.url if i == '@']
        return len(arates)

    def count_asterisk(self):
        asterisks = [i for i in self.url if i == '*']
        return len(asterisks)

    def count_questionmark(self):
        questionmarks = [i for i in self.url if i == '?']
        return len(questionmarks)

    def count_plus(self):
        plus = [i for i in self.url if i == '+']
        return len(plus)

    def count_exclamation(self):
        exclamation = [i for i in self.url if i == '!']
        return len(exclamation)

    def count_hyphen(self):
        hyphen = [i for i in self.url if i == '-']
        return len(hyphen)

    def count_equal(self):
        equals = [i for i in self.url if i == '=']
        return len(equals)

    def count_tilted(self):
        tilted = [i for i in self.url if i == '~']
        return len(tilted)

    def run(self):
        data = {}
#         data['url'] = self.url
        data['entropy'] = self.entropy()
        data['numDigits'] = self.numDigits()
        data['urlLength'] = self.urlLength()
        data['numParams'] = self.numParameters()
        data['hasHttp'] = self.hasHttp()
        data['hasHttps'] = self.hasHttps()
#         data['ext'] = self.domainExtension()
        data['num_%20'] = self.url.count("%20")
        data['num_@'] = self.url.count("@")
        data['has_ip'] = self.ip()
        data['path_length'] = self.url_path_length()
        data['host_length'] = self.url_host_length()
        data['has_port'] = self.url_has_port_in_string()
        data['is_encoded'] = self.is_encoded()
        data['num_encoded_char'] = self.num_encoded_char()
        data['number_of_subdirectories'] = self.number_of_subdirectories()
        data['number_of_periods'] = self.number_of_periods()
        data['has_client_in_string'] = self.has_client_in_string()
        data['has_admin_in_string'] = self.has_admin_in_string()
        data['has_server_in_string'] = self.has_server_in_string()
        data['has_login_in_string'] = self.has_login_in_string()
#         data['tld'] = self.get_tld()
        data['count_arate'] = self.count_arate()
        data['count_asterisk'] = self.count_asterisk()
        data['count_questionmark'] = self.count_questionmark()
        data['count_plus'] = self.count_plus()
        data['count_exclamation'] = self.count_exclamation()
        data['count_hyphen'] = self.count_hyphen()
        data['count_equal'] = self.count_equal()
        data['count_tilted'] = self.count_tilted()
    
        return data

In [4]:
from tensorflow.keras.utils import pad_sequences
from keras.preprocessing.text import one_hot
from tensorflow.keras.models import load_model

In [5]:
def get_padded_url(url, VOCAB_LENGTH = 464223, length_long_sentence = 373):
    one_hot_url = one_hot(url, VOCAB_LENGTH)
    padded_url = pad_sequences([one_hot_url], length_long_sentence, padding='post')
    return padded_url.reshape(1, length_long_sentence)

In [19]:
model = load_model("../Models/url_clsf_word_embed/")

In [23]:
df_log = pd.read_csv("../Datasets/Log_wireshark.csv")

In [24]:
df_log.head()

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,57,10.737953,DESKTOP-EB9QDOJ.local,4-c-0003.c-msedge.net,TCP,66,62927 > 80 [SYN] Seq=0 Win=64240 Len=0 MSS=1...
1,307,15.810955,DESKTOP-EB9QDOJ.local,dns.google,TCP,66,62928 > 53 [SYN] Seq=0 Win=64240 Len=0 MSS=1...
2,308,15.811716,DESKTOP-EB9QDOJ.local,dns.google,TCP,66,62929 > 53 [SYN] Seq=0 Win=64240 Len=0 MSS=1...
3,309,15.813638,DESKTOP-EB9QDOJ.local,dns.google,TCP,66,62930 > 53 [SYN] Seq=0 Win=64240 Len=0 MSS=1...
4,310,15.814338,DESKTOP-EB9QDOJ.local,dns.google,TCP,66,62931 > 53 [SYN] Seq=0 Win=64240 Len=0 MSS=1...


In [25]:
df_log.shape

(255, 7)

In [20]:
labels =['benign','defacement', 'malware', 'phishing']

In [8]:
df = pd.read_csv("../Datasets/malicious_phish.csv")

In [9]:
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [10]:
from tqdm import tqdm

In [24]:
preds = []
for row in tqdm(df[1000:2000].iterrows()):
    url = row[1]['url']
#     feats = np.array(list(UrlFeaturizer(url).run().values())).reshape(1, 28)
    pad_url = get_padded_url(url)
    out = model.predict(x=pad_url, verbose=0)
    prediction = labels[np.argmax(out)]
    if(prediction != 'benign'):
        preds.append([url, prediction])

1000it [00:53, 18.65it/s]


In [25]:
preds

[['peregrine.igg.biz', 'phishing'],
 ['http://www.westwoodchurch.co.uk/our-services/sunday-celebration/celebration-talks/233-17-06-2012-ruth-ruth-1-simon-lloyd',
  'phishing'],
 ['http://torcache.net/torrent/9154FAAD712957FD02C9507EB9A583DF724A7683.torrent?title=[kickass.to]grand.theft.auto.v.reloaded',
  'phishing'],
 ['gojukarate.biz', 'phishing'],
 ['canadagazette.gc.ca/archives/p1/2002/2002-01-19/html/parliament-parlement-eng.html',
  'phishing'],
 ['smartlanka.net/ACT/', 'phishing'],
 ['waatp.nl/people/dimitri-taes/', 'phishing'],
 ['krekkahotrikoa.altervista.org', 'phishing']]