In [1]:
import re
import csv
import os
import math
import pandas as pd
import urllib.parse
from __future__ import division
from collections import Counter
from tld import get_tld

In [2]:
def urlparse(address):
    """Parse ``address`` with ``urllib.parse.urlparse``, defaulting to ``http://``.

    If ``address`` does not start with a ``<scheme>://`` prefix, ``http://`` is
    prepended first so the host ends up in ``netloc`` instead of ``path``.

    Args:
        address: URL string, with or without a scheme.

    Returns:
        The ``urllib.parse.ParseResult`` for the (possibly prefixed) address.
    """
    has_scheme = re.search(r'^[A-Za-z0-9+.\-]+://', address) is not None
    target = address if has_scheme else 'http://{0}'.format(address)
    return urllib.parse.urlparse(target)

In [3]:
class URLPreprocessor:
    """Compute lexical features of a single URL.

    The constructor parses the URL (via the notebook-level ``urlparse`` helper
    and ``tld.get_tld``) and stores every count/ratio feature as an attribute.
    ``process()`` returns all features as one flat dict.

    Args:
        url: The URL string. Single quotes are stripped before parsing.
        benign: Label stored unchanged and echoed back by ``process()``.

    Raises:
        ZeroDivisionError: from the constructor when the URL or its network
            location (domain) is empty, because several ratios divide by
            their lengths. Callers in this notebook rely on this to skip
            unusable rows.
    """

    # Per-character probabilities used as the reference distribution in
    # ``entropy``. These numbers were calculated from the Alexa Top 1 million
    # domains as of September 15th, 2017. TLDs and instances of 'www' were
    # removed so 'www.google.com' would be treated as 'google' and
    # 'images.google.com' would be 'images.google'.
    _CHAR_PROBABILITIES = {
        '-': 0.013342298553905901,
        # The real probability of '@' is unknown; it is given the lowest value
        # because it should not appear in a domain at all.
        '@': 9.04562613824129e-08,
        '_': 9.04562613824129e-06,
        '0': 0.0024875471880163543,
        '1': 0.004884638114650296,
        '2': 0.004373560237839663,
        '3': 0.0021136613076357144,
        '4': 0.001625197496170685,
        '5': 0.0013070929769758662,
        '6': 0.0014880054997406921,
        '7': 0.001471421851820583,
        '8': 0.0012663876593537805,
        '9': 0.0010327089841158806,
        'a': 0.07333590631143488,
        'b': 0.04293204925644953,
        'c': 0.027385633133525503,
        'd': 0.02769469202658208,
        'e': 0.07086192756262588,
        'f': 0.01249653250998034,
        'g': 0.038516276096631406,
        'h': 0.024017645001386995,
        'i': 0.060447396668797414,
        'j': 0.007082725266242929,
        'k': 0.01659570875496002,
        'l': 0.05815885325582237,
        'm': 0.033884915513851865,
        'n': 0.04753175014774523,
        'o': 0.09413783122067709,
        'p': 0.042555148167356144,
        'q': 0.0017231917793349655,
        'r': 0.06460084667060655,
        's': 0.07214640647425614,
        't': 0.06447722311338391,
        'u': 0.034792493336388744,
        'v': 0.011637198026847418,
        'w': 0.013318176884203925,
        'x': 0.003170491961453572,
        'y': 0.016381628936354975,
        'z': 0.004715786426736459
    }

    def __init__(self, url, benign):
        url = url.replace(r"'", '')
        self.url = url
        self.benign = benign
        self.parsedurl = urlparse(url)
        # With fail_silently=True, get_tld returns None instead of raising.
        self.tld = get_tld(url, fail_silently=True, fix_protocol=True)
        self.domain = self.parsedurl.netloc
        self.domain_count = len(self.domain.split('.'))
        self.domain_len = len(self.domain)
        self.protocol = self.parsedurl.scheme
        self.path = self.parsedurl.path
        self.parameters = self.parsedurl.query
        self.anchor = self.parsedurl.fragment

        # Simple substring flags on the raw URL.
        self.has_https = 1 if 'https:' in url else 0
        self.has_html = 1 if '.html' in url else 0
        self.has_php = 1 if '.php' in url else 0
        self.has_exe = 1 if '.exe' in url else 0
        # Dotted-quad pattern anywhere in the URL.
        self.has_ip = 1 if re.search(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", url) else 0

        # Number of characters that are not alphanumeric.
        self.symbols_count = sum(1 for char in url if not char.isalnum())

        # Length ratios. These deliberately raise ZeroDivisionError when the
        # URL or the domain is empty (see class docstring).
        self.path_url_ratio = len(self.path) / len(self.url)
        self.params_domain_ratio = len(self.parameters) / len(self.domain)
        self.params_url_ratio = len(self.parameters) / len(self.url)
        self.domain_url_ratio = len(self.domain) / len(self.url)
        self.number_rate_url = self._digit_ratio(self.url)
        self.path_domain_ratio = len(self.path) / len(self.domain)
        path_tokens = self.path.split("/")
        self.avg_path_token_len = sum(len(token) for token in path_tokens) / len(path_tokens)

        # Counts of individual symbols (@, //, ., % and -), in addition to the
        # overall symbol count above.
        self.at_count = url.count(r'@')
        self.double_slash_count = url.count(r'//')
        self.hyphen_count = url.count(r'-')
        self.dot_count = url.count(r'.')
        self.percent_count = url.count(r'%')
        # Number of dot-separated labels in the TLD (0 when no TLD was found).
        self.tld_count = len(self.tld.split('.')) if isinstance(self.tld, str) else 0
        # Digit counts in the query string and in the domain.
        self.param_digit_count = len(re.findall(r'\d', self.parameters))
        self.domain_digit_count = len(re.findall(r'\d', self.domain))

        # Share of digits in the text after the last '/' of the URL.
        # (Previously this divided by the string itself instead of its length,
        # so the error handler always forced the feature to 0.)
        self.number_rate_after_path = self._digit_ratio(self.url.split("/")[-1])

        # Query length relative to path length; 0 when there is no path.
        self.params_path_ratio = len(self.parameters) / len(self.path) if self.path else 0

        # Share of digits in the path; 0 when there is no path.
        self.number_rate_file_name = self._digit_ratio(self.path)

    @staticmethod
    def _digit_ratio(text):
        """Return the fraction of ``text`` made of ASCII digits, or 0 if empty."""
        if not text:
            return 0
        return len(re.sub("[^0-9]", "", text)) / len(text)

    def char_cont_rate(self):
        """Return the character continuity rate of the domain.

        This is the sum of the longest run of letters, the longest run of
        digits and the longest run of symbols (characters outside
        ``[a-zA-Z0-9]``), divided by the domain length. Returns 0.0 for an
        empty domain.
        """
        total_len = len(self.domain)
        if total_len == 0:
            return 0.0

        symbols = set(re.findall(r'[^a-zA-Z0-9]', self.domain))

        current_char_sequence = 0
        longest_char_sequence = 0

        current_digit_sequence = 0
        longest_digit_sequence = 0

        current_sym_sequence = 0
        longest_sym_sequence = 0

        for char in self.domain:
            if char.isalpha():
                current_char_sequence += 1
                current_digit_sequence = 0
            elif char.isdigit():
                current_digit_sequence += 1
                current_char_sequence = 0
            else:
                current_char_sequence = 0
                current_digit_sequence = 0

            if char in symbols:
                current_sym_sequence += 1
            else:
                current_sym_sequence = 0

            # Update all three maxima on every character so that a run ending
            # at the last character of the domain is counted too.
            longest_char_sequence = max(longest_char_sequence, current_char_sequence)
            longest_digit_sequence = max(longest_digit_sequence, current_digit_sequence)
            longest_sym_sequence = max(longest_sym_sequence, current_sym_sequence)

        return sum([longest_char_sequence, longest_digit_sequence, longest_sym_sequence]) / total_len

    def entropy(self):
        """Return the relative entropy (base 2) of the domain's characters.

        The TLD, dots, ':', '%', '=' and 'www' are removed and the rest is
        lower-cased; the observed character distribution is then compared
        against ``_CHAR_PROBABILITIES``. Returns 0.0 when nothing is left.

        Raises:
            KeyError: if the cleaned domain contains a character that has no
                entry in ``_CHAR_PROBABILITIES``.
        """
        base = 2
        data = self.domain
        entropy = 0.0
        # Remove the TLD (when one was detected) and separator characters.
        if isinstance(self.tld, str):
            data = data.replace(self.tld, '')
        data = data.replace('.', '').replace(":", '').replace(r'%', '').replace('www', '').replace('=', '')
        data = data.lower()
        length = len(data) * 1.0

        if length > 0:
            for char, count in Counter(data).items():
                observed = count / length
                # A missing character raises KeyError; callers use that to
                # discard the URL.
                expected = self._CHAR_PROBABILITIES[char]
                entropy += observed * math.log((observed / expected), base)
        return entropy

    def process(self):
        """Return every feature of this URL as a flat dict (one dataset row)."""
        entropy = self.entropy()
        char_cont_rate = self.char_cont_rate()
        return {
            "url": self.url,
            "benign": self.benign,
            'domain_len': self.domain_len,
            "domain_count": self.domain_count,
            "number_rate_file_name": self.number_rate_file_name,
            "domain_url_ratio": self.domain_url_ratio,
            "number_rate_url": self.number_rate_url,
            "path_domain_ratio": self.path_domain_ratio,
            "number_rate_after_path": self.number_rate_after_path,
            "avg_path_token_len": self.avg_path_token_len,
            "params_path_ratio": self.params_path_ratio,
            "params_url_ratio": self.params_url_ratio,
            "params_domain_ratio": self.params_domain_ratio,
            "path_url_ratio": self.path_url_ratio,
            "entropy": entropy,
            "char_cont_rate": char_cont_rate,
            "at_count": self.at_count,
            "double_slash_count": self.double_slash_count,
            "hyphen_count": self.hyphen_count,
            "dot_count": self.dot_count,
            "symbols_count": self.symbols_count,
            "tld_count": self.tld_count,
            "domain_digit_count": self.domain_digit_count,
            "param_digit_count": self.param_digit_count,
            "percent_count": self.percent_count,
            "has_https": self.has_https,
            "has_html": self.has_html,
            'has_php': self.has_php,
            'has_exe': self.has_exe,
            'has_ip': self.has_ip
        }

In [4]:
# Sample URL reused by the following cell.
exam_url = 'tualimpa.pt/include/cp.php?m=login'
# Flag is 1 when a dotted-quad (IPv4-looking) pattern occurs anywhere in the URL.
has_ip = 1 if re.search(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", exam_url) else 0
has_ip



0

In [5]:
# A second sample URL; it is not used by the statements in this cell.
example_url = "kkk.1\\g@n.com/Recipe/Pesto-Chicken/Detail.aspx?evt19=1&referringHubId=15041"

# The parse result is discarded here; URLPreprocessor parses the URL itself.
urlparse(exam_url)
url_obj = URLPreprocessor(exam_url,1)
# The feature names double as the column list of the output DataFrame built later.
column_list = url_obj.process().keys()
# Display the full feature dict for the sample URL.
url_obj.process()

{'url': 'tualimpa.pt/include/cp.php?m=login',
 'benign': 1,
 'domain_len': 11,
 'domain_count': 2,
 'number_rate_file_name': 0.0,
 'domain_url_ratio': 0.3235294117647059,
 'number_rate_url': 0.0,
 'path_domain_ratio': 1.3636363636363635,
 'number_rate_after_path': 0,
 'avg_path_token_len': 4.333333333333333,
 'params_path_ratio': 0.4666666666666667,
 'params_url_ratio': 0.20588235294117646,
 'params_domain_ratio': 0.6363636363636364,
 'path_url_ratio': 0.4411764705882353,
 'entropy': 1.4910735607000085,
 'char_cont_rate': 0.8181818181818182,
 'at_count': 0,
 'double_slash_count': 0,
 'hyphen_count': 0,
 'dot_count': 2,
 'symbols_count': 6,
 'tld_count': 1,
 'domain_digit_count': 0,
 'param_digit_count': 0,
 'percent_count': 0,
 'has_https': 0,
 'has_html': 0,
 'has_php': 1,
 'has_exe': 0,
 'has_ip': 0}

In [6]:
import os

directory_path = './datasets/'
# os.listdir returns entries in arbitrary order, but later cells pick a file by
# position (filenames[n]). Sort case-insensitively so the index is stable
# across machines and runs.
filenames = sorted(
    (f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))),
    key=lambda name: (name.lower(), name),
)

In [7]:
# Show the dataset files found in directory_path.
filenames

['dataset_malicious_rlilojr.csv',
 'datav1v2_dataset.csv',
 'ISCXURL2016_dataset.csv',
 'malicious_phish_dataset.csv',
 'phishtank_verified_dataset.csv',
 'urlset_processed.csv',
 'websites_2gb_dataset.csv']

In [9]:
# n is the position in `filenames` of the dataset to load for this run.
n = 6
# directory_path ends with '/', so plain concatenation yields a valid path.
df_concat = pd.read_csv(directory_path+filenames[n])
# Echo which file was loaded.
print(filenames[n])

websites_2gb_dataset.csv


In [10]:
# Drop every row that has a missing value in any column.
df_concat = df_concat.dropna()


In [34]:
# Cast the 'label' column to int (must run before 'label' is renamed to 'benign').
df_concat = df_concat.astype({'label':int})

In [35]:
# Rename to the column names the batch `process` function reads: 'url' and 'benign'.
df_concat.rename(columns={"domain":'url',"label":'benign'},inplace=True)

In [12]:
# Shuffle all rows and renumber the index. The fixed random_state makes the
# shuffle (and therefore the 50k-row chunks cut from it later) reproducible.
df_concat = df_concat.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Inspect the shuffled frame.
df_concat

Unnamed: 0,url,benign
0,http://www.celticqh.com,1
1,http://members.tripod.com/justafeelingnet/,0
2,http://www.unicorn-dream.co.uk/destrier/,0
3,http://www.citibuild.co.uk/,1
4,http://www.nimbusuk.com/,1
...,...,...
226615,http://sonyaisaacs.tripod.com/index.htm,1
226616,http://www.cse.buffalo.edu/sneps/,1
226617,http://www.endtimeassembly.org/,1
226618,http://www.silverraver.com/,1


In [12]:
# Count the non-null values per column for each 'benign' label value.
df_concat.groupby("benign").count()

Unnamed: 0_level_0,url
benign,Unnamed: 1_level_1
0,40246


In [8]:
# Keep only the rows whose 'benign' value is 0.
# NOTE(review): this overwrites df_concat, so later cells see only these rows
# if this cell is run — confirm whether it belongs in the Run-All flow.
df_concat = df_concat.loc[df_concat["benign"] == 0]


In [37]:
# Drop the first column of the frame, in place.
# NOTE(review): the drop is positional and in place, so running this cell a
# second time removes the next column as well — run it exactly once.
df_concat.drop(columns=df_concat.columns[0],inplace=True)

In [14]:
# Inspect the frame again after the column drop.
df_concat

Unnamed: 0,url,benign
0,http://www.celticqh.com,1
1,http://members.tripod.com/justafeelingnet/,0
2,http://www.unicorn-dream.co.uk/destrier/,0
3,http://www.citibuild.co.uk/,1
4,http://www.nimbusuk.com/,1
...,...,...
226615,http://sonyaisaacs.tripod.com/index.htm,1
226616,http://www.cse.buffalo.edu/sneps/,1
226617,http://www.endtimeassembly.org/,1
226618,http://www.silverraver.com/,1


In [15]:
# Split the frame into consecutive chunks of 50,000 rows so each can be
# processed and saved separately; the last chunk takes whatever remains.
df_concat_1 = df_concat[:50000]
df_concat_2 = df_concat[50000:100000]
df_concat_3 = df_concat[100000:150000]
df_concat_4 = df_concat[150000:200000]
df_concat_5 = df_concat[200000:]



In [16]:


def process(df_concat):
    """Extract features for every row of ``df_concat`` and append them to ``df``.

    Each row must provide a 'url' and a 'benign' value. One trailing '/' is
    removed from the URL, features are computed with ``URLPreprocessor`` and
    the resulting dict is appended as a new row of the notebook-level
    DataFrame ``df`` (which must already exist with the feature columns).

    Rows are skipped silently when the URL cannot be preprocessed, or when
    feature extraction raises ``KeyError`` (a character without a known
    probability). Any other error is printed together with the URL and its
    index, and the loop continues.

    Args:
        df_concat: DataFrame with 'url' and 'benign' columns.

    Returns:
        None. The result is the side effect on ``df``.
    """
    length = len(df_concat)
    # `position` counts every visited row, so the progress line ends at
    # length/length even when some rows are skipped.
    for position, (index, row) in enumerate(df_concat.iterrows(), start=1):
        url = row['url']
        # Work on a local copy instead of writing into the iterrows() row;
        # non-string or empty values are left for URLPreprocessor to reject.
        if isinstance(url, str) and url.endswith('/'):
            url = url[:-1]

        try:
            try:
                url_obj = URLPreprocessor(url, row['benign'])
            except Exception:
                # Unusable URL (e.g. empty domain). `Exception` rather than a
                # bare except so that interrupting the kernel still works.
                continue
            try:
                new_row = url_obj.process()
            except KeyError:
                continue

            # NOTE: appending row by row is slow for large inputs, but it keeps
            # the existing contract of filling the shared `df` in place.
            df.loc[len(df)] = new_row

        except Exception as e:
            print('\n',e)
            print(url,index)
        finally:
            print(f"Progress: {position}/{length}", end='\r')


In [26]:
# Start from an empty frame with the feature columns, then fill it from chunk 5.
# `process` appends to this module-level `df`.
df = pd.DataFrame(columns=column_list) 
process(df_concat_5)

Progress: 26619/26620

In [27]:
directory_save = './datasets/preprocessed/'
# Write the extracted features without the index column.
# NOTE(review): the 'processed_5_' prefix is hard-coded — keep it in sync with
# the chunk that was passed to process().
df.to_csv(directory_save+'processed_5_'+filenames[n],index=False)

In [29]:
# Edge-case probe: '?' has no host, so the parsed domain is empty and the
# constructor's division by len(self.domain) fails.
url_ob = URLPreprocessor('?',0)

ar iruneba


In [13]:
# Count the non-null values per feature column for each 'benign' label value.
df.groupby('benign').count()

Unnamed: 0_level_0,url,domain_len,domain_count,number_rate_file_name,domain_url_ratio,number_rate_url,path_domain_ratio,number_rate_after_path,avg_path_token_len,params_path_ratio,...,symbols_count,tld_count,domain_digit_count,param_digit_count,percent_count,has_https,has_html,has_php,has_exe,has_ip
benign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,237200,237200,237200,237200,237200,237200,237200,237200,237200,237200,...,237200,237200,237200,237200,237200,237200,237200,237200,237200,237200


In [4]:
# Quick check of the character length of a sample URL.
len('https://billowing-scene-36bb.abqerxo669.workers.dev/')

52