# Model
Random Forest classifier with **100** trees 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 

In [2]:
filename = 'final_dataset.csv'
df = pd.read_csv(filename)

In [3]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
X = df.drop(columns=["url",'benign'])
y = df['benign']

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=44)

In [6]:
model = RandomForestClassifier(n_estimators=100,random_state=44)


In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)


In [None]:
from sklearn import metrics
print("accuracy: ",metrics.accuracy_score(y_test,y_pred))

In [10]:
feature_imp = pd.Series(model.feature_importances_,index=X.columns)

In [11]:
feature_imp.sort_values(ascending=True)

number_rate_after_path    0.000000
at_count                  0.000106
double_slash_count        0.001783
tld_count                 0.006141
param_digit_count         0.011432
params_path_ratio         0.016989
params_url_ratio          0.017718
domain_digit_count        0.019298
params_domain_ratio       0.019397
number_rate_file_name     0.031697
number_rate_url           0.033988
symbols_count             0.036631
percent_count             0.036841
hyphen_count              0.047155
entropy                   0.059853
dot_count                 0.078150
avg_path_token_len        0.082618
char_cont_rate            0.088813
domain_url_ratio          0.100364
path_url_ratio            0.141084
path_domain_ratio         0.169941
dtype: float64

In [12]:
import re
import csv
import os
import math
import pandas as pd
import urllib.parse
from __future__ import division
from collections import Counter
from tld import get_tld
def urlparse(address):
    if not re.search(r'^[A-Za-z0-9+.\-]+://', address):
        address = 'http://{0}'.format(address)
    return urllib.parse.urlparse(address)



In [23]:
class URLPreprocessor:
    
    def __init__(self, url):
        url = url.replace(r"'",'')
        self.url = url
#         self.benign = benign
        self.parsedurl = urlparse(url)
        self.tld = get_tld(url, fail_silently=True,fix_protocol=True) 
        self.domain = self.parsedurl.netloc
        self.protocol = self.parsedurl.scheme
        self.path = self.parsedurl.path
        self.parameters = self.parsedurl.query
        self.anchor = self.parsedurl.fragment

        self.path_url_ratio = len(self.path) / len(self.url)
        self.params_domain_ratio = len(self.parameters) / len(self.domain)
        self.params_url_ratio = len(self.parameters) / len(self.url)  
        self.domain_url_ratio = len(self.domain) / len(self.url)
        self.number_rate_url = len(re.sub("[^0-9]", "", self.url)) / len(self.url)
        self.path_domain_ratio = len(self.path) / len(self.domain)
        self.avg_path_token_len = sum([len(i) for i in self.path.split("/")]) / len(self.path.split("/"))
        
        #simboloebis counts vamateb imitoro magasac aqvs mnishvneloba turme
        # @, //, ., % da - s counts vamateb da kide vamateb zogadad simboloebis counts
        self.at_count = url.count(r'@')
        self.double_slash_count = url.count(r'//')
        self.hyphen_count = url.count(r'-')
        self.dot_count = url.count(r'.')
        self.percent_count = url.count(r'%')
        self.symbols_count = len(re.findall(r'[:/=?.,;()]+',url))
        #vamateb top level domainebis counts
        try:
            self.tld_count = len(self.tld.split('.'))
        except:
            self.tld_count = 0
        #vamateb parametrebshi/queryshi da domainshi digit counts
        self.param_digit_count = len(re.findall(r'\d', self.parameters))
        self.domain_digit_count = len(re.findall(r'\d', self.domain))
        
        try:
            self.number_rate_after_path = len(re.sub("[^0-9]", "", self.url.split("/")[-1])) / self.url.split("/")[-1]
        except: 
            self.number_rate_after_path = 0
        
        try:
            self.params_path_ratio = len(self.parameters) / len(self.path)
        except:
            self.params_path_ratio = 0

        try:
            self.number_rate_file_name = len(re.sub("[^0-9]", "", self.path)) / len(self.path)
        except: 
            self.number_rate_file_name = 0

        

    
    
    
    def char_cont_rate(self):
        symbol_sequences = re.findall(r'[^a-zA-Z0-9]', self.domain)
        total_len = len(self.domain)
    
        current_char_sequence = 0
        longest_char_sequence = 0

        current_digit_sequence = 0
        longest_digit_sequence = 0
        
        current_sym_sequence = 0
        longest_sym_sequence = 0

        for char in self.domain:
            if char.isalpha():
                current_char_sequence += 1
                current_digit_sequence = 0  # Reset digit sequence count
            elif char.isdigit():
                current_digit_sequence += 1
                current_char_sequence = 0  # Reset character sequence count
            else:
                current_char_sequence = 0
                current_digit_sequence = 0
                
            if char in symbol_sequences:
                current_sym_sequence +=1
            else:
                longest_sym_sequence = max(longest_sym_sequence,current_sym_sequence)
                current_sym_sequence = 0
            
            longest_char_sequence = max(longest_char_sequence, current_char_sequence)
            longest_digit_sequence = max(longest_digit_sequence, current_digit_sequence)
            
        return sum([longest_char_sequence,longest_digit_sequence,longest_sym_sequence])/total_len        
    

    def entropy(self):
        
        base = 2
        data = self.domain
        entropy = 0.0
        #remove dots and tld
        try:
            data = data.replace(self.tld,'')
        except:
            pass
        data = data.replace('.','').replace(":",'').replace(r'%','')
        data = data.lower()
        length = len(data) * 1.0

        if length > 0:
            cnt = Counter(data)
            # These probability numbers were calculated from the Alexa Top
            # 1 million domains as of September 15th, 2017. TLDs and instances
            # of 'www' were removed so 'www.google.com' would be treated as
            # 'google' and 'images.google.com' would be 'images.google'.
            probabilities = {
                '-': 0.013342298553905901,
                #arvici @ simbolos probability da yvelaze dabals mivanicheb imitoro saertod arunda iyos
                '@': 9.04562613824129e-08,
                '_': 9.04562613824129e-06,
                '0': 0.0024875471880163543,
                '1': 0.004884638114650296,
                '2': 0.004373560237839663,
                '3': 0.0021136613076357144,
                '4': 0.001625197496170685,
                '5': 0.0013070929769758662,
                '6': 0.0014880054997406921,
                '7': 0.001471421851820583,
                '8': 0.0012663876593537805,
                '9': 0.0010327089841158806,
                'a': 0.07333590631143488,
                'b': 0.04293204925644953,
                'c': 0.027385633133525503,
                'd': 0.02769469202658208,
                'e': 0.07086192756262588,
                'f': 0.01249653250998034,
                'g': 0.038516276096631406,
                'h': 0.024017645001386995,
                'i': 0.060447396668797414,
                'j': 0.007082725266242929,
                'k': 0.01659570875496002,
                'l': 0.05815885325582237,
                'm': 0.033884915513851865,
                'n': 0.04753175014774523,
                'o': 0.09413783122067709,
                'p': 0.042555148167356144,
                'q': 0.0017231917793349655,
                'r': 0.06460084667060655,
                's': 0.07214640647425614,
                't': 0.06447722311338391,
                'u': 0.034792493336388744,
                'v': 0.011637198026847418,
                'w': 0.013318176884203925,
                'x': 0.003170491961453572,
                'y': 0.016381628936354975,
                'z': 0.004715786426736459
            }

            for char, count in cnt.items():
                observed = count / length
                expected = probabilities[char]
                entropy += observed * math.log((observed / expected), base)
        return entropy

        

    def process(self):
        entropy = self.entropy()
        char_cont_rate = self.char_cont_rate()
        return {
            "url":self.url,
#             "benign":self.benign,
            "number_rate_file_name" : self.number_rate_file_name,
            "domain_url_ratio" : self.domain_url_ratio,
            "number_rate_url" : self.number_rate_url,
            "path_domain_ratio" : self.path_domain_ratio,
            "number_rate_after_path" : self.number_rate_after_path,
            "avg_path_token_len" : self.avg_path_token_len,
            "params_path_ratio" : self.params_path_ratio,
            "params_url_ratio" : self.params_url_ratio,
            "params_domain_ratio" : self.params_domain_ratio,
            "path_url_ratio" : self.path_url_ratio,
            "entropy" : entropy,
            "char_cont_rate" : char_cont_rate,
            "at_count":self.at_count,
            "double_slash_count":self.double_slash_count,
            "hyphen_count":self.hyphen_count,
            "dot_count":self.dot_count,
            "symbols_count":self.symbols_count,
            "tld_count":self.tld_count,
            "domain_digit_count":self.domain_digit_count,
            "param_digit_count":self.param_digit_count,
            "percent_count":self.percent_count
            
            }

In [24]:
def pipeline(url_list):
    url_in = []
    for url in url_list:
        url_processor = URLPreprocessor(url)
        url_in.append(list(url_processor.process().values()))
    output = model.predict(url_in)
    output = [str(prediction) for prediction in output]

    return {"url":output[0]}


In [28]:
pipeline(["https://www.youtube.com"])



ValueError: could not convert string to float: 'https://www.youtube.com'

In [27]:
df = pd.read_csv('./urlset.csv/urlset.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xad in position 131194: invalid start byte