# Imports

In [117]:
from pathlib import Path
import os
import sys

project_dir = Path().resolve().parent
#sys.path.append(project_dir)

import pandas as pd
import numpy as np
import re

import nltk
from nltk.tokenize import RegexpTokenizer
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from feature_engine.encoding import OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline


pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Data

In [86]:
df = pd.read_csv(os.path.join(project_dir, 'data', 'phishing_site_urls.csv'))

In [87]:
df = df.sample(10000, random_state=777).reset_index(drop=True)

In [88]:
df.head()

Unnamed: 0,URL,Label
0,rickrodriguez.net/,good
1,facebook.com/creaturefeaturemusic?sk=app_24051...,good
2,pageinsider.com/bronfman.org,good
3,www.miskatonic.org/freebsd.html,good
4,www.eppicard.steelespaint.com/sitelogonclient/...,bad


# Feature Engineering

## Features No Leak

In [89]:
def get_filename_extension(text):
    file = os.path.basename(text)
    
    if '.' not in file:
        return 'NULL'
    
    return file.split('.')

In [90]:
def is_valid_ip(domain):
    ip_pattern = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
    
    if re.match(ip_pattern, domain):
        return 1
    else:
        return 0

In [91]:
def get_qty_features(X, cols):
    
    for col in cols:
        #Len
        X[f'len_{col}'] = X[col].str.len()
        #Dots
        X[f'qty_dots_{col}'] = X[col].str.count('.')
        #Hyphen
        X[f'qty_hyphens_{col}'] = X[col].str.count('-')
        #Underscore
        X[f'qty_undescore_{col}'] = X[col].str.count('_')
        #Numbers
        X[f'qty_numbers_{col}'] = X[col].str.count('\b')
        #Vogais
        X[f'qty_vogais_{col}'] = X[col].str.count(r'[aeiouAEIOU]')
        #Especiais
        X[f'qty_especiais_{col}'] = X[col].str.count(r'[!@#$%^&*()_+]')
        

In [92]:
def get_file_name_extension(X):
    X['file'] = X['URL'].apply(os.path.basename)
    
    X['file'] = np.where(X['file'].str.contains('\.'),X['file'], '')
    
    X[['file_name', 'file_extension']] = X['file'].str.split('.', n=1, expand=True).fillna('')
    

In [146]:
def get_valid_words(text, tokenizer):
    text = tokenizer.tokenize(text)
    
    result = ' '.join(text)
    
    return result

In [125]:
class BuildFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        
        X_tmp = X.reset_index(drop=True)
        
        #Domain
        X_tmp['domain'] = X_tmp['URL'].str.split("/", n=1).str[0].fillna('')
        X_tmp['domain_splited'] = X_tmp['domain'].str.split('\.')
        
        X_tmp['org_domain'] = X_tmp['domain_splited'].apply(lambda x: 1 if 'org' in x else 0)
        X_tmp['com_domain'] = X_tmp['domain_splited'].apply(lambda x: 1 if 'com' in x else 0)
        X_tmp['gov_domain'] = X_tmp['domain_splited'].apply(lambda x: 1 if 'gov' in x else 0)
        
        #Query
        X_tmp['query'] = X_tmp['URL'].str.split('\?').str[1].fillna('')
        X_tmp['qtd_args_query'] = X_tmp['query'].str.count('=')
        
        #File
        get_file_name_extension(X_tmp)
        
        #General Features
        get_qty_features(X_tmp, ['URL', 'domain', 'query', 'file', 'file_name', 'file_extension'])
        
        X_tmp['have_domain'] = np.where(X_tmp['domain'] == '', 0, 1)
        X_tmp['have_query'] = np.where(X_tmp['query'] == '', 0, 1)
        X_tmp['have_file'] = np.where(X_tmp['file'] == '', 0, 1)
        
        return X_tmp

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)


## Features Embedding

In [136]:
class BuildFeaturesEmbedding(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        
    def fit(self, X, y=None):
        
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        
        return self
    
    def transform(self, X):
        
        X_tmp = X.reset_index(drop=True)
        
        X_tmp['domain'] = X_tmp['URL'].str.split("/", n=1).str[0].fillna('')
        
        X_tmp['tokenized_domain'] = X_tmp['domain'].map(lambda text: get_valid_words(text, self.tokenizer))
        
        X_tmp['tokenized_total'] = X_tmp['URL'].map(lambda text: get_valid_words(text, self.tokenizer))
        
        return X_tmp
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

## Features Embedding Leak

In [127]:
class BuildFeaturesEmbeddingLeak(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):        
        self.cv_domain = CountVectorizer(decode_error='ignore').fit(X['tokenized_domain'])
        self.cv_total = CountVectorizer(decode_error='ignore').fit(X['tokenized_total'])
        
    def transform(self, X):
        X_tmp = X.reset_index(drop=True)
        matrix_domain = self.cv_domain.transform(X_tmp['tokenized_domain'])
        matrix_domain = pd.DataFrame(matrix_domain.toarray(), columns=[item + '_domain' for item in list(self.cv_domain.get_feature_names_out())])
        
        matrix_total = self.cv_total.transform(X_tmp['tokenized_total'])
        matrix_total = pd.DataFrame(matrix_total.toarray(), columns=[item + '_total' for item in list(self.cv_total.get_feature_names_out())])
        
        X_tmp = pd.concat([X_tmp, matrix_domain, matrix_total], axis=1)
        
        return X_tmp
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

## Fit Transform

In [137]:
pipe1 = Pipeline(
    steps = [
        ('prep', BuildFeatures())
    ]
)
pipe1

In [138]:
df1 = pipe1.transform(df)

In [141]:
df1.shape

(10000, 57)

In [142]:
df1.head()

Unnamed: 0,URL,Label,domain,domain_splited,org_domain,com_domain,gov_domain,query,qtd_args_query,file,file_name,file_extension,len_URL,qty_dots_URL,qty_hyphens_URL,qty_undescore_URL,qty_numbers_URL,qty_vogais_URL,qty_especiais_URL,len_domain,qty_dots_domain,qty_hyphens_domain,qty_undescore_domain,qty_numbers_domain,qty_vogais_domain,qty_especiais_domain,len_query,qty_dots_query,qty_hyphens_query,qty_undescore_query,qty_numbers_query,qty_vogais_query,qty_especiais_query,len_file,qty_dots_file,qty_hyphens_file,qty_undescore_file,qty_numbers_file,qty_vogais_file,qty_especiais_file,len_file_name,qty_dots_file_name,qty_hyphens_file_name,qty_undescore_file_name,qty_numbers_file_name,qty_vogais_file_name,qty_especiais_file_name,len_file_extension,qty_dots_file_extension,qty_hyphens_file_extension,qty_undescore_file_extension,qty_numbers_file_extension,qty_vogais_file_extension,qty_especiais_file_extension,have_domain,have_query,have_file
0,rickrodriguez.net/,good,rickrodriguez.net,"[rickrodriguez, net]",0,0,0,,0,,,,18,18,0,0,0,6,0,17,17,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,facebook.com/creaturefeaturemusic?sk=app_24051...,good,facebook.com,"[facebook, com]",0,1,0,sk=app_2405167945,1,,,,51,51,0,1,0,16,1,12,12,0,0,0,5,0,17,17,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
2,pageinsider.com/bronfman.org,good,pageinsider.com,"[pageinsider, com]",0,1,0,,0,bronfman.org,bronfman,org,28,28,0,0,0,9,0,15,15,0,0,0,6,0,0,0,0,0,0,0,0,12,12,0,0,0,3,0,8,8,0,0,0,2,0,3,3,0,0,0,1,0,1,0,1
3,www.miskatonic.org/freebsd.html,good,www.miskatonic.org,"[www, miskatonic, org]",1,0,0,,0,freebsd.html,freebsd,html,31,31,0,0,0,7,0,18,18,0,0,0,5,0,0,0,0,0,0,0,0,12,12,0,0,0,2,0,7,7,0,0,0,2,0,4,4,0,0,0,0,0,1,0,1
4,www.eppicard.steelespaint.com/sitelogonclient/...,bad,www.eppicard.steelespaint.com,"[www, eppicard, steelespaint, com]",0,1,0,,0,index.html,index,html,56,56,0,0,0,17,0,29,29,0,0,0,9,0,0,0,0,0,0,0,0,10,10,0,0,0,2,0,5,5,0,0,0,2,0,4,4,0,0,0,0,0,1,0,1


In [147]:
pipe2 = Pipeline(
    steps = [
        ('Embedding', BuildFeaturesEmbedding()),
        ('Vectorize', BuildFeaturesEmbeddingLeak())
    ]
)
pipe2

In [148]:
df2 = pipe2.fit_transform(df)

In [149]:
df2.shape

(10000, 29088)

In [150]:
df2.head()

Unnamed: 0,URL,Label,domain,tokenized_domain,tokenized_total,aaa_domain,aaacaricatures_domain,aaahq_domain,aaeadlxbvjodkcs_domain,aaretroleague_domain,aaronmaxdesign_domain,aass_domain,aastp_domain,aaus_domain,abacast_domain,abacos_domain,abbigliamentotorino_domain,abbsearch_domain,abc_domain,abccornet_domain,abcdespanol_domain,abcfamily_domain,abclocal_domain,abcnews_domain,abdouworld_domain,abebooks_domain,abgabiei_domain,abm_domain,about_domain,aboutfilm_domain,abovethelaw_domain,abqtrib_domain,abris_domain,absolutearts_domain,absoluteastronomy_domain,absolutemadonna_domain,absolutenow_domain,abyznewslinks_domain,ac_domain,acaciainvest_domain,academia_domain,academic_domain,academickids_domain,academicsofflight_domain,acapella_domain,acbanet_domain,acc_domain,accademiaperlaformazione_domain,access_domain,accessdance_domain,...,zra_total,zrchem_total,zrcqyjjidjjxt_total,zrdom_total,zrvk_total,zrzyhmnzxzniwnotp_total,zs_total,zsem_total,zseries_total,zshg_total,zsigri_total,zst_total,zsul_total,zt_total,zuberi_total,zugzwang_total,zumq_total,zurad_total,zurbr_total,zuse_total,zuserver_total,zuvslr_total,zuvyz_total,zvh_total,zvoadihdn_total,zw_total,zwcy_total,zwtmdwdus_total,zx_total,zxadf_total,zxaze_total,zxd_total,zxdeh_total,zxdy_total,zxhpa_total,zxjob_total,zxjremv_total,zxqkcohl_total,zxy_total,zy_total,zydmrj_total,zyv_total,zyzyzykuwudqazs_total,zz_total,zzjixiang_total,zzjwhz_total,zzsflduf_total,zzvjgoyyaghgxsfnc_total,zzz_total,zzzz_total
0,rickrodriguez.net/,good,rickrodriguez.net,rickrodriguez net,rickrodriguez net,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,facebook.com/creaturefeaturemusic?sk=app_24051...,good,facebook.com,facebook com,facebook com creaturefeaturemusic sk app,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,pageinsider.com/bronfman.org,good,pageinsider.com,pageinsider com,pageinsider com bronfman org,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,www.miskatonic.org/freebsd.html,good,www.miskatonic.org,www miskatonic org,www miskatonic org freebsd html,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,www.eppicard.steelespaint.com/sitelogonclient/...,bad,www.eppicard.steelespaint.com,www eppicard steelespaint com,www eppicard steelespaint com sitelogonclient ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
