# Imports

In [103]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from feature_engine.encoding import OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Data

In [86]:
df = pd.read_csv('phishing_site_urls.csv')

In [87]:
df = df.sample(10000, random_state=777).reset_index(drop=True)

In [88]:
df.head()

Unnamed: 0,URL,Label
0,rickrodriguez.net/,good
1,facebook.com/creaturefeaturemusic?sk=app_24051...,good
2,pageinsider.com/bronfman.org,good
3,www.miskatonic.org/freebsd.html,good
4,www.eppicard.steelespaint.com/sitelogonclient/...,bad


# Feature Engineering

## Features No Leak

In [89]:
class BuildFeatures:
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.tokenizer = RegexpTokenizer(r'[A-Za-z]+')
        
        #Precisa mesmo do steemer??
        self.sbs = SnowballStemmer("english")
        return self

    def transform(self, X):
        
        #Domain
        X['domain'] = X['URL'].str.split("/", n=1).str[0]
        X['domain_splited'] = X['domain'].str.split('\.')
        X['qtd_dots_domain'] = X['domain'].str.count('\.')
        X['qtd_hifens_domain'] = X['domain'].str.count('-')
        X['qtd_underline_domain'] = X['domain'].str.count('\_')
        X['qtd_asterisco_domain'] = X['domain'].str.count('\*')
        X['qtd_numeros_domain'] = X['domain'].str.count('\d')
        X['qtd_www_domain'] = X['domain'].str.count('www')
        X['org_domain'] = X['domain'].str.contains('.org.')
        X['com_domain'] = X['domain'].str.contains('.com')
        X['gov_domain'] = X['domain'].str.contains('.gov')
        X['tokenized_domain'] = X['domain'].map(lambda text: self.tokenizer.tokenize(text))
        X['tokenized_stemmed_domain'] = X['tokenized_domain'].map(lambda text: [self.sbs.stem(word) for word in text])
        X['tokenized_stemmed_domain'] = X['tokenized_stemmed_domain'].map(lambda x: ' '.join(x))
        
        #Querie
        X['query'] = X['URL'].str.split('\?').str[1]
        X['qtd_args_query'] = X['query'].str.count('=')
        X['qtd_dots_query'] = X['query'].str.count('\.')
        
        # Total
        X['qtd_dots_total'] = X['URL'].str.count('\.')
        X['qtd_hifens_total'] = X['URL'].str.count('-')
        X['qtd_underline_total'] = X['URL'].str.count('\_')
        X['qtd_asterisco_total'] = X['URL'].str.count('\*')
        X['qtd_numeros_total'] = X['URL'].str.count('\d')
        X['qtd_slash_total'] = X['URL'].str.count('/')
        X['qtd_www_total'] = X['URL'].str.count('www')
        X['tokenized_total'] = X['URL'].map(lambda text: self.tokenizer.tokenize(text))
        X['tokenized_stemmed_total'] = X['tokenized_total'].map(lambda text: [self.sbs.stem(word) for word in text])
        X['tokenized_stemmed_total'] = X['tokenized_stemmed_total'].map(lambda x: ' '.join(x))
        
        
        return X

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)


## Features Leak

In [90]:
class BuildFeaturesLeak():
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        #Brincar com parametros max e min
        #TFIDF?
        self.cv_domain = CountVectorizer(decode_error='ignore',max_features=100).fit(X['tokenized_stemmed_domain'])
        self.cv_total = CountVectorizer(decode_error='ignore',max_features=100).fit(X['tokenized_stemmed_total'])
        return self
        
    def transform(self, X):
        #Remover stop words? não acho que seja necessário
        matrix_domain = self.cv_domain.transform(X['tokenized_stemmed_domain'])
        matrix_domain = pd.DataFrame(matrix_domain.toarray(), columns=[item + '_domain' for item in list(self.cv_total.get_feature_names_out())])
        
        matrix_total = self.cv_total.transform(X['tokenized_stemmed_total'])
        matrix_total = pd.DataFrame(matrix_total.toarray(), columns=[item + '_total' for item in list(self.cv_total.get_feature_names_out())])
        
        X = pd.concat([X, matrix_domain, matrix_total], axis=1)
        
        return X
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

## Fit Transform

In [91]:
bf = BuildFeatures()

In [92]:
df = bf.fit_transform(df)

In [93]:
bfl = BuildFeaturesLeak()

In [94]:
df = bfl.fit_transform(df)

In [98]:
df.head()

Unnamed: 0,URL,Label,domain,domain_splited,qtd_dots_domain,qtd_hifens_domain,qtd_underline_domain,qtd_asterisco_domain,qtd_numeros_domain,qtd_www_domain,org_domain,com_domain,gov_domain,tokenized_domain,tokenized_stemmed_domain,query,qtd_args_query,qtd_dots_query,qtd_dots_total,qtd_hifens_total,qtd_underline_total,qtd_asterisco_total,qtd_numeros_total,qtd_slash_total,qtd_www_total,tokenized_total,tokenized_stemmed_total,admindomain,amazondomain,ampdomain,anddomain,answerdomain,appdomain,archivdomain,articldomain,artistdomain,aspdomain,aspxdomain,audomain,battldomain,bindomain,bizdomain,blogdomain,blogspotdomain,brdomain,cadomain,cgidomain,citidomain,cmddomain,codomain,...,jstotal,kansatotal,linkedintotal,listtotal,logintotal,montrealtotal,movitotal,musictotal,myliftotal,nametotal,nettotal,newtotal,newstotal,oftotal,orgtotal,pagetotal,paypaltotal,peopltotal,phptotal,playertotal,producttotal,pubtotal,reftotal,rutotal,schooltotal,searchtotal,securtotal,sitetotal,sporttotal,statetotal,storitotal,teamtotal,thetotal,tototal,tooltotal,topictotal,uktotal,updattotal,ustotal,videototal,viewtotal,watchtotal,webtotal,webscrtotal,wikitotal,wikipediatotal,wptotal,wwwtotal,yahoototal,youtubtotal
0,rickrodriguez.net/,good,rickrodriguez.net,"[rickrodriguez, net]",1,0,0,0,0,0,False,False,False,"[rickrodriguez, net]",rickrodriguez net,,,,1,0,0,0,0,1,0,"[rickrodriguez, net]",rickrodriguez net,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,facebook.com/creaturefeaturemusic?sk=app_24051...,good,facebook.com,"[facebook, com]",1,0,0,0,0,0,False,True,False,"[facebook, com]",facebook com,sk=app_2405167945,1.0,0.0,1,0,1,0,10,1,0,"[facebook, com, creaturefeaturemusic, sk, app]",facebook com creaturefeaturemus sk app,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,pageinsider.com/bronfman.org,good,pageinsider.com,"[pageinsider, com]",1,0,0,0,0,0,False,True,False,"[pageinsider, com]",pageinsid com,,,,2,0,0,0,0,1,0,"[pageinsider, com, bronfman, org]",pageinsid com bronfman org,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,www.miskatonic.org/freebsd.html,good,www.miskatonic.org,"[www, miskatonic, org]",2,0,0,0,0,1,False,False,False,"[www, miskatonic, org]",www miskaton org,,,,3,0,0,0,0,1,1,"[www, miskatonic, org, freebsd, html]",www miskaton org freebsd html,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,www.eppicard.steelespaint.com/sitelogonclient/...,bad,www.eppicard.steelespaint.com,"[www, eppicard, steelespaint, com]",3,0,0,0,0,1,False,True,False,"[www, eppicard, steelespaint, com]",www eppicard steelespaint com,,,,4,0,0,0,0,2,1,"[www, eppicard, steelespaint, com, sitelogoncl...",www eppicard steelespaint com sitelogoncli ind...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [99]:
df.shape

(10000, 227)

# Encoding