In [33]:
# !pip3 install googlesearch-python
# !pip3 install tld
# !pip3 install SPF2IP
# !pip install install tldextract

In [26]:
import pandas as pd
import numpy as np
from tld import get_tld, is_tld
import re
import bz2
import pickle
from googlesearch import search
from SPF2IP import SPF2IP
from urllib.parse import urlparse, parse_qs
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

In [27]:
df = pd.read_csv("../data/malicious_phish.csv")
origin = pd.read_csv("../data/raw/dataset-phishing-domain-detection-cybersecurity/dataset_cybersecurity_michelle.csv")

origin.columns.tolist()

['qty_dot_url',
 'qty_hyphen_url',
 'qty_underline_url',
 'qty_slash_url',
 'qty_questionmark_url',
 'qty_equal_url',
 'qty_at_url',
 'qty_and_url',
 'qty_exclamation_url',
 'qty_space_url',
 'qty_tilde_url',
 'qty_comma_url',
 'qty_plus_url',
 'qty_asterisk_url',
 'qty_hashtag_url',
 'qty_dollar_url',
 'qty_percent_url',
 'qty_tld_url',
 'length_url',
 'qty_dot_domain',
 'qty_hyphen_domain',
 'qty_underline_domain',
 'qty_slash_domain',
 'qty_questionmark_domain',
 'qty_equal_domain',
 'qty_at_domain',
 'qty_and_domain',
 'qty_exclamation_domain',
 'qty_space_domain',
 'qty_tilde_domain',
 'qty_comma_domain',
 'qty_plus_domain',
 'qty_asterisk_domain',
 'qty_hashtag_domain',
 'qty_dollar_domain',
 'qty_percent_domain',
 'qty_vowels_domain',
 'domain_length',
 'domain_in_ip',
 'server_client_domain',
 'qty_dot_directory',
 'qty_hyphen_directory',
 'qty_underline_directory',
 'qty_slash_directory',
 'qty_questionmark_directory',
 'qty_equal_directory',
 'qty_at_directory',
 'qty_and_dir

In [28]:
#df_phish = df.copy()
df_phish = df.loc[(df['type'] == 'benign') | (df['type'] == 'phishing')]
df_phish[['type']].value_counts()


type    
benign      428103
phishing     94111
Name: count, dtype: int64

In [29]:
df_phish.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign


## Updated - functions

In [34]:
import tldextract
import os
def process_new_url(df):
    # Extract domain
    try:
        df['domain'] = df['url'].apply(lambda x: tldextract.extract(x).domain + '.' + tldextract.extract(x).suffix)
    except:
        df.loc[:, 'domain'] = None

    # Extract directory
    df['directory'] = df['url'].apply(lambda x: os.path.dirname(urlparse(x).path))

    # Extract params
    df['params'] = df['url'].apply(lambda x: urlparse(x).query)

    # Extract file
    df['file'] = df['url'].apply(lambda x: os.path.basename(urlparse(x).path))

    # Add length-related columns
    df.loc[:, 'domain_length'] = df['domain'].str.len()
    df.loc[:, 'directory_length'] = df['directory'].str.len()
    df.loc[:, 'params_length'] = df['params'].str.len()
    df.loc[:, 'file_length'] = df['file'].str.len()
    df.loc[:, 'length_url'] = df['url'].str.len()

    return df


In [35]:
df_phish = process_new_url(df_phish)
df_phish.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['domain'] = df['url'].apply(lambda x: tldextract.extract(x).domain + '.' + tldextract.extract(x).suffix)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['directory'] = df['url'].apply(lambda x: os.path.dirname(urlparse(x).path))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['params'] = df[

Unnamed: 0,url,type,domain,directory,params,file,domain_length,directory_length,params_length,file_length,length_url
0,br-icloud.com.br,phishing,br-icloud.com.br,,,br-icloud.com.br,16,0,0,16,16
1,mp3raid.com/music/krizz_kaliko.html,benign,mp3raid.com,mp3raid.com/music,,krizz_kaliko.html,11,17,0,17,35
2,bopsecrets.org/rexroth/cr/1.htm,benign,bopsecrets.org,bopsecrets.org/rexroth/cr,,1.htm,14,25,0,5,31
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign,buzzfil.net,/m/show-art,,ils-etaient-loin-de-s-imaginer-que-le-hibou-al...,11,11,0,88,118
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign,go.com,espn.go.com/nba/player/_/id/3457,,brandon-rush,6,32,0,12,45


### Reformat url dataset

In [36]:
def special_chars_qty(df):
    vowels=['a','e','i','o','u']
    features = {'at':'@', 'questionmark':'?', 'underline':'_', 'hyphen':'-', 'equal':'=', 'dot':'.', 
            'hashtag':'#', 'percent':'%', 'plus':'+', 'dollar':'$', 'exclamation':'!', 'asterisk':'*', 
            'comma':',', 'slash':'/', 'space':' ', 'tilde':'~','and':'&'}
    cols=['url','domain','params','directory','file']

    # add quantity of special characters for all cols
    for i in range(len(cols)):
        for key, value in features.items():
                df.loc[:, "qty_" + key + '_'+ cols[i]] = df_phish.loc[:, cols[i]].apply(lambda x: x.count(value) if x else -1)

    # add vowel qtr for domain
    df.loc[:,"qty_vowels_domain"] = df.loc[:,'domain'].apply(lambda x: 0 if x is None else sum(char.lower() in vowels for char in x))

    return df   


def shortened(url):
    match = re.search(
                      'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0 


def check_google_index(url):
    site = search(url, 3)
    return 1 if site else 0


# check if email in url
def check_email(url):
    regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
    if(re.fullmatch(regex, url)):
        return 1
    else:
        return 0


# check if domain has spf record
def check_spf(dom):
    try:
        spf_records = SPF2IP().query(dom)
        if spf_records:
            return 1
        else:
            return 0
    except:
        return 0


# check the quantity of tlds in url
def count_tlds(url):
    # Regular expression pattern to extract TLDs from a URL
    tld_pattern = r'\.([a-zA-Z]{2,})$'
    
    # Find all matches of the TLD pattern in the URL
    tlds = re.findall(tld_pattern, url)
    
    # Return the count of unique TLDs
    return len(set(tlds))


# check the number of params in url
def count_params(url):
    # Parse the URL
    parsed_url = urlparse(url)
    
    # Extract the query parameters
    query_params = parsed_url.query
    
    # Parse the query parameters
    parsed_query_params = parse_qs(query_params)
    
    # Count the number of parameters
    num_params = len(parsed_query_params)
    
    return num_params


# use this to complete preprocessing of new dataset 
def reformat_df(df):
     
    df_new = special_chars_qty(df)
    df_new.loc[:,'url_shortened'] = df_new.loc[:,'url'].apply(lambda x: shortened(x))

    # add the quantity of params in url
    df_new.loc[:,'qty_params'] = df_new.loc[:,'url'].apply(lambda x: count_params(x))

    # check if google index is available for url & domain
    df_new.loc[:,'url_google_index'] = df_new.loc[:,'url'].apply(lambda i: check_google_index(i))
    df_new.loc[:,'domain_google_index'] = df_new.loc[:,'domain'].apply(lambda i: check_google_index(i)) 

    # check if email is in utl
    df_new.loc[:,'email_in_url'] = df_new.loc[:,'url'].apply(lambda i: check_email(i))

    # check if domain has spf record
    df_new.loc[:,'domain_spf'] = df_new.loc[:,'domain'].apply(check_spf)

    # check qty of tld in url
    df_new.loc[:,'qty_tld_url']=df_new.loc[:,'url'].apply(count_tlds)
    
    # Check if TLD is present in params column and return 1 or -1
    df_new['tld_present_params'] = df_new.apply(lambda row: 1 if tldextract.extract(row['params']).suffix in row['domain'] else -1, axis=1)

    # Create a mapping dictionary
    label_mapping = {'benign': 0, 'phishing': 1}
    df_new['type'] = df_new['type'].map(label_mapping)
    
    return df_new
    
    

  'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
  'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
  'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
  'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
  'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
  'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
  'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
  'tr\.im|link\.zip\.net',


In [37]:
# takes <60s
df_res = reformat_df(df_phish)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "qty_" + key + '_'+ cols[i]] = df_phish.loc[:, cols[i]].apply(lambda x: x.count(value) if x else -1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "qty_" + key + '_'+ cols[i]] = df_phish.loc[:, cols[i]].apply(lambda x: x.count(value) if x else -1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

In [38]:
# count how many -1 there are for each column
def count_minus_one(df):
    counts = {}
    for col in df.columns:
        # Count the occurrences of -1 in the column
        count_minus_one = (df[col] == -1).sum()
        
        # Store the count in the dictionary
        counts[col] = count_minus_one
    
    return counts


In [39]:
#take too long
'''
def count_redirects(url):
    try:
        response = requests.get(url, allow_redirects=True)
        num_redirects = len(response.history)
        return num_redirects
    except requests.exceptions.RequestException:
        # If an exception occurs during the request (e.g., invalid URL), return -1
        return -1

df_res['qty_redirects'] = df_res.loc[:,'url'].apply(count_redirects)
'''

"\ndef count_redirects(url):\n    try:\n        response = requests.get(url, allow_redirects=True)\n        num_redirects = len(response.history)\n        return num_redirects\n    except requests.exceptions.RequestException:\n        # If an exception occurs during the request (e.g., invalid URL), return -1\n        return -1\n\ndf_res['qty_redirects'] = df_res.loc[:,'url'].apply(count_redirects)\n"

In [40]:
# Check columns present in orginal df but not in df_phish
def complete_test_data(original, new_df):

    # add columns with unavailable info with -1
    missing_columns = list(original.columns.difference(new_df.columns))
    missing_columns.remove('phishing')
    for column in missing_columns:
        new_df.loc[:,column] = -1

    # remove url, domain, directory, params, file columns
    new_df = new_df.drop(columns=['url','domain','directory','params','file'])
    
    return new_df


In [41]:
df_final = complete_test_data(origin,df_res)
origin.columns.difference(df_final.columns)

  new_df.loc[:,column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.loc[:,column] = -1
  new_df.loc[:,column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.loc[:,column] = -1
  new_df.loc[:,column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.loc[:,column] = -1
  new_df.loc[:,column] = -1
A value is tryi

Index(['phishing'], dtype='object')

In [42]:
df_final.head()

Unnamed: 0,type,domain_length,directory_length,params_length,file_length,length_url,qty_at_url,qty_questionmark_url,qty_underline_url,qty_hyphen_url,...,qty_ip_resolved,qty_mx_servers,qty_nameservers,qty_redirects,server_client_domain,time_domain_activation,time_domain_expiration,time_response,tls_ssl_certificate,ttl_hostname
0,1,16,0,0,16,16,0,0,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,0,11,17,0,17,35,0,0,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,0,14,25,0,5,31,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,0,11,11,0,88,118,0,0,0,16,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
6,0,6,32,0,12,45,0,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [43]:
df_final.describe()

Unnamed: 0,type,domain_length,directory_length,params_length,file_length,length_url,qty_at_url,qty_questionmark_url,qty_underline_url,qty_hyphen_url,...,qty_ip_resolved,qty_mx_servers,qty_nameservers,qty_redirects,server_client_domain,time_domain_activation,time_domain_expiration,time_response,tls_ssl_certificate,ttl_hostname
count,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,...,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0,522214.0
mean,0.180215,13.491237,29.960221,7.43148,14.330052,55.545755,0.002683,0.165796,0.416134,1.59223,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
std,0.384367,4.692182,25.726849,30.260055,19.920389,44.435153,0.059797,0.405223,1.257731,3.091309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,0.0,10.0,16.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,0.0,13.0,24.0,0.0,10.0,43.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
75%,0.0,16.0,36.0,0.0,17.0,68.0,0.0,0.0,0.0,2.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
max,1.0,152.0,2160.0,2005.0,610.0,2175.0,10.0,20.0,79.0,87.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## Final comparison & adjustment with model features

In [44]:
def feature_check(model, df):
    extra_col = df.columns.difference(model.feature_names_in_)
    if not extra_col.empty:
        df = df.drop(columns=extra_col)
        df = df[model.feature_names_in_]
    return df

In [45]:
# obtain ttl for hostname of url
# import dns.resolver

# def get_ttl(url):
#     try:
#         result = dns.resolver.resolve(url, 'A')
#         if result.response.answer:
#             return int(result.response.answer[0].ttl)
#     except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN,
#             dns.exception.Timeout, dns.resolver.NoNameservers):
#         return 0
#     return 0

# df_phish.loc[:,'ttl_hostname'] = df_phish.loc[:,'domain'].apply(lambda i: get_ttl(i))

# import requests
# def check_ssl_certificate(url):
#     try:
#         response = requests.head(url)
#         # Check if the response status code is 200 (OK) or 301 (Moved Permanently)
#         if response.status_code == 200 or response.status_code == 301:
#             return 1  # SSL certificate is available
#     except requests.exceptions.SSLError:
#         pass  # SSL certificate is not available or there is an SSL error
#     except requests.exceptions.RequestException:
#         pass  # Handle other request exceptions if needed
#     return 0  # SSL certificate is not available

# df_phish.loc[:,'tls_ssl_certificate	'] = df_phish.loc[:,'url'].apply(check_ssl_certificate)

### fit trained model

In [46]:
import pickle

with open("../models/tuned_model.pkl", "rb") as f:
    model = pickle.load(f)

In [47]:
# # Load the compressed model
# with bz2.BZ2File('../model/rf_model_0327.pbz2', 'rb') as f:
#     compressed_model = f.read()

# # Decompress and load the model
# model = pickle.loads(compressed_model)

df_to_fit = feature_check(model,df_final)
# Compare predicted results with indicator column
predicted = model.predict(df_to_fit)
indicator_column = df_final['type'].values


In [48]:
count_minus_one(df_to_fit)

{'qty_dot_url': 0,
 'qty_hyphen_url': 0,
 'qty_underline_url': 0,
 'qty_slash_url': 0,
 'qty_questionmark_url': 0,
 'qty_equal_url': 0,
 'qty_at_url': 0,
 'qty_and_url': 0,
 'qty_exclamation_url': 0,
 'qty_space_url': 0,
 'qty_tilde_url': 0,
 'qty_comma_url': 0,
 'qty_plus_url': 0,
 'qty_asterisk_url': 0,
 'qty_hashtag_url': 0,
 'qty_dollar_url': 0,
 'qty_percent_url': 0,
 'qty_tld_url': 0,
 'length_url': 0,
 'qty_dot_domain': 0,
 'qty_hyphen_domain': 0,
 'qty_underline_domain': 0,
 'qty_at_domain': 0,
 'qty_vowels_domain': 0,
 'domain_length': 0,
 'domain_in_ip': 522214,
 'server_client_domain': 522214,
 'qty_dot_directory': 22046,
 'qty_hyphen_directory': 22046,
 'qty_underline_directory': 22046,
 'qty_slash_directory': 22046,
 'qty_questionmark_directory': 22046,
 'qty_equal_directory': 22046,
 'qty_at_directory': 22046,
 'qty_and_directory': 22046,
 'qty_exclamation_directory': 22046,
 'qty_space_directory': 22046,
 'qty_tilde_directory': 22046,
 'qty_comma_directory': 22046,
 'qty

In [49]:
# Evaluate prediction performance
# Create confusion matrix
confusion_mat = confusion_matrix(indicator_column, predicted)
# Assuming 'indicator_column' and 'predicted' are defined
accuracy = accuracy_score(indicator_column, predicted)
precision = precision_score(indicator_column, predicted)
print(confusion_mat)
print('accuracy:', accuracy)


[[287757 140346]
 [ 62320  31791]]
accuracy: 0.6119100598605169


In [None]:
confusion_mat_2 = confusion_matrix(indicator_column, predicted)
# Assuming 'indicator_column' and 'predicted' are defined
accuracy_2 = accuracy_score(indicator_column, predicted)
precision_2 = precision_score(indicator_column, predicted)
print(confusion_mat_2)
print('accuracy:', accuracy_2)

[[     1 428102]
 [    33  94078]]
accuracy: 0.1801541130647589


In [None]:
# check how many 0s and 1s predicted
np.unique(predicted,return_counts=True)

(array([0, 1]), array([    34, 522180]))