In [62]:
pip install tld

Note: you may need to restart the kernel to use updated packages.


# GoPhish feature extraction

### Importing libraries

In [63]:
import re #regex
import string
import logging

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tldextract import extract as tld_extract
from tld import get_tld, is_tld
from tld.exceptions import TldDomainNotFound, TldBadUrl, TldIOError

### Importing csv file

Edited csv file to only show a value for 1 if the url is malicious, defaced or has malware. The URL will be marked as 0 if it is benign

In [64]:
url_data = pd.read_csv('goPhish_url_data.csv')
url_data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [65]:
url_data.info() #Displaying some information about the csv

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   type    651191 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 9.9+ MB


### Checking for null values in dataset

In [66]:
url_data.isnull().sum() #isnull() returns boolean value, .sum() returns the sum of the values

url     0
type    0
dtype: int64

### Extracting count of each value

In [67]:
count = url_data['type'].value_counts()
print(count)

type
0    428103
1    223088
Name: count, dtype: int64


### Normalising the data

Removing 'www.' from the url to normalise the data and extract the featues necessary to detect. 

- 'regex=True' uses the regular expression pattern to find the 'www.' in the whole url string, otherwise it would only find the first instance in the string

In [68]:
url_data['url'] = url_data['url'].replace('www.','', regex=True)

In [69]:
url_data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://garage-pirenne.be/index.php?option=com_...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [70]:
url_data["type"].value_counts()

type
0    428103
1    223088
Name: count, dtype: int64

### Extracting url length

In [71]:
def get_url_length(url):
    prefixes = ['http://','https://'] #Defining common prefixes for URLs
    
    for prefix in prefixes: #Loop to exclude prefix in length of URL
        if url.startswith(prefix):
            url = url[len(prefix):]
    
    url = url.replace('www.','')
    
    return len(url)

In [72]:
url_data['url_len'] = url_data['url'].apply(lambda x: get_url_length(str(x))) #Adding column to show URL length

In [73]:
url_data.head()

Unnamed: 0,url,type,url_len
0,br-icloud.com.br,1,16
1,mp3raid.com/music/krizz_kaliko.html,0,35
2,bopsecrets.org/rexroth/cr/1.htm,0,31
3,http://garage-pirenne.be/index.php?option=com_...,1,77
4,http://adventure-nicaragua.net/index.php?optio...,1,228


### Extracting top level domain

Extracting the top level domain from the url using the tld module

- 'as_object = true' instructs the function to return a parsed url object
- Cells ran with no error but domains are not found. This could be for multiple reasons. Most likely the domain does not exist on the internet anymore

In [74]:
def extract_tld(url):
    try:
        res = get_tld(url, as_object = True, fail_silenty = False, fix_protocol = True)
        pri_domain = res.parsed_url.netloc
    except:
        pri_domain = None
    return pri_domain

In [75]:
url_data['domain'] = url_data['url'].apply(lambda x: extract_tld(x))

In [76]:
url_data.head()

Unnamed: 0,url,type,url_len,domain
0,br-icloud.com.br,1,16,
1,mp3raid.com/music/krizz_kaliko.html,0,35,
2,bopsecrets.org/rexroth/cr/1.htm,0,31,
3,http://garage-pirenne.be/index.php?option=com_...,1,77,
4,http://adventure-nicaragua.net/index.php?optio...,1,228,


### Extracting character type and count

This involves a few functions that extract 3 different values of the url.

- Count of letters
- Count of digits
- Count of special characters
- Lastly, there will be a list of each of the special characters to identify which ones have been used and if they are suspicious

In [None]:
def count_letters(url)
    num_letters = sum(char.isalpha() for char in url)
    return num_letters