In [6]:
pip install tld

Collecting tld
  Downloading tld-0.13-py2.py3-none-any.whl (263 kB)
     -------------------------------------- 263.8/263.8 kB 2.7 MB/s eta 0:00:00
Installing collected packages: tld
Successfully installed tld-0.13
Note: you may need to restart the kernel to use updated packages.


# GoPhish feature extraction

### Importing libraries

In [2]:
import re #regex
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from keras.models import Sequential
from colorama import Fore
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from tld import get_tld, is_tld






### Importing csv file

Edited csv file to only show a value for 1 if the url is malicious, defaced or has malware. The URL will be marked as 0 if it is benign

In [8]:
url_data = pd.read_csv('goPhish_url_data.csv') #reading data
url_data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [9]:
url_data.info() #Displaying some information about the csv

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   type    651191 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 9.9+ MB


### Checking for null values in dataset

In [10]:
url_data.isnull().sum() #isnull() returns boolean value, .sum() returns the sum of the values

url     0
type    0
dtype: int64

### Extracting count of each value

In [11]:
count = url_data['type'].value_counts()
print(count)

0    428103
1    223088
Name: type, dtype: int64


### Normalising the data

Removing 'www.' from the url to normalise the data and extract the featues necessary to detect. 

- 'regex=True' uses the regular expression pattern to find the 'www.' in the whole url string, otherwise it would only find the first instance in the string

In [12]:
url_data['url'] = url_data['url'].replace('www.','', regex=True) #replacing www

In [13]:
url_data.head()

Unnamed: 0,url,type
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://garage-pirenne.be/index.php?option=com_...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


### Extracting url length

In [14]:
def get_url_length(url):
    prefixes = ['http://','https://'] #Defining common prefixes for URLs
    
    for prefix in prefixes: #Loop to exclude prefix in length of URL
        if url.startswith(prefix):
            url = url[len(prefix):]
    
    url = url.replace('www.','')
    
    return len(url)

In [15]:
url_data['url_len'] = url_data['url'].apply(lambda x: get_url_length(str(x))) #Adding column to show URL length

In [16]:
url_data.head()

Unnamed: 0,url,type,url_len
0,br-icloud.com.br,1,16
1,mp3raid.com/music/krizz_kaliko.html,0,35
2,bopsecrets.org/rexroth/cr/1.htm,0,31
3,http://garage-pirenne.be/index.php?option=com_...,1,77
4,http://adventure-nicaragua.net/index.php?optio...,1,228


### Extracting top level domain

Extracting the top level domain from the url using the tld module

- 'as_object = true' instructs the function to return a parsed url object
- Cells ran with no error but domains are not found. This could be for multiple reasons. Most likely the domain does not exist on the internet anymore

def extract_tld(url):
    try:
        res = get_tld(url, as_object = True, fail_silenty = False, fix_protocol = True)
        pri_domain = res.parsed_url.netloc
    except:
        pri_domain = None
    return pri_domain

url_data['domain'] = url_data['url'].apply(lambda x: extract_tld(x))

url_data.head()

### Extracting character type and count

This involves a few functions that extract 3 different values of the url.

- Count of letters
- Count of digits
- Count of special characters
- Lastly, there will be a list of each of the special characters to identify which ones have been used and if they are suspicious

In [17]:
def count_letters(url):
    num_letters = sum(char.isalpha() for char in url)
    return num_letters

def count_digits(url):
    num_digits = sum(char.isdigit() for char in url)
    return num_digits

def count_special_chars(url):
    special_chars = set(string.punctuation)
    num_special_chars = sum(char in special_chars for char in url)
    return num_special_chars

In [18]:
url_data['letters_count']        = url_data['url'].apply(lambda x: count_letters(x))
url_data['digits_count']         = url_data['url'].apply(lambda x: count_digits(x))
url_data['special_chars_count']  = url_data['url'].apply(lambda x: count_special_chars(x))

In [19]:
feature = ['@','?','-','=','.','#','%','+','$','!','*',',','//']
for i in feature:
    url_data[i] = url_data['url'].apply(lambda x: x.count(i))

In [20]:
url_data.head()

Unnamed: 0,url,type,url_len,letters_count,digits_count,special_chars_count,@,?,-,=,.,#,%,+,$,!,*,",",//
0,br-icloud.com.br,1,16,13,0,3,0,0,1,0,2,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5,0,0,0,0,2,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5,0,0,0,0,2,0,0,0,0,0,0,0,0
3,http://garage-pirenne.be/index.php?option=com_...,1,77,60,7,17,0,1,1,4,2,0,0,0,0,0,0,0,1
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14,0,1,1,3,2,0,0,0,0,0,0,0,1


### Identifying shortening services

If a shortening service has been used then the next functions will attempt to identify if the url has one attributed

In [21]:
def has_shortening_service(url):
    pattern = re.compile(r'https?://(?:www\.)?(?:\w+\.)*(\w+)\.\w+')
    match = pattern.search(url)

    if match:
        domain = match.group(1)
        common_shortening_services = ['bit', 'goo', 'tinyurl', 'ow', 't', 'is',
                                      'cli', 'yfrog', 'migre', 'ff', 'url4', 'twit',
                                      'su', 'snipurl', 'short', 'BudURL', 'ping', 
                                      'post', 'Just', 'bkite', 'snipr', 'fic', 
                                      'loopt', 'doiop', 'short', 'kl', 'wp', 
                                      'rubyurl', 'om', 'to', 'bit', 't', 'lnkd', 
                                      'db', 'qr', 'adf', 'goo', 'bitly', 'cur', 
                                      'tinyurl', 'ow', 'bit', 'ity', 'q', 'is', 
                                      'po', 'bc', 'twitthis', 'u', 'j', 'buzurl', 
                                      'cutt', 'u', 'yourls', 'x', 'prettylinkpro', 
                                      'scrnch', 'filoops', 'vzturl', 'qr', '1url', 
                                      'tweez', 'v', 'tr', 'link', 'zip']
        
        if domain.lower() in common_shortening_services:
            return 1
    return 0

In [22]:
url_data['shortened']  = url_data['url'].apply(lambda x: has_shortening_service(x))

In [23]:
url_data.head()

Unnamed: 0,url,type,url_len,letters_count,digits_count,special_chars_count,@,?,-,=,.,#,%,+,$,!,*,",",//,shortened
0,br-icloud.com.br,1,16,13,0,3,0,0,1,0,2,0,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5,0,0,0,0,2,0,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5,0,0,0,0,2,0,0,0,0,0,0,0,0,0
3,http://garage-pirenne.be/index.php?option=com_...,1,77,60,7,17,0,1,1,4,2,0,0,0,0,0,0,0,1,0
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14,0,1,1,3,2,0,0,0,0,0,0,0,1,0


### Checking if network location appears in the url more than once

In [24]:
def abnormal_url(url):
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    if netloc:
        netloc = str(netloc)
        match = re.search(netloc, url)
        if match:
            return 1
        else:
            return 0
    return 0

In [25]:
url_data['abnormal_url']  = url_data['url'].apply(lambda x: abnormal_url(x))

In [26]:
url_data.head()

Unnamed: 0,url,type,url_len,letters_count,digits_count,special_chars_count,@,?,-,=,...,#,%,+,$,!,*,",",//,shortened,abnormal_url
0,br-icloud.com.br,1,16,13,0,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,http://garage-pirenne.be/index.php?option=com_...,1,77,60,7,17,0,1,1,4,...,0,0,0,0,0,0,0,1,0,1
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14,0,1,1,3,...,0,0,0,0,0,0,0,1,0,1


### Checking if there is a secure http protocol in the url

In [27]:
def secure_http(url):
    return int(urlparse(url).scheme == 'https')

In [28]:
url_data['secure_http']  = url_data['url'].apply(lambda x: secure_http(x))

In [29]:
url_data.head()

Unnamed: 0,url,type,url_len,letters_count,digits_count,special_chars_count,@,?,-,=,...,%,+,$,!,*,",",//,shortened,abnormal_url,secure_http
0,br-icloud.com.br,1,16,13,0,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,http://garage-pirenne.be/index.php?option=com_...,1,77,60,7,17,0,1,1,4,...,0,0,0,0,0,0,1,0,1,0
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14,0,1,1,3,...,0,0,0,0,0,0,1,0,1,0


### Checking if url has IP address

In [30]:
def contain_ip(url):
    try:
        parsed_url = urlparse(url)
        if parsed_url.hostname:
            ip = ipaddress.ip_address(parsed_url.hostname)
            return isinstance(ip, (ipaddress.IPv4Address, ipaddress.IPv6Address))
    except ValueError:
        pass
    return 0

In [31]:
url_data['have_ip'] = url_data['url'].apply(lambda x: contain_ip(x))

In [32]:
url_data.head()

Unnamed: 0,url,type,url_len,letters_count,digits_count,special_chars_count,@,?,-,=,...,+,$,!,*,",",//,shortened,abnormal_url,secure_http,have_ip
0,br-icloud.com.br,1,16,13,0,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,0,35,29,1,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,0,31,25,1,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,http://garage-pirenne.be/index.php?option=com_...,1,77,60,7,17,0,1,1,4,...,0,0,0,0,0,1,0,1,0,0
4,http://adventure-nicaragua.net/index.php?optio...,1,228,199,22,14,0,1,1,3,...,0,0,0,0,0,1,0,1,0,0


In [49]:
url_data.shape


(651191, 23)

In [51]:
url_data.isnull().sum()


url                    0
type                   0
url_len                0
letters_count          0
digits_count           0
special_chars_count    0
@                      0
?                      0
-                      0
=                      0
.                      0
#                      0
%                      0
+                      0
$                      0
!                      0
*                      0
,                      0
//                     0
shortened              0
abnormal_url           0
secure_http            0
have_ip                0
dtype: int64

In [54]:
X = url_data.drop(['url','type'],axis=1)#,'type_code'
y = url_data['type']

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [3]:
models = [Sequential,DecisionTreeClassifier,RandomForestClassifier,AdaBoostClassifier,KNeighborsClassifier,SGDClassifier,
         ExtraTreesClassifier,GaussianNB]
accuracy_test=[]
for m in models:
    print('#############################################')
    print('######-Model =>\033[07m {} \033[0m'.format(m))
    model_ = m()
    model_.fit(X_train, y_train)
    pred = model_.predict(X_test)
    acc = accuracy_score(pred, y_test)
    accuracy_test.append(acc)
    print('Test Accuracy :\033[32m \033[01m {:.2f}% \033[30m \033[0m'.format(acc*100))
    print('\033[01m              Classification_report \033[0m')
    print(classification_report(y_test, pred))
    print('\033[01m             Confusion_matrix \033[0m')
    cf_matrix = confusion_matrix(y_test, pred)
    plot_ = sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,fmt= '0.2%')
    plt.show()
    print('\033[31m###################- End -###################\033[0m')

#############################################
######-Model =>[07m <class 'keras.src.engine.sequential.Sequential'> [0m



NameError: name 'X_train' is not defined