# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import ipaddress as ip
import pickle

In [3]:
pd.options.display.max_columns = 50

# Importing Data and EDA

In [4]:
dataset = pd.read_csv('data_cleaned.csv')

In [5]:
dataset.head()

Unnamed: 0,url,label
0,http://diaryofagameaddict.com,1
1,http://espdesign.com.au,1
2,http://iamagameaddict.com,1
3,http://kalantzis.net,1
4,http://slightlyoffcenter.net,1


In [6]:
len(dataset)

420464

In [7]:
dataset['label'].value_counts()

0    344821
1     75643
Name: label, dtype: int64

# Feature Generartion

##### 01.  Long URL to Hide the Suspicious Part
Phishers can use long URL to hide the doubtful part in the address bar. For example: 
http://federmacedoadv.com.br/3f/aze/ab51e2e319e51502f416dbe46b773a5e/?cmd=_home&amp;dispatch=11004d58f5b74f8dc1e7c2e8dd4105e811004d58f5b74f8dc1e7c2e8dd4105e8@phishing.website.html

In [8]:
length = lambda x : len(x)

##### 02. Using the IP Address
If an IP address is used as an alternative of the domain name in the URL, such as “http://125.98.3.123/fake.html”, users can be sure that someone is trying to steal their personal information. Sometimes, the IP address is even transformed into hexadecimal code as shown in the following link “http://0x58.0xCC.0xCA.0x62/2/paypal.ca/index.html”. 

If the domain part has an IP address -> Phishing

Otherwise -> Legitimate

In [9]:
def isIp(x):
    try:
        if ip.ip_address(x):
            return 1
    except:
        return 0

##### 03. URL’s having “@” Symbol
Using “@” symbol in the URL leads the browser to ignore everything preceding the “@” symbol and the real address often follows the “@” symbol. 

In [10]:
countAt = lambda x : x.count('@')

##### 04. Redirecting using “//”
The existence of “//” within the URL path means that the user will be redirected to another website. An example of such URL’s is: “http://www.legitimate.com//http://www.phishing.com”. We examin the location where the “//” appears. We find that if the URL starts with “HTTP”, that means the “//” should appear in the sixth position. However, if the URL employs “HTTPS” then the “//” should appear in seventh position.


In [11]:
countDoubleSlash = lambda x : x.count('//')

##### 05. Adding Prefix or Suffix Separated by (-) to the Domain
The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel that they are dealing with a legitimate webpage. For example http://www.Confirme-paypal.com/.


In [12]:
countHyphen = lambda x : x.count('-')

##### 06. Sub Domain and Multi Sub Domains
Let us assume we have the following link: http://www.hud.ac.uk/students/. A domain name might include the country-code top-level domains (ccTLD), which in our example is “uk”. The “ac” part is shorthand for “academic”, the combined “ac.uk” is called a second-level domain (SLD) and “hud” is the actual name of the domain. To produce a rule for extracting this feature, we firstly have to omit the (www.) from the URL which is in fact a sub domain in itself. Then, we have to remove the (ccTLD) if it exists. Finally, we count the remaining dots. If the number of dots is greater than one, then the URL is classified as “Suspicious” since it has one sub domain. However, if the dots are greater than two, it is classified as “Phishing” since it will have multiple sub domains. Otherwise, if the URL has no sub domains, we will assign “Legitimate” to the feature. 


In [13]:
countDots = lambda x: x.count('.')

##### 07. num of delimeters:
[';','_','?','=','&']

In [14]:
def countDelimeters(x):
    count = 0
    for delim in [';','_','?','=','&']:
        count += x.count(delim)
    return count

##### 08. subdirectory count

In [15]:
countSubDirectory = lambda x : x.count('/')

##### 09. query count

In [16]:
def countQueries(x):
    if not x:
        return 0
    else:
        return len(x.split('&'))

##### Using URL Shortening Services “TinyURL”
URL shortening is a method on the “World Wide Web” in which a URL may be made considerably smaller in length and still lead to the required webpage. This is accomplished by means of an “HTTP Redirect” on a domain name that is short, which links to the webpage that has a long URL. For example, the URL “http://portal.hud.ac.uk/” can be shortened to “bit.ly/19DXSk4”.


##### Domain Registration Length
Based on the fact that a phishing website lives for a short period of time, we believe that trustworthy domains are regularly paid for several years in advance. In our dataset, we find that the longest fraudulent domains have been used for one year only. 


In [17]:
# https://www.whois.com/whois/

##### URL of Anchor
An anchor is an element defined by the 'a' tag. This feature is treated exactly as “Request URL”.

In [18]:
def generateFeatures(x):
    features = []
    features.append(length(x))
    features.append(isIp(x))
    features.append(countAt(x))
    features.append(countDoubleSlash(x))
    features.append(countHyphen(x))
    features.append(countDots(x))
    features.append(countDelimeters(x))
    features.append(countSubDirectory(x))
    features.append(countQueries(x))
    return features

In [19]:
def generateTokens(x):
    allTokens = []
    tokenSlash = x.split('/')
    for ts in tokenSlash:
        tokensHyphen = ts.split('-')
        for th in tokensHyphen:
            tokensDot = th.split('.')
            allTokens += tokensDot
    return list(set(allTokens))

In [20]:
featureset = pd.DataFrame(dataset['url'].copy())

In [21]:
featureset.head()

Unnamed: 0,url
0,http://diaryofagameaddict.com
1,http://espdesign.com.au
2,http://iamagameaddict.com
3,http://kalantzis.net
4,http://slightlyoffcenter.net


In [22]:
featureset['length'] = featureset['url'].apply(length)

In [23]:
featureset['isIp'] = featureset['url'].apply(isIp)

In [24]:
featureset['countAt'] = featureset['url'].apply(countAt)

In [25]:
featureset['countDoubleSlash'] = featureset['url'].apply(countDoubleSlash)

In [26]:
featureset['countHyphen'] = featureset['url'].apply(countHyphen)

In [27]:
featureset['countDots'] = featureset['url'].apply(countDots)

In [28]:
featureset['countDelimeters'] = featureset['url'].apply(countDelimeters)

In [29]:
featureset['countSubDirectory'] = featureset['url'].apply(countSubDirectory)

In [30]:
featureset['countQueries'] = featureset['url'].apply(countQueries)

In [31]:
featureset.head()

Unnamed: 0,url,length,isIp,countAt,countDoubleSlash,countHyphen,countDots,countDelimeters,countSubDirectory,countQueries
0,http://diaryofagameaddict.com,29,0,0,1,0,1,0,2,1
1,http://espdesign.com.au,23,0,0,1,0,2,0,2,1
2,http://iamagameaddict.com,25,0,0,1,0,1,0,2,1
3,http://kalantzis.net,20,0,0,1,0,1,0,2,1
4,http://slightlyoffcenter.net,28,0,0,1,0,1,0,2,1


In [32]:
X = featureset.drop(['url'], axis=1)
y = dataset['label']

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y)

# Machine Learning

## Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
LR1 = LogisticRegression()
LR1.fit(X_train, y_train)
print(LR1.score(X_eval, y_eval))

final_LR1 = LogisticRegression()
final_LR1.fit(X, y)

pickle.dump(final_LR1, open('LogisticRegression1.pickle', 'wb'))

0.830387381559


## Random Forests

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
RF1 = RandomForestClassifier()
RF1.fit(X_train, y_train)
print(RF1.score(X_eval, y_eval))

final_RF1 = RandomForestClassifier()
final_RF1.fit(X, y)

pickle.dump(final_RF1, open('RandomForestClassifier1.pickle', 'wb'))

0.858327942464


# Testing

In [39]:
from sklearn.metrics import classification_report, confusion_matrix

In [40]:
test = pd.read_csv('test.csv')

In [41]:
test.head()

Unnamed: 0,URL,Label
0,http://shagali.com/shiro/437594df261a1e7222745...,1
1,http://https-security.esy.es/recovery-chekpoin...,1
2,http://www.netflix-notifications.com/,1
3,http://sec-verifyfb-login-fbpages.esy.es/recov...,1
4,http://recever0076.esy.es/recovery-chekpoint-l...,1


In [42]:
len(test)

7030

In [43]:
X_test = pd.DataFrame(columns=featureset.columns.drop('url'))

In [44]:
X_test.head()

Unnamed: 0,length,isIp,countAt,countDoubleSlash,countHyphen,countDots,countDelimeters,countSubDirectory,countQueries


In [45]:
for i in range(len(test)):
    features = generateFeatures(test['URL'].loc[i])    
    X_test.loc[i] = features

In [46]:
X_test.head()

Unnamed: 0,length,isIp,countAt,countDoubleSlash,countHyphen,countDots,countDelimeters,countSubDirectory,countQueries
0,205,0,0,1,0,2,16,11,4
1,58,0,0,1,3,3,0,3,1
2,37,0,0,1,1,2,0,3,1
3,70,0,0,1,5,3,0,3,1
4,55,0,0,1,2,3,0,3,1


In [47]:
y_test = test['Label']

## Logistic Regression

In [48]:
y_pred = final_LR1.predict(X_test)

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

[[3454   40]
 [2912  624]]
             precision    recall  f1-score   support

          0       0.54      0.99      0.70      3494
          1       0.94      0.18      0.30      3536

avg / total       0.74      0.58      0.50      7030



In [49]:
y_pred = final_RF1.predict(X_test)

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

[[3251  243]
 [2461 1075]]
             precision    recall  f1-score   support

          0       0.57      0.93      0.71      3494
          1       0.82      0.30      0.44      3536

avg / total       0.69      0.62      0.57      7030

