# Import Lib

In [9]:
import math
import matplotlib.pyplot as plt
import datetime as dt
import os
import pandas as pd
os.chdir('D:\Pandas')
raw_data = pd.read_csv("100-legitimate-art.txt") #loading only 100 samples (art websites data)
display(raw_data.head(5))


Unnamed: 0,websites
0,http://www.emuck.com:3000/archive/egan.html
1,http://danoday.com/summit.shtml
2,http://groups.yahoo.com/group/voice_actor_appr...
3,http://voice-international.com/
4,http://www.livinglegendsltd.com/


In [10]:
raw_data['websites'].str.split("://").head() #Here we divided the protocol from the entire URL. but need it to be divided it 
                                                 #seperate column

0         [http, www.emuck.com:3000/archive/egan.html]
1                     [http, danoday.com/summit.shtml]
2    [http, groups.yahoo.com/group/voice_actor_appr...
3                     [http, voice-international.com/]
4                    [http, www.livinglegendsltd.com/]
Name: websites, dtype: object

In [13]:
seperation_of_protocol = raw_data['websites'].str.split("://",expand = True) 
#expand argument in the split method will give you a new column
seperation_of_protocol.head()

Unnamed: 0,0,1
0,http,www.emuck.com:3000/archive/egan.html
1,http,danoday.com/summit.shtml
2,http,groups.yahoo.com/group/voice_actor_appreciatio...
3,http,voice-international.com/
4,http,www.livinglegendsltd.com/


In [19]:
type(seperation_of_protocol)
seperation_domain_name = seperation_of_protocol[1].str.split("/",1,expand = True)
#split(seperator,no of splits according to seperator(delimiter),expand)
type(seperation_domain_name)

pandas.core.frame.DataFrame

In [20]:
seperation_domain_name.columns=["domain_name","address"] #renaming columns of data frame
seperation_domain_name.head()

Unnamed: 0,domain_name,address
0,www.emuck.com:3000,archive/egan.html
1,danoday.com,summit.shtml
2,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,voice-international.com,
4,www.livinglegendsltd.com,


In [21]:
splitted_data = pd.concat([seperation_of_protocol[0],seperation_domain_name],axis=1)
splitted_data.columns = ['protocol','domain_name','address']
splitted_data.head()

Unnamed: 0,protocol,domain_name,address
0,http,www.emuck.com:3000,archive/egan.html
1,http,danoday.com,summit.shtml
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,http,voice-international.com,
4,http,www.livinglegendsltd.com,


# Features Extraction


If the length of the URL is greater than or equal 54 characters then the URL classified as phishing

0 --- indicates legitimate

1 --- indicates Phishing

2 --- indicates Suspicious

In [22]:
def long_url(l):
    """This function is defined in order to differntiate website based on the length of the URL"""
    if len(l) < 54:
        return 0
    elif len(l) >= 54 and len(l) <= 75:
        return 2
    return 1

#Applying the above defined function in order to divide the websites into 3 categories
splitted_data['long_url'] = raw_data['websites'].apply(long_url)

#Will show the results only the websites which are legitimate according to above condition as 0 is legitimate website
splitted_data[splitted_data.long_url == 0].head()

Unnamed: 0,protocol,domain_name,address,long_url
0,http,www.emuck.com:3000,archive/egan.html,0
1,http,danoday.com,summit.shtml,0
3,http,voice-international.com,,0
4,http,www.livinglegendsltd.com,,0
5,http,voicechasers.com,forum/viewforum.php?f=8,0


# Feature-2

'''URL’s having “@” Symbol
Using “@” symbol in the URL leads the browser to ignore everything preceding the “@” symbol and the real address often follows the “@” symbol.

IF {Url Having @ Symbol→ Phishing Otherwise→ Legitimate }

0 --- indicates legitimate

1 --- indicates Phishing'''

In [30]:
def have_at_symbol(l):
    """This function is used to check whether the URL contains @ symbol or not"""
    if "@" in l:
        return 1
    return 0

splitted_data['having_@_symbol'] = raw_data['websites'].apply(have_at_symbol)

splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol
0,http,www.emuck.com:3000,archive/egan.html,0,0
1,http,danoday.com,summit.shtml,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0
3,http,voice-international.com,,0,0
4,http,www.livinglegendsltd.com,,0,0


# Feature 3
IF {ThePosition of the Last Occurrence of "//" in the URL > 7→ Phishing

Otherwise→ Legitimate

0 --- indicates legitimate

1 --- indicates Phishing

In [31]:
def redirection(l):
    """If the url has symbol(//) after protocol then such URL is to be classified as phishing """
    if "//" in l:
        return 1
    return 0
splitted_data['redirection_//_symbol'] = seperation_of_protocol[1].apply(redirection)
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol
0,http,www.emuck.com:3000,archive/egan.html,0,0,0
1,http,danoday.com,summit.shtml,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0
3,http,voice-international.com,,0,0,0
4,http,www.livinglegendsltd.com,,0,0,0


# Feature - 4
IF {Domain Name Part Includes (−) Symbol → Phishing

Otherwise → Legitimate

1 --> indicates phishing

0 --> indicates legitimate

In [32]:
def prefix_suffix_seperation(l):
    if '-' in l:
        return 1
    return 0

splitted_data['prefix_suffix_seperation'] = seperation_domain_name['domain_name'].apply(prefix_suffix_seperation)

splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0
3,http,voice-international.com,,0,0,0,1
4,http,www.livinglegendsltd.com,,0,0,0,0


# Feature - 5

Sub-Domain and Multi Sub-Domains
The legitimate URL link has two dots in the URL since we can ignore typing “www.”. If the number of dots is equal to three then the URL is classified as “Suspicious” since it has one sub-domain. However, if the dots are greater than three it is classified as “Phishy” since it will have multiple sub-domains

0 --- indicates legitimate

1 --- indicates Phishing

2 --- indicates Suspicious

In [33]:
def sub_domains(l):
    if l.count('.') < 3:
        return 0
    elif l.count('.') == 3:
        return 2
    return 1
splitted_data['sub_domains'] = splitted_data['domain_name'].apply(sub_domains)
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0


# Classification of URLs using Random forest

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
#Features
x = splitted_data.columns[4:9]
x

Index(['having_@_symbol', 'redirection_//_symbol', 'prefix_suffix_seperation',
       'sub_domains'],
      dtype='object')

In [70]:
#variable to be predicted; yes = 0 and no = 1
y = pd.factorize(splitted_data['long_url'])[0]
y

array([0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2,
       0, 0, 0, 2, 2, 2, 1, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
       0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 0, 2, 0], dtype=int32)

In [71]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_estimators=100,n_jobs=2,random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(splitted_data[x], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [72]:
test_data = pd.read_csv("test_data.csv")
clf.predict(test_data[x])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [73]:
clf.predict_proba(test_data[x])[0:10] #predicted probability for each class.

array([[0.81918506, 0.        , 0.18081494],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273],
       [0.75011684, 0.03610044, 0.21378273]])

# Evaluating classifier

In [74]:
preds = test_data.long_url[clf.predict(test_data[x])] #predicted values
preds.head(10)

0    1
0    1
0    1
0    1
0    1
0    1
0    1
0    1
0    1
0    1
Name: long_url, dtype: int64

In [75]:
actual = pd.Series(test_data['long_url']) #actual values
confusion_matrix(actual,preds)

array([[  0, 150,   0],
       [  0, 165,   0],
       [  0,  75,   0]], dtype=int64)

In [76]:
accuracy_score(actual,preds) 

0.4230769230769231

In [77]:
#importance of features
list(zip(splitted_data[x], clf.feature_importances_))

[('having_@_symbol', 0.0),
 ('redirection_//_symbol', 0.3069906291304721),
 ('prefix_suffix_seperation', 0.1381889737932768),
 ('sub_domains', 0.5548203970762511)]