In [311]:
import numpy as np
import pandas as pd

In [312]:
#Helper Functions

def _printHelper(heading, msg):
    """Print ``msg`` under ``heading``, framed above and below by a rule of ``=``."""
    rule = "============================================="
    print(rule)
    # The heading carries its own trailing newline, so one blank line
    # separates it from the message.
    print(heading + "\n", msg, sep="\n")
    print(rule)


def printError(error):
    """Show ``error`` in the framed layout under the fixed heading "ERROR"."""
    _printHelper(heading="ERROR", msg=error)
    


In [313]:
# Load the phishing dataset from its CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="phishing_data.csv")

In [314]:
# Show every column label, followed by the total number of columns
print(df.columns, len(df.columns), sep="\n")

Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 

In [315]:
# Preview the first 10 rows of the data
df.head(n=10)

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.progarchives.com/album.asp?id=61737,46,20,zero,3,zero,0,1,0,0,...,1,one,0,627,6678,78526,0,0,5,phishing
1,http://signin.eday.co.uk.ws.edayisapi.dllsign....,128,120,0,10,0,0,0,0,0,...,1,zero,0,300,65,0,0,1,0,phishing
2,http://www.avevaconstruction.com/blesstool/ima...,52,25,0,3,0,0,0,0,0,...,1,zero,0,119,1707,0,0,1,0,phishing
3,http://www.jp519.com/,21,13,0,2,0,0,0,0,0,...,1,one,0,130,1331,0,0,0,0,legitimate
4,https://www.velocidrone.com/,28,19,0,2,0,0,0,0,0,...,0,zero,0,164,1662,312044,0,0,4,legitimate
5,https://support-appleld.com.secureupdate.duila...,128,50,1,4,1,0,1,2,0,...,1,one,0,25,3993,5707171,0,1,0,phishing
6,https://www.authpro.com/auth/ubabankng/?action...,50,15,0,2,0,0,1,0,0,...,1,zero,0,705,7330,154708,0,0,4,phishing
7,http://littlee.com.au/alibaba/login.alibaba.co...,51,14,0,5,0,0,0,0,0,...,1,zero,1,0,-1,0,0,1,0,phishing
8,http://www.tutorialspoint.com/dbms/,35,22,0,2,0,0,0,0,0,...,0,one,0,67,5046,379,0,0,5,legitimate
9,http://www.domarada.sk,22,15,0,2,0,0,0,0,0,...,1,one,0,148,3505,0,0,0,2,legitimate


Understanding the **length_url** and **length_hostname** columns and what they represent

**length_url** gives us the length of the full URL, in characters

In [316]:
# Character count of the first row's URL; the 46 it yields equals that row's length_url
len("http://www.progarchives.com/album.asp?id=61737")

46

**length_hostname** gives us the length of the hostname, in characters

The hostname is the part of the URL that gives the address of the server (for example, `www.progarchives.com`)

In [317]:
# Character count of the same URL's hostname; the 20 it yields equals that row's length_hostname
len("www.progarchives.com")

20

### Cleaning up all features that count the number of times a character or token appears

Making sure all the elements inside these columns are **integers**

In [318]:
def _reportColumnProblem(columnName, message):
    """Print ``message`` via printError beneath a header that names the column."""
    print()
    print("For Column: " + columnName)
    print("============\n")
    printError(message)
    print()


def cleanUpOneZero(columnName, frame=None):
    """Convert one column to numbers and drop it when it carries no information.

    Some cells in the raw data spell their value as the word "zero" or "one";
    those are replaced by 0 and 1 before the whole column is converted with
    ``pd.to_numeric``.  The converted column is written back with a single
    column assignment (not element-by-element chained indexing), so the
    update also takes effect when pandas copy-on-write is enabled.

    If any non-null value cannot be read as a whole number, the first such
    value is reported, the column is left exactly as it was, and the function
    returns without raising.

    After a successful conversion the column is deleted when every non-null
    record holds the same value, since such a feature cannot distinguish
    records.

    Parameters
    ----------
    columnName : str
        Label of the column to clean.
    frame : pandas.DataFrame, optional
        DataFrame to modify in place.  Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The DataFrame is modified in place.

    Notes
    -----
    Null values are reported but kept; a column that contains them ends up
    with a float dtype because NaN cannot be stored in an integer column.
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook-level DataFrame.
        frame = df

    column = frame[columnName]

    if column.isnull().values.any():
        printError("There are some null values")

    # Only string cells are looked up, so numeric cells pass through untouched.
    wordToDigit = {"zero": 0, "one": 1}
    replaced = column.map(
        lambda value: wordToDigit.get(value, value) if isinstance(value, str) else value
    )

    # errors="coerce" turns unreadable cells into NaN instead of raising, which
    # lets us tell them apart from cells that were null to begin with.
    numeric = pd.to_numeric(replaced, errors="coerce")
    unparsed = numeric.isna() & replaced.notna()
    fractional = numeric.notna() & (numeric % 1 != 0)
    rejected = replaced[unparsed | fractional]

    if len(rejected) > 0:
        _reportColumnProblem(
            columnName,
            'Never account for the number "' + str(rejected.iloc[0]) + '"',
        )
        _reportColumnProblem(
            columnName,
            "Failed to convert column (" + columnName + ") to integers",
        )
        # Leave the column untouched rather than half-cleaned.
        return

    frame[columnName] = numeric

    # A column whose non-null values are all identical is constant.
    if numeric.nunique(dropna=True) <= 1:
        del frame[columnName]
        print()
        print("Deleting " + columnName + " as there is only one number for the feature")
        print("(i.e.) There is only one value for all the records")
        print()

In [319]:
# Columns to run through cleanUpOneZero
nbLst = [
    'length_url', 'length_hostname', 'ip',
    'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq',
    'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star',
    'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space',
    'nb_www', 'nb_com', 'nb_dslash', 'http_in_path',
]

for nb_feature in nbLst:
    # cleanUpOneZero deletes columns it finds uninformative, so on a second
    # run of this cell some names are no longer present; skip them instead of
    # failing with KeyError.
    if nb_feature not in df.columns:
        print("Skipping " + nb_feature + " as it is no longer in the DataFrame")
        continue
    cleanUpOneZero(nb_feature)



Deleting nb_or as there is only one number for the feature
(i.e.) There is only one value for all the records



### Deletes

Deleting the column "abnormal_subdomain": its definition is very abstract, and I don't know how the dataset authors classified it.

In [343]:
# errors="ignore" keeps this cell re-runnable once the column is already gone
df = df.drop(columns=["abnormal_subdomain"], errors="ignore")

In [323]:
# Show the column labels that remain, followed by how many there are
print(df.columns, len(df.columns), sep="\n")


Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_eq', 'nb_underscore', 'nb_tilde',
       'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 'nb_hyper

In [351]:
colName = "random_domain"

# Pair every URL with the value of the column under inspection
dfNew = df[["url", colName]]

# First five rows, then up to 15 rows where the flag is set and up to 15
# rows where it is zero
print(dfNew.head())
print(dfNew.loc[dfNew[colName] > 0].head(15))
print(dfNew.loc[dfNew[colName] == 0].head(15))

# Full, untruncated URLs of a few rows, each preceded by a blank line
for rowPosition in (0, 3, 21, 56):
    print()
    print(df.iat[rowPosition, 0])


                                                 url  random_domain
0     http://www.progarchives.com/album.asp?id=61737              0
1  http://signin.eday.co.uk.ws.edayisapi.dllsign....              0
2  http://www.avevaconstruction.com/blesstool/ima...              0
3                              http://www.jp519.com/              1
4                       https://www.velocidrone.com/              0
                                                   url  random_domain
3                                http://www.jp519.com/              1
21       http://kam-net.ci/2026584619/verification.php              1
56   http://www.ijtte.com/uploads/2014-09-07/5d577a...              1
59                https://kodi.tv/addons/context-menus              1
76   http://ijikc.co.in/rating/KEN7XX7Y0P9/wemail_a...              1
102  http://scqijie.com/wp-content/themes/chigue/im...              1
104         https://for-sale.yakaz.com/digital-cameras              1
108  http://teamwlg.com/aa209Pag

In [352]:
# Fraction of the first URL's characters that are decimal digits.
# '0' has to be in the digit list: leaving it out under-counts every URL
# that contains a zero.
nmbLst = list("0123456789")

firstUrl = df.iat[0, 0]
numbCount = sum(1 for char in firstUrl if char in nmbLst)

# An empty URL has no characters to divide by; report 0.0 instead of raising
# ZeroDivisionError.
print(numbCount / len(firstUrl) if firstUrl else 0.0)

0.10869565217391304


In [None]:
# Feature names and, by position, a description of each one: entry i of
# `description` describes entry i of `toAdd`.  Empty strings are placeholders
# for features that have not been written up yet.
toAdd = [
    'https_token',
    'ratio_digits_url',
    'punycode',
    'port',
    'tld_in_path',
    'tld_in_subdomain',
    'nb_subdomains',
    'prefix_suffix',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
]

description = [
    'Is it https protocol? If so, it will be 0. Else it is the http protocol, 1',
    # Digits over length, matching the digit-ratio computation in this notebook
    'The number of digits in the URL divided by the length of the URL',
    'Is the URL in ASCII or is it encoded in punnycode as an internationalized domain names (IDN)',
    'Does the URL contain a port address',
    'Is there a Top Level Domain in the path of the address (refer to the document image)',
    'Is there a Top Level Domain in the subdomain of the address',
    'Number of subdomains in the URL',
    'If there is either a prefix or a suffix somewhere in the URL, 1 for True, 0 for False',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
]

# The lists are positional pairs, so a length mismatch would silently
# misalign or drop entries; fail loudly instead.
assert len(toAdd) == len(description)