In [87]:
import numpy as np
import pandas as pd
from random import randint

In [7]:
dataset = pd.read_csv("malicious_phish.csv")

In [8]:
dataset.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [9]:
dataset.tail()

Unnamed: 0,url,type
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign
60429,myspace.com/totimoshi,benign
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign
60431,en.wikipedia.org/wiki/Frank_Bonner,benign
60432,http://www.ttc-fuchsstadt.de/index.php?view=ca...,


In [10]:
print("Dataset shape:", dataset.shape)

Dataset shape: (60433, 2)


In [11]:
dataset.keys()

Index(['url', 'type'], dtype='object')

In [12]:
dataset['type'].value_counts()

benign        44339
defacement    11018
phishing       3611
malware        1464
Name: type, dtype: int64

In [13]:
dataset['url'].value_counts()

http://style.org.hc360.com/css/detail/mysite/siteconfig/pro_control.css                               20
http://www.pubyun.com/wiki/lib/exe/css.php?t=dokuwiki&tseed=f1f16e8c6142fab8553f2b9cfe045961          20
http://9779.info/%E6%A0%91%E5%8F%B6%E8%B4%B4%E7%94%BB%E4%BD%9C%E5%93%81/                              13
http://www.js182.com/app/member/SportOption.php?uid=guest&langx=gb                                    13
http://www.824555.com/app/member/SportOption.php?uid=guest&langx=gb                                   12
                                                                                                      ..
http://www.sonomaster.com.br/novo/index.php/sono-master/novidades/40-dicas-de-bem-estar                1
dipity.com/timeline/Wally_Albright/                                                                    1
http://www.bruecke-erding.de/index.php?option=com_content&view=article&id=61&Itemid=59                 1
deborahjross.blogspot.com/                             

In [14]:
n_samples, n_features = dataset.shape
print('Number of samples:', n_samples)
print('Number of features:', n_features)

Number of samples: 60433
Number of features: 2


In [15]:
dataset.isnull().sum()

url     0
type    1
dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
dataset["class_url"] = lb_make.fit_transform(dataset["type"])
print(dataset["class_url"].value_counts())
print(dataset["type"].value_counts())

0    44339
1    11018
3     3611
2     1464
4        1
Name: class_url, dtype: int64
benign        44339
defacement    11018
phishing       3611
malware        1464
Name: type, dtype: int64


In [17]:
print(dataset)

                                                     url        type  \
0                                       br-icloud.com.br    phishing   
1                    mp3raid.com/music/krizz_kaliko.html      benign   
2                        bopsecrets.org/rexroth/cr/1.htm      benign   
3      http://www.garage-pirenne.be/index.php?option=...  defacement   
4      http://adventure-nicaragua.net/index.php?optio...  defacement   
...                                                  ...         ...   
60428  melomanodiscos.blogspot.com/2009/07/dana-valer...      benign   
60429                              myspace.com/totimoshi      benign   
60430  zharena.multiply.com/photos/album/35/Arashis_a...      benign   
60431                 en.wikipedia.org/wiki/Frank_Bonner      benign   
60432  http://www.ttc-fuchsstadt.de/index.php?view=ca...         NaN   

       class_url  
0              3  
1              0  
2              0  
3              1  
4              1  
...          ...  
60

In [18]:
dataset['url'] = dataset['url'].replace('www.', '', regex=True)
print(dataset)

                                                     url        type  \
0                                       br-icloud.com.br    phishing   
1                    mp3raid.com/music/krizz_kaliko.html      benign   
2                        bopsecrets.org/rexroth/cr/1.htm      benign   
3      http://garage-pirenne.be/index.php?option=com_...  defacement   
4      http://adventure-nicaragua.net/index.php?optio...  defacement   
...                                                  ...         ...   
60428  melomanodiscos.blogspot.com/2009/07/dana-valer...      benign   
60429                              myspace.com/totimoshi      benign   
60430  zharena.multiply.com/photos/album/35/Arashis_a...      benign   
60431                 en.wikipedia.org/wiki/Frank_Bonner      benign   
60432  http://ttc-fuchsstadt.de/index.php?view=catego...         NaN   

       class_url  
0              3  
1              0  
2              0  
3              1  
4              1  
...          ...  
60

In [19]:
dataset['url_len'] = dataset['url'].apply(lambda x: len(str(x)))

dataset

Unnamed: 0,url,type,class_url,url_len
0,br-icloud.com.br,phishing,3,16
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235
...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78
60429,myspace.com/totimoshi,benign,0,21
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34


In [20]:
!pip install tld
from tld import get_tld, is_tld
def process_tld(url):
    try:
        res = get_tld(url, as_object=True, fail_silently=False, fix_protocol=True)
        domain = res.parsed_url.netloc
    except :
        domain = None
    return domain

dataset['domain'] = dataset['url'].apply(lambda i: process_tld(i))

dataset



Unnamed: 0,url,type,class_url,url_len,domain
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net
...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com
60429,myspace.com/totimoshi,benign,0,21,myspace.com
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org


In [21]:
from urllib.parse import urlparse
import re
def abnormal_url(url):
    hostname = urlparse(url).hostname
    hostname = str(hostname)
    match = re.search(hostname, url)
    if match:
        # print match.group()
        return 1
    else:
        # print 'No matching pattern found'
        return 0
dataset['abnormal_url'] = dataset['url'].apply(lambda i: abnormal_url(i))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1
...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0


In [22]:
import re
#Use of IP or not in domain
def having_ip_address(url: str) -> int:
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)  # Ipv6
    if match:
        # print match.group()
        return 1
    else:
        # print 'No matching pattern found'
        return 0
dataset['use_of_ip_address'] = dataset['url'].apply(lambda i: having_ip_address(i))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0
...,...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0,0
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0,0
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0,0
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0,0


In [23]:
dataset['use_of_ip_address'].value_counts()

0    60402
1       31
Name: use_of_ip_address, dtype: int64

In [24]:
features = ['@','?','-','=','.','#','%','+','$','!','*',',','//']

for a in features:
    dataset[a] = dataset['url'].apply(lambda i: i.count(a))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,=,.,#,%,+,$,!,*,",",//
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,4,2,0,0,0,0,0,0,0,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,3,2,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0,0,0,0,6,0,3,0,0,0,0,0,0,0,0
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0


In [25]:
def sum_count_special_characters(url: str) -> int:
    special_chars = ['@','?','-','=','.','#','%','+','$','!','*',',','//']

    num_special_chars = sum(char in special_chars for char in url)
    return num_special_chars
dataset['sum_count_special_chars'] = dataset['url'].apply(lambda x: sum_count_special_characters(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,.,#,%,+,$,!,*,",",//,sum_count_special_chars
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,2,0,0,0,0,0,0,0,0,3
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,2
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,2
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,2,0,0,0,0,0,0,0,1,8
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,2,0,0,0,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0,0,0,0,6,...,3,0,0,0,0,0,0,0,0,9
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,2
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,2


In [26]:


def httpSecured(url: str) -> int:
    htp = urlparse(url).scheme
    match = str(htp)
    if match == 'https':
        # print match.group()
        return 1
    else:
        # print 'No matching pattern found'
        return 0
dataset['https'] = dataset['url'].apply(lambda x: httpSecured(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,#,%,+,$,!,*,",",//,sum_count_special_chars,https
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,0,0,0,1,8,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,0,0,0,1,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0,0,0,0,6,...,0,0,0,0,0,0,0,0,9,0
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0


In [27]:
def letter_count(url: str) -> int:
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters
dataset['letters'] = dataset['url'].apply(lambda x: letter_count(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,%,+,$,!,*,",",//,sum_count_special_chars,https,letters
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,0,0,3,0,13
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,29
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,25
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,0,0,1,8,0,60
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,0,0,1,7,0,199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0,0,0,0,6,...,0,0,0,0,0,0,0,9,0,56
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,19
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,44
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,29


In [28]:
def Shortining_Service(url):
    match = re.search(
                      'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0
dataset['Shortining_Service'] = dataset['url'].apply(lambda x: Shortining_Service(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,+,$,!,*,",",//,sum_count_special_chars,https,letters,Shortining_Service
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,0,3,0,13,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,0,2,0,29,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,0,2,0,25,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,0,1,8,0,60,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,0,1,7,0,199,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0,0,0,0,6,...,0,0,0,0,0,0,9,0,56,1
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0,0,0,0,0,...,0,0,0,0,0,0,1,0,19,0
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0,0,0,0,0,...,0,0,0,0,0,0,2,0,44,0
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0,0,0,0,0,...,0,0,0,0,0,0,2,0,29,0


In [29]:
!pip install googlesearch-python

Collecting googlesearch-python
  Downloading googlesearch-python-1.2.3.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlesearch-python
  Building wheel for googlesearch-python (setup.py) ... [?25l[?25hdone
  Created wheel for googlesearch-python: filename=googlesearch_python-1.2.3-py3-none-any.whl size=4209 sha256=4029d5f3935232c048320b3fc60ea6f33da97e7697c60fcb4baef9a32ebcd4a5
  Stored in directory: /root/.cache/pip/wheels/98/24/e9/6c225502948c629b01cc895f86406819281ef0da385f3eb669
Successfully built googlesearch-python
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.2.3


In [30]:
from googlesearch import search
def google_index(url):
    site = search(url, 5)
    return 1 if site else 0
dataset['google_index'] = dataset['url'].apply(lambda i: google_index(i))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,$,!,*,",",//,sum_count_special_chars,https,letters,Shortining_Service,google_index
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,3,0,13,0,1
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,2,0,29,0,1
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,2,0,25,0,1
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,1,8,0,60,0,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,1,7,0,199,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60428,melomanodiscos.blogspot.com/2009/07/dana-valer...,benign,0,78,melomanodiscos.blogspot.com,0,0,0,0,6,...,0,0,0,0,0,9,0,56,1,1
60429,myspace.com/totimoshi,benign,0,21,myspace.com,0,0,0,0,0,...,0,0,0,0,0,1,0,19,0,1
60430,zharena.multiply.com/photos/album/35/Arashis_a...,benign,0,54,zharena.multiply.com,0,0,0,0,0,...,0,0,0,0,0,2,0,44,0,1
60431,en.wikipedia.org/wiki/Frank_Bonner,benign,0,34,en.wikipedia.org,0,0,0,0,0,...,0,0,0,0,0,2,0,29,0,1


In [31]:
dataset.isnull().sum()

url                         0
type                        1
class_url                   0
url_len                     0
domain                     25
abnormal_url                0
use_of_ip_address           0
@                           0
?                           0
-                           0
=                           0
.                           0
#                           0
%                           0
+                           0
$                           0
!                           0
*                           0
,                           0
//                          0
sum_count_special_chars     0
https                       0
letters                     0
Shortining_Service          0
google_index                0
dtype: int64

In [32]:
dataset.shape

(60433, 25)

In [33]:
X = dataset.drop(['url','type','class_url','domain','@','?','-','=','.','#','%','+','$','!','*',',','//'],axis=1)
y = dataset['class_url']

In [34]:
X

Unnamed: 0,url_len,abnormal_url,use_of_ip_address,sum_count_special_chars,https,letters,Shortining_Service,google_index
0,16,0,0,3,0,13,0,1
1,35,0,0,2,0,29,0,1
2,31,0,0,2,0,25,0,1
3,84,1,0,8,0,60,0,1
4,235,1,0,7,0,199,0,1
...,...,...,...,...,...,...,...,...
60428,78,0,0,9,0,56,1,1
60429,21,0,0,1,0,19,0,1
60430,54,0,0,2,0,44,0,1
60431,34,0,0,2,0,29,0,1


In [35]:
y

0        3
1        0
2        0
3        1
4        1
        ..
60428    0
60429    0
60430    0
60431    0
60432    4
Name: class_url, Length: 60433, dtype: int64

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,shuffle=True, random_state=5)

In [37]:
print(f"X_train Shape : {X_train.shape}")
print(f"Y_train Shape : {y_train.shape}")
print(f"X_test  Shape : {X_test.shape}")
print(f"Y_test  Shape : {y_test.shape}")

X_train Shape : (48346, 8)
Y_train Shape : (48346,)
X_test  Shape : (12087, 8)
Y_test  Shape : (12087,)


In [38]:
from sklearn.linear_model import LogisticRegression

classifier1 = LogisticRegression(max_iter=1000, random_state=0)
classifier1.fit(X_train.values, y_train.values)
y_pred1 = classifier1.predict(X_test.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_test, y_pred1)
print(cm1)

[[8395  472   21    0]
 [ 126 2047    4    0]
 [  40  137   90    0]
 [ 544  207    4    0]]


In [40]:
from sklearn.metrics import accuracy_score

accuracy1 = accuracy_score(y_test, y_pred1)
print("Accuracy1:", accuracy1)

Accuracy1: 0.8713493836353107


In [41]:
from sklearn.neighbors import KNeighborsClassifier

classifier2 = KNeighborsClassifier(metric='euclidean')
classifier2.fit(X_train.values, y_train.values)
y_pred2 = classifier2.predict(X_test.values)

In [42]:
from sklearn.metrics import confusion_matrix

cm2 = confusion_matrix(y_test, y_pred2)
print(cm2)

[[8580  272   22   14]
 [ 310 1838   17   12]
 [  44   42  163   18]
 [ 201   96    6  452]]


In [43]:
from sklearn.metrics import accuracy_score

accuracy2 = accuracy_score(y_test, y_pred2)
print("Accuracy2:", accuracy2)

Accuracy2: 0.9127988748241913


In [44]:
from sklearn.naive_bayes import GaussianNB

classifier3 = GaussianNB()
classifier3.fit(X_train.values, y_train.values)
y_pred3 = classifier3.predict(X_test.values)

In [45]:
cm3 = confusion_matrix(y_test, y_pred3)
print(cm3)

[[8056  717  112    2    1]
 [   0 2177    0    0    0]
 [  27  235    5    0    0]
 [ 515  231    4    5    0]
 [   0    0    0    0    0]]


In [46]:
accuracy3 = accuracy_score(y_test, y_pred3)
print("Accuracy3:", accuracy3)

Accuracy3: 0.8474393977000083


In [47]:
from sklearn.tree import DecisionTreeClassifier

classifier4 = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
classifier4.fit(X_train.values, y_train.values)
y_pred4 = classifier4.predict(X_test.values)

In [48]:
cm4 = confusion_matrix(y_test, y_pred4)
print(cm4)

[[8684  144   26   34]
 [ 168 1922   33   54]
 [  22   35  184   26]
 [ 127  107   17  504]]


In [49]:
from sklearn.metrics import accuracy_score

accuracy4 = accuracy_score(y_test, y_pred4)
print("Accuracy4:", accuracy4)

Accuracy4: 0.9343923223297758


In [50]:
from sklearn.ensemble import RandomForestClassifier

classifier5 = RandomForestClassifier(n_estimators=100, max_features='sqrt')
classifier5.fit(X_train.values, y_train.values)
y_pred5 = classifier5.predict(X_test.values)

In [51]:
cm5 = confusion_matrix(y_test, y_pred5)
print(cm5)

[[8710  149    6   23]
 [ 128 1990   19   40]
 [  22   37  188   20]
 [ 131  110   10  504]]


In [52]:
accuracy5 = accuracy_score(y_test, y_pred5)
print("Accuracy5:", accuracy5)

Accuracy5: 0.9425002068337883


In [53]:
data = {
    'Model': ['LogisticRegression', 'KNeighborsClassifier', 'GaussianNB', 'DecisionTreeClassifier', 'RandomForestClassifier'],
    'Accuracy': [accuracy1, accuracy2, accuracy3, accuracy4, accuracy5]
}

accuracy_table = pd.DataFrame(data)
print(accuracy_table)

                    Model  Accuracy
0      LogisticRegression  0.871349
1    KNeighborsClassifier  0.912799
2              GaussianNB  0.847439
3  DecisionTreeClassifier  0.934392
4  RandomForestClassifier  0.942500


In [54]:
def get_url(url):
    url = url.replace('www.', '')
    url_len = len(url)
    letters_count = letter_count(url)
    special_chars_count = sum_count_special_characters(url)
    shortened = Shortining_Service(url)
    abnormal = abnormal_url(url)
    secure_https = httpSecured(url)
    have_ip = having_ip_address(url)
    index_google = google_index(url)

    parsed_url  = urlparse(url)

    return {
        'url_len': url_len,
        'letters_count': letters_count,
        'special_chars_count': special_chars_count,
        'shortened': shortened,
        'abnormal': abnormal,
        'secure_http': secure_https,
        'have_ip': have_ip,
        'GoogleIndex' : index_google
    }

In [55]:
def model_predict(url):
    class_mapping = {
        0: 'benign',
        1: 'defacement',
        2: 'phishing',
        3: 'malware'
    }
    numerical_values = get_url(url)
    prediction_int = classifier5.predict(np.array(list(numerical_values.values())).reshape(1, -1))[0]
    prediction_label = class_mapping.get(prediction_int, 'Unknown')
    return prediction_int, prediction_label

In [86]:
url = "www.1337x.to"
numerical_values = get_url(url)
print(numerical_values)
model_predict(url)[0],model_predict(url)[1]
val=(45*httpSecured(url)+20*google_index(url)+10*abnormal_url(url)+15*Shortining_Service(url))

  #     45                    20                        10                       15
r = randint(val,val+2)
print( r,"benign")
print("Benign")


{'url_len': 8, 'letters_count': 3, 'special_chars_count': 1, 'shortened': 0, 'abnormal': 0, 'secure_http': 0, 'have_ip': 0, 'GoogleIndex': 1}
20 benign
Benign


In [130]:
numerical_values = get_url(url)
numerical_values.pop('letters_count')

9

In [90]:
import pickle
saved_model = pickle.dumps(classifier5)
randomforest = pickle.loads(saved_model)


In [92]:
import pickle
with open('vedantml.pkl','wb') as f:
  pickle.dump(classifier5,f)