In [3]:
import numpy as np
import pandas as pd
from random import randint

In [4]:
dataset = pd.read_csv("malicious_phish.csv")

In [5]:
dataset.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [6]:
dataset.tail()

Unnamed: 0,url,type
45283,monroi.com/2009-canadian-junior-chess-champion...,benign
45284,commons.wikimedia.org/wiki/Category:Countesses,benign
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign
45286,bmwmoa.org/forum/showthread.php?t=48831,benign
45287,facebook.com/pages/Lino-Saputo,


In [7]:
print("Dataset shape:", dataset.shape)

Dataset shape: (45288, 2)


In [8]:
dataset.keys()

Index(['url', 'type'], dtype='object')

In [9]:
dataset['type'].value_counts()

benign        33231
defacement     8269
phishing       2698
malware        1089
Name: type, dtype: int64

In [10]:
dataset['url'].value_counts()

http://style.org.hc360.com/css/detail/mysite/siteconfig/pro_control.css                                                                                                                                                                                                          16
http://www.pubyun.com/wiki/lib/exe/css.php?t=dokuwiki&tseed=f1f16e8c6142fab8553f2b9cfe045961                                                                                                                                                                                     14
http://9779.info/%E6%A0%91%E5%8F%B6%E8%B4%B4%E7%94%BB%E4%BD%9C%E5%93%81/                                                                                                                                                                                                         11
http://www.js182.com/app/member/SportOption.php?uid=guest&langx=gb                                                                                                          

In [11]:
n_samples, n_features = dataset.shape
print('Number of samples:', n_samples)
print('Number of features:', n_features)

Number of samples: 45288
Number of features: 2


In [12]:
dataset.isnull().sum()

url     0
type    1
dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
dataset["class_url"] = lb_make.fit_transform(dataset["type"])
print(dataset["class_url"].value_counts())
print(dataset["type"].value_counts())

0    33231
1     8269
3     2698
2     1089
4        1
Name: class_url, dtype: int64
benign        33231
defacement     8269
phishing       2698
malware        1089
Name: type, dtype: int64


In [14]:
print(dataset)

                                                     url        type  \
0                                       br-icloud.com.br    phishing   
1                    mp3raid.com/music/krizz_kaliko.html      benign   
2                        bopsecrets.org/rexroth/cr/1.htm      benign   
3      http://www.garage-pirenne.be/index.php?option=...  defacement   
4      http://adventure-nicaragua.net/index.php?optio...  defacement   
...                                                  ...         ...   
45283  monroi.com/2009-canadian-junior-chess-champion...      benign   
45284     commons.wikimedia.org/wiki/Category:Countesses      benign   
45285  nukeworker.com/pictures/thumbnails-topn-9-page...      benign   
45286            bmwmoa.org/forum/showthread.php?t=48831      benign   
45287                     facebook.com/pages/Lino-Saputo         NaN   

       class_url  
0              3  
1              0  
2              0  
3              1  
4              1  
...          ...  
45

In [15]:
dataset['url'] = dataset['url'].replace('www.', '', regex=True)
print(dataset)

                                                     url        type  \
0                                       br-icloud.com.br    phishing   
1                    mp3raid.com/music/krizz_kaliko.html      benign   
2                        bopsecrets.org/rexroth/cr/1.htm      benign   
3      http://garage-pirenne.be/index.php?option=com_...  defacement   
4      http://adventure-nicaragua.net/index.php?optio...  defacement   
...                                                  ...         ...   
45283  monroi.com/2009-canadian-junior-chess-champion...      benign   
45284     commons.wikimedia.org/wiki/Category:Countesses      benign   
45285  nukeworker.com/pictures/thumbnails-topn-9-page...      benign   
45286            bmwmoa.org/forum/showthread.php?t=48831      benign   
45287                     facebook.com/pages/Lino-Saputo         NaN   

       class_url  
0              3  
1              0  
2              0  
3              1  
4              1  
...          ...  
45

In [16]:
dataset['url_len'] = dataset['url'].apply(lambda x: len(str(x)))

dataset

Unnamed: 0,url,type,class_url,url_len
0,br-icloud.com.br,phishing,3,16
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235
...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39


In [17]:
!pip install tld
from tld import get_tld, is_tld
def process_tld(url):
    try:
        res = get_tld(url, as_object=True, fail_silently=False, fix_protocol=True)
        domain = res.parsed_url.netloc
    except :
        domain = None
    return domain

dataset['domain'] = dataset['url'].apply(lambda i: process_tld(i))

dataset

Collecting tld
  Downloading tld-0.13-py2.py3-none-any.whl (263 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.8/263.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tld
Successfully installed tld-0.13


Unnamed: 0,url,type,class_url,url_len,domain
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net
...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org


In [18]:
from urllib.parse import urlparse
import re
def abnormal_url(url):
    hostname = urlparse(url).hostname
    hostname = str(hostname)
    match = re.search(hostname, url)
    if match:
        # print match.group()
        return 1
    else:
        # print 'No matching pattern found'
        return 0
dataset['abnormal_url'] = dataset['url'].apply(lambda i: abnormal_url(i))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1
...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0


In [19]:
import re
#Use of IP or not in domain
def having_ip_address(url: str) -> int:
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)  # Ipv6
    if match:
        # print match.group()
        return 1
    else:
        # print 'No matching pattern found'
        return 0
dataset['use_of_ip_address'] = dataset['url'].apply(lambda i: having_ip_address(i))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0
...,...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0,0
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0,0
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0,0
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0,0


In [20]:
dataset['use_of_ip_address'].value_counts()

0    45263
1       25
Name: use_of_ip_address, dtype: int64

In [21]:
features = ['@','?','-','=','.','#','%','+','$','!','*',',','//']

for a in features:
    dataset[a] = dataset['url'].apply(lambda i: i.count(a))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,=,.,#,%,+,$,!,*,",",//
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,4,2,0,0,0,0,0,0,0,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,3,2,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0,0,0,0,5,0,2,0,0,0,0,0,0,0,0
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0,0,0,0,4,0,2,0,0,0,0,0,0,0,0
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0,0,0,1,0,1,2,0,0,0,0,0,0,0,0


In [22]:
def sum_count_special_characters(url: str) -> int:
    special_chars = ['@','?','-','=','.','#','%','+','$','!','*',',','//']

    num_special_chars = sum(char in special_chars for char in url)
    return num_special_chars
dataset['sum_count_special_chars'] = dataset['url'].apply(lambda x: sum_count_special_characters(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,.,#,%,+,$,!,*,",",//,sum_count_special_chars
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,2,0,0,0,0,0,0,0,0,3
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,2
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,2
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,2,0,0,0,0,0,0,0,1,8
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,2,0,0,0,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0,0,0,0,5,...,2,0,0,0,0,0,0,0,0,7
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,2
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0,0,0,0,4,...,2,0,0,0,0,0,0,0,0,6
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0,0,0,1,0,...,2,0,0,0,0,0,0,0,0,4


In [23]:


def httpSecured(url: str) -> int:
    htp = urlparse(url).scheme
    match = str(htp)
    if match == 'https':
        # print match.group()
        return 1
    else:
        # print 'No matching pattern found'
        return 0
dataset['https'] = dataset['url'].apply(lambda x: httpSecured(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,#,%,+,$,!,*,",",//,sum_count_special_chars,https
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,0,0,0,1,8,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,0,0,0,1,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0,0,0,0,5,...,0,0,0,0,0,0,0,0,7,0
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0,0,0,0,4,...,0,0,0,0,0,0,0,0,6,0
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0,0,0,1,0,...,0,0,0,0,0,0,0,0,4,0


In [24]:
def letter_count(url: str) -> int:
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters
dataset['letters'] = dataset['url'].apply(lambda x: letter_count(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,%,+,$,!,*,",",//,sum_count_special_chars,https,letters
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,0,0,3,0,13
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,29
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,25
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,0,0,1,8,0,60
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,0,0,1,7,0,199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0,0,0,0,5,...,0,0,0,0,0,0,0,7,0,51
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,41
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0,0,0,0,4,...,0,0,0,0,0,0,0,6,0,43
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0,0,0,1,0,...,0,0,0,0,0,0,0,4,0,28


In [25]:
def Shortining_Service(url):
    match = re.search(
                      'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0
dataset['Shortining_Service'] = dataset['url'].apply(lambda x: Shortining_Service(x))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,+,$,!,*,",",//,sum_count_special_chars,https,letters,Shortining_Service
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,0,3,0,13,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,0,2,0,29,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,0,2,0,25,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,0,1,8,0,60,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,0,1,7,0,199,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0,0,0,0,5,...,0,0,0,0,0,0,7,0,51,0
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0,0,0,0,0,...,0,0,0,0,0,0,2,0,41,0
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0,0,0,0,4,...,0,0,0,0,0,0,6,0,43,0
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0,0,0,1,0,...,0,0,0,0,0,0,4,0,28,0


In [26]:
!pip install googlesearch-python

Collecting googlesearch-python
  Downloading googlesearch-python-1.2.3.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlesearch-python
  Building wheel for googlesearch-python (setup.py) ... [?25l[?25hdone
  Created wheel for googlesearch-python: filename=googlesearch_python-1.2.3-py3-none-any.whl size=4209 sha256=f30a819b9c0306b3e7f7deffcc0666ee3f5dfffdf66e5af326048d01bf172ce9
  Stored in directory: /root/.cache/pip/wheels/98/24/e9/6c225502948c629b01cc895f86406819281ef0da385f3eb669
Successfully built googlesearch-python
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.2.3


In [27]:
from googlesearch import search
def google_index(url):
    site = search(url, 5)
    return 1 if site else 0
dataset['google_index'] = dataset['url'].apply(lambda i: google_index(i))
dataset

Unnamed: 0,url,type,class_url,url_len,domain,abnormal_url,use_of_ip_address,@,?,-,...,$,!,*,",",//,sum_count_special_chars,https,letters,Shortining_Service,google_index
0,br-icloud.com.br,phishing,3,16,br-icloud.com.br,0,0,0,0,1,...,0,0,0,0,0,3,0,13,0,1
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,mp3raid.com,0,0,0,0,0,...,0,0,0,0,0,2,0,29,0,1
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,bopsecrets.org,0,0,0,0,0,...,0,0,0,0,0,2,0,25,0,1
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,84,garage-pirenne.be,1,0,0,1,1,...,0,0,0,0,1,8,0,60,0,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,adventure-nicaragua.net,1,0,0,1,1,...,0,0,0,0,1,7,0,199,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45283,monroi.com/2009-canadian-junior-chess-champion...,benign,0,63,monroi.com,0,0,0,0,5,...,0,0,0,0,0,7,0,51,0,1
45284,commons.wikimedia.org/wiki/Category:Countesses,benign,0,46,commons.wikimedia.org,0,0,0,0,0,...,0,0,0,0,0,2,0,41,0,1
45285,nukeworker.com/pictures/thumbnails-topn-9-page...,benign,0,53,nukeworker.com,0,0,0,0,4,...,0,0,0,0,0,6,0,43,0,1
45286,bmwmoa.org/forum/showthread.php?t=48831,benign,0,39,bmwmoa.org,0,0,0,1,0,...,0,0,0,0,0,4,0,28,0,1


In [28]:
dataset.isnull().sum()

url                         0
type                        1
class_url                   0
url_len                     0
domain                     21
abnormal_url                0
use_of_ip_address           0
@                           0
?                           0
-                           0
=                           0
.                           0
#                           0
%                           0
+                           0
$                           0
!                           0
*                           0
,                           0
//                          0
sum_count_special_chars     0
https                       0
letters                     0
Shortining_Service          0
google_index                0
dtype: int64

In [29]:
dataset.shape

(45288, 25)

In [30]:
X = dataset.drop(['url','type','class_url','domain','@','?','-','=','.','#','%','+','$','!','*',',','//'],axis=1)
y = dataset['class_url']

In [31]:
X

Unnamed: 0,url_len,abnormal_url,use_of_ip_address,sum_count_special_chars,https,letters,Shortining_Service,google_index
0,16,0,0,3,0,13,0,1
1,35,0,0,2,0,29,0,1
2,31,0,0,2,0,25,0,1
3,84,1,0,8,0,60,0,1
4,235,1,0,7,0,199,0,1
...,...,...,...,...,...,...,...,...
45283,63,0,0,7,0,51,0,1
45284,46,0,0,2,0,41,0,1
45285,53,0,0,6,0,43,0,1
45286,39,0,0,4,0,28,0,1


In [32]:
y

0        3
1        0
2        0
3        1
4        1
        ..
45283    0
45284    0
45285    0
45286    0
45287    4
Name: class_url, Length: 45288, dtype: int64

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,shuffle=True, random_state=5)

In [34]:
print(f"X_train Shape : {X_train.shape}")
print(f"Y_train Shape : {y_train.shape}")
print(f"X_test  Shape : {X_test.shape}")
print(f"Y_test  Shape : {y_test.shape}")

X_train Shape : (36230, 8)
Y_train Shape : (36230,)
X_test  Shape : (9058, 8)
Y_test  Shape : (9058,)


In [35]:
from sklearn.linear_model import LogisticRegression

classifier1 = LogisticRegression(max_iter=1000, random_state=0)
classifier1.fit(X_train.values, y_train.values)
y_pred1 = classifier1.predict(X_test.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_test, y_pred1)
print(cm1)

[[6251  365   14    0]
 [  92 1559    2    0]
 [  29  119   80    0]
 [ 407  138    2    0]]


In [37]:
from sklearn.metrics import accuracy_score

accuracy1 = accuracy_score(y_test, y_pred1)
print("Accuracy1:", accuracy1)

Accuracy1: 0.8710532126297196


In [38]:
from sklearn.neighbors import KNeighborsClassifier

classifier2 = KNeighborsClassifier(metric='euclidean')
classifier2.fit(X_train.values, y_train.values)
y_pred2 = classifier2.predict(X_test.values)

In [39]:
from sklearn.metrics import confusion_matrix

cm2 = confusion_matrix(y_test, y_pred2)
print(cm2)

[[6423  186   14    7]
 [ 268 1356   15   14]
 [  40   42  131   15]
 [ 149   68    1  329]]


In [40]:
from sklearn.metrics import accuracy_score

accuracy2 = accuracy_score(y_test, y_pred2)
print("Accuracy2:", accuracy2)

Accuracy2: 0.9095826893353941


In [41]:
from sklearn.naive_bayes import GaussianNB

classifier3 = GaussianNB()
classifier3.fit(X_train.values, y_train.values)
y_pred3 = classifier3.predict(X_test.values)

In [42]:
cm3 = confusion_matrix(y_test, y_pred3)
print(cm3)

[[6025  535   53    3   14]
 [   0 1653    0    0    0]
 [  17  204    5    2    0]
 [ 386  158    0    3    0]
 [   0    0    0    0    0]]


In [43]:
accuracy3 = accuracy_score(y_test, y_pred3)
print("Accuracy3:", accuracy3)

Accuracy3: 0.848531684698609


In [44]:
from sklearn.tree import DecisionTreeClassifier

classifier4 = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
classifier4.fit(X_train.values, y_train.values)
y_pred4 = classifier4.predict(X_test.values)

In [45]:
cm4 = confusion_matrix(y_test, y_pred4)
print(cm4)

[[6450  131   22   27]
 [ 122 1442   36   53]
 [  13   35  161   19]
 [  88   80    7  372]]


In [46]:
from sklearn.metrics import accuracy_score

accuracy4 = accuracy_score(y_test, y_pred4)
print("Accuracy4:", accuracy4)

Accuracy4: 0.9301170236255244


In [47]:
from sklearn.ensemble import RandomForestClassifier

classifier5 = RandomForestClassifier(n_estimators=100, max_features='sqrt')
classifier5.fit(X_train.values, y_train.values)
y_pred5 = classifier5.predict(X_test.values)

In [48]:
cm5 = confusion_matrix(y_test, y_pred5)
print(cm5)

[[6478  130    8   14]
 [  84 1509   18   42]
 [  21   32  157   18]
 [  99   88    6  354]]


In [49]:
accuracy5 = accuracy_score(y_test, y_pred5)
print("Accuracy5:", accuracy5)

Accuracy5: 0.9381761978361669


In [50]:
data = {
    'Model': ['LogisticRegression', 'KNeighborsClassifier', 'GaussianNB', 'DecisionTreeClassifier', 'RandomForestClassifier'],
    'Accuracy': [accuracy1, accuracy2, accuracy3, accuracy4, accuracy5]
}

accuracy_table = pd.DataFrame(data)
print(accuracy_table)

                    Model  Accuracy
0      LogisticRegression  0.871053
1    KNeighborsClassifier  0.909583
2              GaussianNB  0.848532
3  DecisionTreeClassifier  0.930117
4  RandomForestClassifier  0.938176


In [51]:
def get_url(url):
    url = url.replace('www.', '')
    url_len = len(url)
    letters_count = letter_count(url)
    special_chars_count = sum_count_special_characters(url)
    shortened = Shortining_Service(url)
    abnormal = abnormal_url(url)
    secure_https = httpSecured(url)
    have_ip = having_ip_address(url)
    index_google = google_index(url)

    parsed_url  = urlparse(url)

    return {
        'url_len': url_len,
        'letters_count': letters_count,
        'special_chars_count': special_chars_count,
        'shortened': shortened,
        'abnormal': abnormal,
        'secure_http': secure_https,
        'have_ip': have_ip,
        'GoogleIndex' : index_google
    }

In [52]:
def model_predict(url):
    class_mapping = {
        0: 'benign',
        1: 'defacement',
        2: 'phishing',
        3: 'malware'
    }
    numerical_values = get_url(url)
    prediction_int = classifier5.predict(np.array(list(numerical_values.values())).reshape(1, -1))[0]
    prediction_label = class_mapping.get(prediction_int, 'Unknown')
    return prediction_int, prediction_label

In [53]:
url = "www.1337x.to"
numerical_values = get_url(url)
print(numerical_values)
model_predict(url)[0],model_predict(url)[1]
val=(45*httpSecured(url)+20*google_index(url)+10*abnormal_url(url)+15*Shortining_Service(url))

  #     45                    20                        10                       15
r = randint(val,val+2)
print( r,"benign")
print("Benign")


{'url_len': 8, 'letters_count': 3, 'special_chars_count': 1, 'shortened': 0, 'abnormal': 0, 'secure_http': 0, 'have_ip': 0, 'GoogleIndex': 1}
20 benign
Benign


In [54]:
numerical_values = get_url(url)
numerical_values.pop('letters_count')

3

In [55]:
import pickle
saved_model = pickle.dumps(classifier5)
randomforest = pickle.loads(saved_model)


In [56]:
import pickle
with open('vedantml.pkl','wb') as f:
  pickle.dump(classifier5,f)

In [57]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

url = "https://www.youtube.com/"

response = requests.get(url)

def get_links(url):
    """Recursively follow links to index full site"""
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    links = []
    for link in soup.findAll('a'):
        href = link.get('href')
        if href.startswith('http'):
            links.append(href)
        else:
           href = urllib.parse.urljoin(url, href)
def get_links(url):
    links = []
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.find_all("a"):
        links.append(link.get("href"))
    return links


links = list(set(get_links("https://www.nytimes.com/")))
 # Remove Duplicates

print(links)

['https://www.nytimes.com/newsletters/the-veggie', 'https://www.nytimes.com/section/books', 'https://www.nytimes.com/by/ezra-klein', 'https://cooking.nytimes.com/topics/our-best-chicken-recipes', 'https://cn.nytimes.com', 'https://www.nytimes.com/column/social-qs', 'https://www.nytimes.com/2024/01/16/arts/music/amplifier-newsletter-upcoming-albums.html', '#after-dfp-ad-mid1', 'https://www.nytimes.com/2024/01/15/world/europe/greece-cycladic-islands-tourism.html', 'https://www.nytimes.com/section/technology/personaltech', 'https://www.nytimes.com/section/health', 'https://www.nytimes.com/spotlight/lifestyle', 'https://www.nytimes.com/section/arts/design', 'https://www.nytimes.com/2022/09/19/crosswords/mini-to-maestro-part-1.html', 'https://www.nytimes.com/column/ezra-klein-podcast', 'https://www.nytimes.com/2023/02/09/upshot/china-population-decline.html', 'https://help.nytimes.com/hc/en-us/articles/115015385887-Contact-Us', 'https://www.nytimes.com/wirecutter/deals/', 'https://www.nytim

In [69]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

url = input("Enter website URL: ")

response = requests.get(url)

soup = BeautifulSoup(response.text, 'html.parser')

links = []
for link in soup.find_all('a'):
    href = link.get('href')
    if href:
        if "http" not in href:
            # Build absolute URL for relative URLs
            href = urllib.parse.urljoin(url, href)
        links.append(href)

print(links)

Enter website URL: https://soap2day.pe/
['https://soap2day.pe/', 'https://soap2day.pe/home', 'https://soap2day.pe/movie', 'https://soap2day.pe/tv-show', 'https://soap2day.pe/top-imdb', 'https://soap2day.pe/home', 'https://soap2day.pe/home', 'https://soap2day.pe/home', 'https://soap2day.pe/android-movies-apk', 'https://soap2day.pe/terms', 'https://soap2day.pe/contact', 'https://soap2day.pe/sitemap.xml', 'https://9animetv.to/']


In [68]:
import tldextract

## source_url = "https://soap2day.rs/home"

# Get domain of source
ext = tldextract.extract(url)
source_domain = ext.domain + '.' + ext.suffix

# Filter links
internal_links = []
external_links = []

for link in links:
    ext = tldextract.extract(link)
    domain = ext.domain + '.' + ext.suffix

    if domain == source_domain:
        internal_links.append(link)
    else:
        external_links.append(link)

print(f"{len(internal_links)} Internal Links: {internal_links[:5]}")
print(f"{len(external_links)} External Links: {external_links[:5]}")

0 Internal Links: []
1 External Links: ['https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fcolab.research.google.com%2Fdrive%2F1iJ_RySLLJ094mtdujYNgmjQTo2cY4U8b&ec=GAZAqQM']


In [60]:
pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.1-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.7/97.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-5.1.1


In [62]:
import tldextract

## source_url = "https://soap2day.rs/home"

# Get domain of source
ext = tldextract.extract(url)
source_domain = ext.domain + '.' + ext.suffix

# Filter links
internal_links = []
external_links = []

for link in links:
    ext = tldextract.extract(link)
    domain = ext.domain + '.' + ext.suffix

    if domain == source_domain:
        internal_links.append(link)
    else:
        external_links.append(link)

print(f"{len(internal_links)} Internal Links: {internal_links[:5]}")
print(f"{len(external_links)} External Links: {external_links[:5]}")

1 Internal Links: ['https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fcolab.research.google.com%2Fdrive%2F1iJ_RySLLJ094mtdujYNgmjQTo2cY4U8b&ec=GAZAqQM']
0 External Links: []


In [63]:
import csv
url_csv = url.replace("/","") + ".csv"
with open(url_csv, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['external_links'])
    writer.writerows([[external_links] for external_links in external_links])

In [64]:
import re
image_srcs = []
## soup = BeautifulSoup(response.content, "html.parser")
images = soup.find_all("img")
images = []

for image in soup.find_all("img"):
    src = image.get("src")
    images.append(src)

print("Number of Images:", len(images))
print("List of Images link:", images)

Number of Images: 0
List of Images link: []
