In [10]:
import pandas as pd # used for data manipulation and analysis
import numpy as np # used for multi-dimensional arrays and matrices

import seaborn as sns # high-level interface for drawing statistical graphics
import matplotlib.pyplot as plt # matplotlib's pyplot plotting interface (figure, imshow, title, ...)
%matplotlib inline 
# It sets the backend of matplotlib to the 'inline' backend:
import time # calculate time 

from sklearn.linear_model import LogisticRegression # classifier; presumably used to predict the good/bad label
from sklearn.naive_bayes import MultinomialNB # naive Bayes classifier for count features; presumably used for the good/bad label

from sklearn.model_selection import train_test_split # splits the data into training and test subsets
from sklearn.metrics import classification_report # text report of per-class precision, recall, f1-score and support
from sklearn.metrics import confusion_matrix # counts of actual vs. predicted labels
from nltk.tokenize import RegexpTokenizer # splits text into tokens using a regular expression
from nltk.stem.snowball import SnowballStemmer # reduces words to their stems
from sklearn.feature_extraction.text import CountVectorizer # builds a sparse matrix of token counts
from sklearn.pipeline import make_pipeline # chains preprocessing steps and an estimator into one pipeline

from PIL import Image # used for loading images in the notebook
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator # word-cloud generation (disabled); plot_wordcloud refers to these names

from bs4 import BeautifulSoup # used for scraping data from websites
from selenium import webdriver # used for browser automation (Chrome)
import networkx as nx # for the creation, manipulation, and study of the structure, dynamics, and functions of complex networks.

import pickle # used to dump the model

import warnings # used on the next line to silence warnings
# NOTE(review): this suppresses every warning category, including deprecation
# notices from pandas/sklearn; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [11]:
# Load the phishing URL dataset; the relative path is resolved against the
# notebook's working directory.
phish_data = pd.read_csv('phishing_site_urls.csv')

In [13]:
# Preview the first five rows to check the column layout.
phish_data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [14]:
# Preview the last five rows of the dataset.
phish_data.tail()


Unnamed: 0,URL,Label
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad
549345,apple-search.info,bad


In [25]:
# Summarise column dtypes, non-null counts and memory usage.
phish_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [26]:
# Count missing values per column.
phish_data.isnull().sum()

URL      0
Label    0
dtype: int64

In [54]:
# A token is a maximal run of ASCII letters; digits, dots, slashes and other
# punctuation are discarded and therefore act as separators.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [55]:
# Show the raw text of the first URL as a tokenizer example.
phish_data.URL[0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [56]:
# Tokenize the first URL to inspect which pieces the regex keeps.
tokenizer.tokenize(phish_data.URL[0])

['nobell',
 'it',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'login',
 'SkyPe',
 'com',
 'en',
 'cgi',
 'bin',
 'verification',
 'login',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'index',
 'php',
 'cmd',
 'profile',
 'ach',
 'outdated',
 'page',
 'tmpl',
 'p',
 'gen',
 'failed',
 'to',
 'load',
 'nav',
 'login',
 'access']

In [57]:
print('Getting words tokenized ...')
t0 = time.perf_counter()
# Series.map calls the tokenizer's bound method once per URL string and stores
# the resulting token list in a new column.
phish_data['text_tokenized'] = phish_data['URL'].map(tokenizer.tokenize)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

Getting words tokenized ...
Time taken 6.959831999964081 sec


In [58]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
phish_data.sample(5, random_state=42)

Unnamed: 0,URL,Label,text_tokenized
359477,imdb.com/name/nm1318670/,good,"[imdb, com, name, nm]"
187551,filespart.com/d/deliver-us-from-eva-megaupload...,good,"[filespart, com, d, deliver, us, from, eva, me..."
378232,listentomusics.com/products/Cccm-Gospel/Montre...,good,"[listentomusics, com, products, Cccm, Gospel, ..."
298448,canadavisa.com/about-british-columbia.html,good,"[canadavisa, com, about, british, columbia, html]"
536197,ensaenerji.com/h5piv,bad,"[ensaenerji, com, h, piv]"


In [59]:
# English Snowball stemmer used to reduce each token to its stem.
stemmer = SnowballStemmer("english")

In [60]:
print('Getting words stemmed ...')
t0= time.perf_counter()
# Tokens such as 'com' or 'www' recur across a large share of the URLs, so stem
# each distinct token once and look the result up per row instead of
# re-running the stemmer on every occurrence. The resulting column is the same.
unique_tokens = {word for tokens in phish_data['text_tokenized'] for word in tokens}
stem_lookup = {word: stemmer.stem(word) for word in unique_tokens}
phish_data['text_stemmed'] = phish_data['text_tokenized'].map(lambda l: [stem_lookup[word] for word in l])
t1= time.perf_counter() - t0
print('Time taken',t1 ,'sec')

Getting words stemmed ...
Time taken 126.31797650002409 sec


In [61]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
phish_data.sample(5, random_state=42)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed
170695,en.wikipedia.org/wiki/2011_Big_12_Men's_Basket...,good,"[en, wikipedia, org, wiki, Big, Men, s, Basket...","[en, wikipedia, org, wiki, big, men, s, basket..."
283784,artipot.com/articles/1072787/the-making-of-a-m...,good,"[artipot, com, articles, the, making, of, a, m...","[artipot, com, articl, the, make, of, a, motor..."
458298,veromi.com/Chad-Bowser.aspx,good,"[veromi, com, Chad, Bowser, aspx]","[veromi, com, chad, bowser, aspx]"
544943,viviendadelrincon.com/74t3nf4gv4,bad,"[viviendadelrincon, com, t, nf, gv]","[viviendadelrincon, com, t, nf, gv]"
20577,ourtimes.yolasite.com/contact-us.php,bad,"[ourtimes, yolasite, com, contact, us, php]","[ourtim, yolasit, com, contact, us, php]"


In [62]:
print('Getting joiningwords ...')
t0 = time.perf_counter()
# Collapse each row's list of stems back into one space-separated string.
phish_data['text_sent'] = phish_data['text_stemmed'].map(' '.join)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

Getting joiningwords ...
Time taken 1.031782100093551 sec


In [63]:
# Fixed seed so that Restart & Run All shows the same five rows every time.
phish_data.sample(5, random_state=42)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
255610,vinnierattolle.blogspot.com/2008/12/krofftapal...,good,"[vinnierattolle, blogspot, com, krofftapalooza...","[vinnierattoll, blogspot, com, krofftapalooza,...",vinnierattoll blogspot com krofftapalooza sigm...
236316,rustyreedshouseofblues.com/,good,"[rustyreedshouseofblues, com]","[rustyreedshouseofblu, com]",rustyreedshouseofblu com
524874,fdrare.lfrfipzyydydnlkwuys.gq:33638/1972/05/04...,bad,"[fdrare, lfrfipzyydydnlkwuys, gq, bill, outsid...","[fdrare, lfrfipzyydydnlkwuy, gq, bill, outsid,...",fdrare lfrfipzyydydnlkwuy gq bill outsid rapid...
269689,adn.com/2011/10/09/2112513/1-of-3-whales-in-nu...,good,"[adn, com, of, whales, in, nushagak, river, html]","[adn, com, of, whale, in, nushagak, river, html]",adn com of whale in nushagak river html
393511,mrchsl.com/en/Sainte-Barbe_e,good,"[mrchsl, com, en, Sainte, Barbe, e]","[mrchsl, com, en, saint, barb, e]",mrchsl com en saint barb e


In [9]:
# Slice the frame into one subset per class label.
bad_sites = phish_data.loc[phish_data['Label'] == 'bad']
good_sites = phish_data.loc[phish_data['Label'] == 'good']

NameError: name 'phish_data' is not defined

In [8]:
# Preview the first rows of the subset labelled 'bad'.
bad_sites.head()

NameError: name 'bad_sites' is not defined

In [66]:
# Preview the first rows of the subset labelled 'good'.
good_sites.head()

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
18231,esxcc.com/js/index.htm?us.battle.net/noghn/en/...,good,"[esxcc, com, js, index, htm, us, battle, net, ...","[esxcc, com, js, index, htm, us, battl, net, n...",esxcc com js index htm us battl net noghn en r...
18232,wwweira¯&nvinip¿ncH¯wVö%ÆåyDaHðû/ÏyEùuË\nÓ6...,good,"[www, eira, nvinip, ncH, wV, yDaH, yE, u, rT, ...","[www, eira, nvinip, nch, wv, ydah, ye, u, rt, ...",www eira nvinip nch wv ydah ye u rt u g m i xz...
18233,'www.institutocgr.coo/web/media/syqvem/dk-óij...,good,"[www, institutocgr, coo, web, media, syqvem, d...","[www, institutocgr, coo, web, media, syqvem, d...",www institutocgr coo web media syqvem dk ij r ...
18234,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í,good,"[Y, ko, D, l, qt]","[y, ko, d, l, qt]",y ko d l qt
18236,ruta89fm.com/images/AS@Vies/1i75cf7b16vc<Fd16...,good,"[ruta, fm, com, images, AS, Vies, i, cf, b, vc...","[ruta, fm, com, imag, as, vie, i, cf, b, vc, f...",ruta fm com imag as vie i cf b vc f d b g sd v...


In [2]:
def plot_wordcloud(text, mask=None, max_words=400, max_font_size=120, figure_size=(24.0,16.0), 
                   title = None, title_size=40, image_color=False):
    """Draw a word cloud of ``text`` on a new matplotlib figure.

    Parameters
    ----------
    text : str
        Text handed to ``WordCloud.generate``.
    mask : array-like, optional
        Mask image forwarded to ``WordCloud``; also the colour source when
        ``image_color`` is true.
    max_words, max_font_size : int
        Forwarded to ``WordCloud``.
    figure_size : tuple
        Passed to ``plt.figure`` as ``figsize``.
    title : str, optional
        Figure title.
    title_size : int
        Font size of the title.
    image_color : bool
        If true, recolour the cloud with colours taken from ``mask`` and draw
        it with bilinear interpolation; otherwise draw the default colours
        with a green title.

    Raises
    ------
    ValueError
        If ``image_color`` is true but no ``mask`` was supplied.
    """
    # Recolouring takes its colours from the mask image, so fail early with a
    # clear message instead of crashing inside ImageColorGenerator.
    if image_color and mask is None:
        raise ValueError("image_color=True requires a mask image to take colours from")

    # Imported locally so the function does not rely on a notebook-level
    # wordcloud import being present.
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

    # 'com' and 'http' appear in nearly every URL and would dominate the cloud.
    stopwords = set(STOPWORDS) | {'com', 'http'}

    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          mask=mask)
    wordcloud.generate(text)

    plt.figure(figsize=figure_size)
    title_font = {'size': title_size, 'verticalalignment': 'bottom'}
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    else:
        plt.imshow(wordcloud)
        title_font['color'] = 'green'
    plt.title(title, fontdict=title_font)
    plt.axis('off')
    plt.tight_layout()

In [5]:
# Stemmed text of the 'good' URLs, re-indexed from 0.
# The returning form of reset_index is used rather than inplace=True so the
# column object held by good_sites is left untouched.
data = good_sites.text_sent.reset_index(drop=True)

NameError: name 'good_sites' is not defined