In [1]:
cd D:\1MYFILES\ML_NGFW\FIREWALLS\-Phishing url detection

D:\1MYFILES\ML_NGFW\FIREWALLS\-Phishing url detection


In [2]:
# import seaborn as sb # helps in statistical data visualization in different forms
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt # plots the data
%matplotlib inline 

import time 

from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB # a simple learning algo which used bayes rule assuming the objects having some characteristics 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from nltk.tokenize import RegexpTokenizer # Tokenizes the url to get the required words
from nltk.stem.snowball import SnowballStemmer # stemms different works into meaningless words 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer # Vectorizes the tokens into a matrix 
from sklearn.pipeline import make_pipeline 

from bs4 import BeautifulSoup
!pip install selenium 
from selenium import webdriver
import networkx as nx # plots the internal link redirections within a website

import pickle # we'll store data in form of non-primitives and that has to be converted to character stream when we train it and "pickle" helps us to do that 
import warnings



In [3]:
phish_data = {}
phish_data = pd.read_csv('phishing_site_urls.csv')

In [15]:
# phish_data.info()

In [8]:
phish_data.isnull().sum()

URL      0
Label    0
dtype: int64

In [9]:
label_counts = pd.DataFrame(phish_data.Label.value_counts())
label_counts

Unnamed: 0,Label
good,392924
bad,156422


In [None]:
sb.set_style('darkgrid')
sb.barplot(label_counts.index,label_counts.Label)
# sb.barplot( index , labels ) 

In [4]:
    tokenizer = RegexpTokenizer(r'[A-Za-z]+') # tokenizes the given url based on the string argument r'[a-zA-Z]\w+\'?\w*'

In [5]:
# tokenizing all the rows
# by passing all the URLs into the anonymous function i.e. mapping all the URLs into anonymous function.
# we use anonymous functions here for simplicity of code and no requirement of creating a function and storing it.
phish_data['text_tokenized'] = phish_data.URL.map(lambda t: tokenizer.tokenize(t)) # doing with all rows

In [6]:
phish_data.sample(5) 

Unnamed: 0,URL,Label,text_tokenized
391655,montrealinc.ca/en/index.php,good,"[montrealinc, ca, en, index, php]"
419610,raiders.com/schedule/season-schedule.html,good,"[raiders, com, schedule, season, schedule, html]"
228758,pipl.com/directory/name/Guyon/Peter,good,"[pipl, com, directory, name, Guyon, Peter]"
148742,bentley.umich.edu/exhibits/umtimeline/general.php,good,"[bentley, umich, edu, exhibits, umtimeline, ge..."
326243,facebook.com/ShawneeTownship,good,"[facebook, com, ShawneeTownship]"


In [7]:
stemmer = SnowballStemmer("english")

In [8]:
# Stemming all the repetetive words into meaning less words
phish_data['text_stemmed'] = phish_data['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])

In [9]:
phish_data.head(15)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, ffb, d, dca, cce, f, login, SkyPe...","[nobel, it, ffb, d, dca, cce, f, login, skype,..."
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin...","[www, dghjdgf, com, paypal, co, uk, cycgi, bin..."
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int...","[serviciosbi, com, paypal, cgi, bin, get, into..."
3,mail.printakid.com/www.online.americanexpress....,bad,"[mail, printakid, com, www, online, americanex...","[mail, printakid, com, www, onlin, americanexp..."
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,"[thewhiskeydregs, com, wp, content, themes, wi...","[thewhiskeydreg, com, wp, content, theme, wide..."
5,smilesvoegol.servebbs.org/voegol.php,bad,"[smilesvoegol, servebbs, org, voegol, php]","[smilesvoegol, servebb, org, voegol, php]"
6,premierpaymentprocessing.com/includes/boleto-2...,bad,"[premierpaymentprocessing, com, includes, bole...","[premierpaymentprocess, com, includ, boleto, v..."
7,myxxxcollection.com/v1/js/jih321/bpd.com.do/do...,bad,"[myxxxcollection, com, v, js, jih, bpd, com, d...","[myxxxcollect, com, v, js, jih, bpd, com, do, ..."
8,super1000.info/docs,bad,"[super, info, docs]","[super, info, doc]"
9,horizonsgallery.com/js/bin/ssl1/_id/www.paypal...,bad,"[horizonsgallery, com, js, bin, ssl, id, www, ...","[horizonsgalleri, com, js, bin, ssl, id, www, ..."


In [10]:
# joining all the list elements without any commas 
phish_data['text_sent'] = phish_data['text_stemmed'].map(lambda l: ' '.join(l))

In [11]:
phish_data.head(15)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, ffb, d, dca, cce, f, login, SkyPe...","[nobel, it, ffb, d, dca, cce, f, login, skype,...",nobel it ffb d dca cce f login skype com en cg...
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin...","[www, dghjdgf, com, paypal, co, uk, cycgi, bin...",www dghjdgf com paypal co uk cycgi bin webscrc...
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int...","[serviciosbi, com, paypal, cgi, bin, get, into...",serviciosbi com paypal cgi bin get into herf s...
3,mail.printakid.com/www.online.americanexpress....,bad,"[mail, printakid, com, www, online, americanex...","[mail, printakid, com, www, onlin, americanexp...",mail printakid com www onlin americanexpress c...
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,"[thewhiskeydregs, com, wp, content, themes, wi...","[thewhiskeydreg, com, wp, content, theme, wide...",thewhiskeydreg com wp content theme widescreen...
5,smilesvoegol.servebbs.org/voegol.php,bad,"[smilesvoegol, servebbs, org, voegol, php]","[smilesvoegol, servebb, org, voegol, php]",smilesvoegol servebb org voegol php
6,premierpaymentprocessing.com/includes/boleto-2...,bad,"[premierpaymentprocessing, com, includes, bole...","[premierpaymentprocess, com, includ, boleto, v...",premierpaymentprocess com includ boleto via php
7,myxxxcollection.com/v1/js/jih321/bpd.com.do/do...,bad,"[myxxxcollection, com, v, js, jih, bpd, com, d...","[myxxxcollect, com, v, js, jih, bpd, com, do, ...",myxxxcollect com v js jih bpd com do do l popu...
8,super1000.info/docs,bad,"[super, info, docs]","[super, info, doc]",super info doc
9,horizonsgallery.com/js/bin/ssl1/_id/www.paypal...,bad,"[horizonsgallery, com, js, bin, ssl, id, www, ...","[horizonsgalleri, com, js, bin, ssl, id, www, ...",horizonsgalleri com js bin ssl id www paypal c...


In [None]:
# DATA CLEANING
# noise = ["[]"]
# df = pd.read_csv('cleaned_data.csv', na_values = noise )
# df.isnull().sum()

In [None]:
phish_data.info()

In [None]:
ls -l

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser = webdriver.Chrome(r"chromedriver.exe")

In [None]:
list_urls = ['https://ezee.com/cell-phones/','https://ezee.com/login.php?from=account.php%3Faction%3D'] #here i take phishing sites 
links_with_text = []

In [None]:
for url in list_urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source,"html.parser")
    for line in soup.find_all('a' or 'link'):
        href = line.get('href')
        links_with_text.append([url, href])

In [None]:
df = pd.DataFrame(links_with_text, columns=["from", "to"])
# DataFrame is 2D datastructure in pandas.   

In [None]:
df.to_csv('dataframe.csv')

In [None]:
# network graph using networkx 
GA = nx.from_pandas_edgelist(df, source="from", target="to")
nx.draw(GA, with_labels=False) # draws the internal hyperlink redirection network 

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer # Vectorizes the tokens into a matrix 

cv = CountVectorizer() # converts the tokenized-stemmed words into a sparse matrix in a numerical format w.r.t frequency of word, as machine can't understand words

In [32]:
phish_data[:3]

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, ffb, d, dca, cce, f, login, SkyPe...","[nobel, it, ffb, d, dca, cce, f, login, skype,...",nobel it ffb d dca cce f login skype com en cg...
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin...","[www, dghjdgf, com, paypal, co, uk, cycgi, bin...",www dghjdgf com paypal co uk cycgi bin webscrc...
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int...","[serviciosbi, com, paypal, cgi, bin, get, into...",serviciosbi com paypal cgi bin get into herf s...


In [29]:
feature = cv.fit_transform(phish_data.text_sent) # converts the tokens into algo-understandable numerical format by calculating different statistical terms like mean, standard deviation etc...,

In [30]:
feature[:5]
feature

<549346x350837 sparse matrix of type '<class 'numpy.int64'>'
	with 3676066 stored elements in Compressed Sparse Row format>

In [31]:
trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)
# trainX & testX are compressed sparse matrixes
# trainY & testY are splitted datasets 
# unable to store 1.4TiB data

In [None]:
lr = LogisticRegression(max_iter = 549346 ) # Increasing the max_iter 

In [None]:
# Training the lr model with training split data of X & Y 
lr.fit(trainX,trainY)
# ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
# Increase the number of iterations (max_iter) or scale the data

In [None]:
# from sklearn import preprocessing
# scaler = preprocessing.StandardScaler().fit(trainY) 
# scaled = scaler.transform(trainX) 

# cant enter strings to be scaled 
# sparse matrices can't be chosen 

In [None]:
lr.score(trainX,trainY)

In [None]:
lr.score(testX,testY) 
# accuracy improved from 0.960 --> 0.9664 on increasing the max_iter 

In [None]:
# Storing the performance in dictionary 
score = {}
score['LogReg'] = np.round (lr.score(testX,testY) , 3 ) 

In [None]:
score['LogReg']

In [None]:
print('Training Accuracy :',lr.score(trainX,trainY))
print('Testing Accuracy :',lr.score(testX,testY))
con_mat = pd.DataFrame(confusion_matrix(lr.predict(testX), testY),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])

print('\nCLASSIFICATION REPORT\n')
print(classification_report(lr.predict(testX), testY,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sb.heatmap(con_mat, annot = True,fmt='d')

In [None]:
# comparing the true labels i.e. true dataset with the predicted dataset and prepares the report of quality of prediction.
print(classification_report(testY, lr.predict(testX)))
# predictions will be done from sparse matrices. 
# Accuracy - out of all the good and bad predictions how many of them are acutally true
# Precision - out of all good/(+ve) predictions done how many of them are true +ve. similiarly for bad/(-ve)
# recall - it simply says how truthful the model is 
# recall - no. of predicted good / no.of actual real good  
# let's say no of +ve s are 6 but predicted are only 4 to be good 
# recall = 4 / 6 = 0.67
# f1 score = harmonic mean of precision & recall
# hm = 2*(h1 * h2 ) / ( h1 + h2 ) 


In [None]:
## MULTINOMIAL NAIVE BAYES ALGORITHM 
mnb = MultinomialNB()
mnb.fit(trainX,trainY) 

In [None]:
mnb.score(trainX,trainY)

In [None]:
mnb.score(testX,testY)

In [33]:
score['multiNB'] = np.round(mnb.score(testX,testY), 2 )
score['multiNB']

NameError: name 'np' is not defined

In [None]:
print('Training Accuracy :',mnb.score(trainX,trainY))
print('Testing Accuracy :',mnb.score(testX,testY))
con_mat = pd.DataFrame(confusion_matrix(mnb.predict(testX), testY),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])

print('\nCLASSIFICATION REPORT\n')
print(classification_report(mnb.predict(testX), testY,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sb.heatmap(con_mat, annot = True,fmt='d')

## LOGISTIC REGRESSION PIPELINE

In [32]:
lr = LogisticRegression(max_iter = 549347)

### COUNT VECTORIZER PIPES 

In [48]:
# pipeline = make_pipeline(cv(tokenizer.tokenize) , lr ) 
# arguments - ( preprocessing_techniques , model(i.e. estimator))
lr_pipeline = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize), lr)
# lr_pipeline = make_pipeline(TfidfVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), lr)

In [21]:
# INCLUDING SNOWBALLSTEMMER IN TRANSFORMERS 
lr_pipeline = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,SnowballStemmer(),stop_words='english'), lr)

SyntaxError: positional argument follows keyword argument (2554803405.py, line 2)

### TF-IDF PIPES 

In [34]:
lr_pipeline = make_pipeline(TfidfTransformer(), lr)

In [49]:
feature = cv.fit_transform(phish_data.text_sent)

In [50]:
# training the model with feature extractions and labelled data 
trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)

In [52]:
lr_pipeline.fit(trainX,trainY)
lr_pipeline.score(testX,testY) 

AttributeError: lower not found

In [46]:
lr_pipeline.predict("https://www.codewithrandom.com/2022/08/24/search-icon-inside-input-html-css/")

ValueError: could not convert string to float: 'https://www.codewithrandom.com/2022/08/24/search-icon-inside-input-html-css/'

In [None]:
score['LogReg_tfid_feature_pipeline'] = lr_pipeline.score(testX,testY) 

In [None]:
print('Training Accuracy :',lr_pipeline.score(trainX,trainY))
print('Testing Accuracy :',lr_pipeline.score(testX,testY))
con_mat = pd.DataFrame(confusion_matrix(lr_pipeline.predict(testX), testY),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
print(classification_report(lr_pipeline.predict(testX), testY,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sb.heatmap(con_mat, annot = True,fmt='d')

In [None]:
pickle.dump(lr_pipeline, open('lr_model.pkl','wb'))

In [None]:
lr_model = pickle.load(open('lr_model.pkl','rb'))
result = lr_model.score(testX, testY) 
print(result) 

In [None]:
print(np.round(result , 2 ) )

In [None]:
score['LogReg_pipeline'] = lr_model.score(testX, testY) 

In [30]:
from sklearn import linear_model

reg = linear_model.BayesianRidge()
reg.fit(trainX,trainY)
reg.score(testX,testY)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

## MULTINOMIAL-NB PIPELINE

In [None]:
# COUNTVECTORIZER TRANSFORMER WITHOUT FEATURES EXTRACTED
mnb_pipeline = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words = 'english'),mnb)
trainX, testX, trainY, testY = train_test_split(phish_data.URL, phish_data.Label)

In [None]:
mnb_pipeline.fit(trainX, trainY) 
mnb_pipeline.score(testX, testY)

In [None]:
score['multiNB_pipeline'] = mnb_pipeline.score(testX, testY) 
pickle.dump(mnb_pipeline, open('mnb_model.pkl', 'wb'))

In [None]:
# TFIDFTRANSFORMER WITH FEATURES EXTRACTED
mnb_pipeline = make_pipeline(TfidfTransformer(),mnb)
trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)

In [None]:
mnb_pipeline.fit(trainX, trainY) 
mnb_pipeline.score(testX, testY)

In [None]:
score['multiNB_tfid_feature_pipeline'] = mnb_pipeline.score(testX, testY) 
pickle.dump(mnb_pipeline, open('mnb_model.pkl', 'wb'))

In [None]:
# TFIDFVECTORIZER TRANSFORMER WITH FEATURES EXTRACTED
# mnb_pipeline = make_pipeline(TfidfVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words = 'english'),mnb)
# trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)
# VECTORIZERS NEED LOWER BOUND WITH THE FEATURE EXTRACTION DATA

In [None]:
mnb_model = pickle.load(open('mnb_model.pkl', 'rb'))
mnb_model.score(testX,testY) 

In [None]:
print('Training Accuracy :',mnb_pipeline.score(trainX,trainY))
print('Testing Accuracy :',mnb_pipeline.score(testX,testY))
con_mat = pd.DataFrame(confusion_matrix(mnb_pipeline.predict(testX), testY),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])

print('\nCLASSIFICATION REPORT\n')
print(classification_report(mnb_pipeline.predict(testX), testY,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sb.heatmap(con_mat, annot = True,fmt='d')

# OTHER PIPELINE MODELS

In [11]:
import xgboost as xgb 
xgb_classifier = xgb.XGBClassifier() 

from sklearn.ensemble import AdaBoostClassifier
ada_classifier = AdaBoostClassifier()

from sklearn.linear_model import SGDClassifier
sgd_classifier = SGDClassifier() 

from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(max_iter = 110359) 

from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier() 

from sklearn.svm import SVC
svm_classifier = SVC() 

from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier() 

from sklearn.naive_bayes import MultinomialNB
mnb_classifier = MultinomialNB() 

### COUNT VECTORIZER PIPES 

In [22]:
xgb_pipeline = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), xgb_classifier)


In [23]:
ada_pipeline = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), ada_classifier)

In [24]:
sgd_pipeline = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), sgd_classifier)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer # Vectorizes the tokens into a matrix 

cv = CountVectorizer() # converts the tokenized-stemmed words into a sparse matrix in a numerical format w.r.t frequency of word, as machine can't understand words
feature = cv.fit_transform(phish_data.text_sent) # converts the tokens into algo-understandable numerical format by calculating different statistical terms like mean, standard deviation etc...,


### TF-IDF PIPES 

In [12]:
xgb_pipeline = make_pipeline(TfidfTransformer(), xgb_classifier)

In [13]:
ada_pipeline = make_pipeline(TfidfTransformer(), ada_classifier)

In [14]:
sgd_pipeline = make_pipeline(TfidfTransformer(), sgd_classifier)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer # Vectorizes the tokens into a matrix 

cv = CountVectorizer()
feature = cv.fit_transform(phish_data.text_sent)
# training the model with feature extractions and labelled data 
trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)

In [16]:
trainY.head(10)

78444     good
178700    good
424825    good
106682     bad
315430    good
106491     bad
263791    good
170055    good
122187     bad
198075    good
Name: Label, dtype: object

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
trainY = le.fit_transform(trainY)

In [18]:
for i in range( 0 , 9) : 
    print( trainY[i] ) 

1
1
1
0
1
0
1
1
0


In [19]:
xgb_pipeline.fit( trainX , trainY ) 

Pipeline(steps=[('tfidftransformer', TfidfTransformer()),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=0, gpu_id=-1,
                               grow_policy='depthwise', importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_threshold=64, max_cat_to_onehot=4,
                               max_delta_step=0, max_depth=6, max_leaves=0,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n

In [41]:
testY = le.fit_transform(testY)

In [28]:
xgb_pipeline.score( testX, testY ) 

0.9132426076002825

In [44]:
xgb_pipeline.predict( testY ) 

ValueError: Expected 2D array, got 1D array instead:
array=[1. 0. 0. ... 1. 1. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [20]:
pickle.dump(xgb_pipeline, open('xgb_pipeline.pkl', 'wb'))

In [21]:
sgd_pipeline.fit( trainX , trainY ) 

Pipeline(steps=[('tfidftransformer', TfidfTransformer()),
                ('sgdclassifier', SGDClassifier())])

In [29]:
sgd_pipeline.score( testX, testY ) 

0.9121649664693419

In [22]:
pickle.dump(sgd_pipeline, open('sgd_pipeline.pkl', 'wb'))

In [None]:
ada_pipeline.fit( trainX , trainY ) 

In [None]:
pickle.dump(ada_pipeline, open('ada_pipeline.pkl', 'wb'))

## BERT TRANSFORMER

In [4]:
try:
    import numpy as np
    import pandas as pd

    import torch
    import transformers as ppb # pytorch transformers
    
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    
    from sklearn.naive_bayes import MultinomialNB
    
    import warnings

    import swifter
    import tqdm
    tqdm.pandas()

    warnings.filterwarnings('ignore')
except Exception  as e: pass

In [9]:
df = {}
df = pd.read_csv('phishing_site_urls.csv')
# df = df.dropna(how='all')

In [14]:
X = df[1]
Y = df[2]
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

KeyError: 1

In [11]:
class BertTokenizer(object):

    def __init__(self, text=[]):
        self.text = text

        # For DistilBERT:
        self.model_class, self.tokenizer_class, self.pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

        # Load pretrained model/tokenizer
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)

        self.model = self.model_class.from_pretrained(self.pretrained_weights)

    def get(self):

        df = pd.DataFrame(data={"text":self.text})
        tokenized = df["text"].swifter.apply((lambda x: self.tokenizer.encode(x, add_special_tokens=True)))

        max_len = 0
        for i in tokenized.values:
            if len(i) > max_len:
                max_len = len(i)

        padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded)
        attention_mask = torch.tensor(attention_mask)

        with torch.no_grad(): last_hidden_states = self.model(input_ids, attention_mask=attention_mask)
        
        features = last_hidden_states[0][:, 0, :].numpy()

        return features

In [12]:
_instance =BertTokenizer(text=x_train)
tokens = _instance.get()

NameError: name 'x_train' is not defined

# MODEL

In [None]:
lr_clf = LogisticRegression()
lr_clf.fit(tokens, y_train)

# TEST

In [None]:
_instance =BertTokenizer(text=x_test)
tokensTest = _instance.get()

In [None]:
predicted = lr_clf.predict(tokensTest)

In [None]:
np.mean(predicted == y_test)

# Performance evaluation of all models

In [None]:
acc = pd.DataFrame.from_dict(score,orient = 'index',columns=['Accuracy'])
sb.set_style('darkgrid')
sb.barplot(acc.index,acc.Accuracy)
print(acc)

In [None]:
%pip install pyngrok nest_asyncio fastapi uvicorn loguru
import uvicorn
from fastapi import FastAPI
import joblib,os
from fastapi import FastAPI
from pydantic import BaseModel
from loguru import logger

app = FastAPI()

#pkl
phish_model = open('lr_model.pkl','rb')
phish_model_ls = joblib.load(phish_model)

# ML Aspect
@app.get('/predict/{feature}')
async def predict(features):
	X_predict = []
	X_predict.append(str(features))
	y_Predict = phish_model_ls.predict(X_predict)
	if y_Predict == 'bad':
		result = "This is a Phishing Site"
	else:
		result = "This is not a Phishing Site"

	return (features, result)
if __name__ == '__main__':
	uvicorn.run(app,host="127.0.0.1",port=8000)