## BERT implementation

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text 
import pandas as pd  
import numpy as np 

In [2]:
# Load the labelled URL dataset and preview its first rows.
csv_path = './dataset/badurls.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [3]:
import re

def tokenize_text(url):
    """Turn a URL into a space-separated string of word tokens.

    Every run of separator characters (punctuation, underscores and
    whitespace) is replaced by a single space, so
    ``'a.b_c/d'`` becomes ``'a b c d'``.

    Args:
        url: The URL as a ``str``.

    Returns:
        The tokenized text. A separator run at the very start or end of
        ``url`` is kept as a single space; the result is not stripped.
    """
    # Raw string on purpose: '\W' in a plain literal is an invalid escape
    # sequence (a warning today, slated to become an error).
    # ``\W`` already covers whitespace and punctuation; ``_`` is added
    # explicitly because the regex engine counts it as a word character.
    return re.sub(r'[\W_]+', ' ', url)

In [4]:
# Derive the tokenized form of every URL, then spot-check a few random rows.
df['url_text'] = df['url'].map(tokenize_text)
df.sample(5)

Unnamed: 0,url,label,url_text
264350,johnbetts-fineminerals.com/jhbnyc/gifs/51524.htm,good,johnbetts fineminerals com jhbnyc gifs 51524 htm
403859,nenerian.info/stats/main,bad,nenerian info stats main
181048,archive.org/stream/namesofforeigner00eglew/nam...,good,archive org stream namesofforeigner00eglew nam...
122107,onlinelibrary.wiley.com/doi/10.1002/ccd.1067/a...,good,onlinelibrary wiley com doi 10 1002 ccd 1067 a...
383125,two.jreoplte.biz/5cc7dt7efq\ndema.tvdiadema.co...,bad,two jreoplte biz 5cc7dt7efq ndema tvdiadema co...


In [5]:
# Per-column summary (count / unique / top / freq) of the full dataset.
df.describe()

Unnamed: 0,url,label,url_text
count,411247,411247,411247
unique,411247,2,410716
top,diaryofagameaddict.com,good,kf25zx com images us battle net
freq,1,344800,6


In [6]:
# Same summary, computed separately for each label.
df.groupby('label').describe()

Unnamed: 0_level_0,url,url,url,url,url_text,url_text,url_text,url_text
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
bad,66447,66447,diaryofagameaddict.com,1,66447,65950,kf25zx com images us battle net,6
good,344800,344800,01453.com/,1,344800,344766,en wikipedia org wiki Robert Baldwin,2


In [7]:
# Class balance check: number of rows per label.
df['label'].value_counts()

good    344800
bad      66447
Name: label, dtype: int64

In [8]:
# Keep only the rows labelled 'good' and report how many there are.
is_good = df['label'].eq('good')
df_good = df.loc[is_good]
df_good.shape

(344800, 3)

In [9]:
# Keep only the rows labelled 'bad' and report how many there are.
is_bad = df['label'].eq('bad')
df_bad = df.loc[is_bad]
df_bad.shape

(66447, 3)

In [10]:
# Downsample the 'good' class to the size of the 'bad' class so both classes
# are equally represented. A fixed random_state makes the drawn subset (and
# therefore everything trained on it) reproducible across kernel restarts.
df_good_downsampled = df_good.sample(n=len(df_bad), random_state=42)
df_good_downsampled.shape

(66447, 3)

In [11]:
# Stack the downsampled 'good' rows on top of all 'bad' rows, then confirm
# that both labels now have the same row count.
balanced_parts = [df_good_downsampled, df_bad]
df_balanced = pd.concat(balanced_parts)
df_balanced.groupby('label').describe()

Unnamed: 0_level_0,url,url,url,url,url_text,url_text,url_text,url_text
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
bad,66447,66447,diaryofagameaddict.com,1,66447,65950,kf25zx com images us battle net,6
good,66447,66447,scenicrouteonline.com/,1,66447,66446,en wikipedia org wiki Robert Baldwin,2


In [12]:
# Numeric target column: 1 when the URL is labelled 'bad', 0 otherwise.
is_malicious = df_balanced['label'] == 'bad'
df_balanced['malicious'] = is_malicious.astype('int64')
df_balanced.sample(5)

Unnamed: 0,url,label,url_text,malicious
34316,royalgateenergy.com/wp-admin/js/images/httpdoc...,bad,royalgateenergy com wp admin js images httpdoc...,1
373907,youtube.com/watch?v=b3qq02V7fVA,good,youtube com watch v b3qq02V7fVA,0
245629,goplaytoday.net/,good,goplaytoday net,0
230231,facebook.com/people/Joel-Bouchard/737267658,good,facebook com people Joel Bouchard 737267658,0
225044,facebook.com/christa.currie,good,facebook com christa currie,0


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for testing.
# stratify keeps the good/bad ratio the same in both the training and the
# test split; random_state fixes the shuffle so the split (and the metrics
# reported further down) can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['url_text'],
    df_balanced['malicious'],
    test_size=0.2,
    stratify=df_balanced['malicious'],
    random_state=42,
)

In [14]:
# Peek at the first few training inputs (tokenized URL strings).
X_train.head()

1584         sweettalk co HSBC BANK STORAGE new secure html
117294    mydeadpeople blogspot com 2008 11 caliste roug...
15136     mallshifu com images swfupload images 2016 cbc...
255964                             imdb com name nm0000790 
31416                      thefitnessprinciples com dcwork 
Name: url_text, dtype: object

Links to the BERT encoders and their matching preprocessors on TensorFlow Hub:
https://tfhub.dev/google/collections/bert/1

In [15]:
# TF Hub handles for the preprocessing model and the encoder it feeds.
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)



##### Building a functional model

In [16]:
# BERT part of the graph: raw string input -> preprocessing -> encoder.
text_input = tf.keras.layers.Input(name='text', shape=(), dtype=tf.string)
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)


In [17]:
# Classification head on top of the encoder's pooled output:
# dropout, then a single sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")

pooled = outputs['pooled_output']
l = output_layer(dropout_layer(pooled))

In [18]:
# Tie the string input and the sigmoid output together into the final model.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[l],
)

In [19]:
# Print the layer-by-layer structure of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [20]:
# Metrics tracked during training and evaluation, in reporting order.
keras_metrics = tf.keras.metrics
METRICS = [
    keras_metrics.BinaryAccuracy(name='accuracy'),
    keras_metrics.Precision(name='precision'),
    keras_metrics.Recall(name='recall'),
]

In [21]:
# Configure training: binary cross-entropy loss, Adam optimizer,
# and the metrics listed in METRICS.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

##### Training the model

In [22]:
# Train on the training split only.
# NOTE(review): no epochs, batch_size or validation data are passed, so the
# framework defaults apply, and the returned History object is not kept.
# Confirm this is intended.
model.fit(X_train, y_train)



<keras.callbacks.History at 0x249090a0310>

In [23]:
# Score the trained model on the held-out test split.
# NOTE(review): the returned list is presumably
# [loss, accuracy, precision, recall], following the loss and the METRICS
# order given to compile. Confirm against the Keras docs.
model.evaluate(X_test, y_test)



[0.394493967294693, 0.8300161957740784, 0.8089467883110046, 0.8640981316566467]

In [24]:
# Predicted scores for the test split, flattened to a 1-D array.
y_pred = model.predict(X_test).flatten()



In [25]:
# Threshold the scores at 0.5: strictly above -> 1, otherwise 0.
above_threshold = y_pred > 0.5
y_pred = above_threshold.astype(int)
y_pred

array([0, 0, 1, ..., 1, 0, 0])

In [26]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of true labels vs. thresholded predictions.
# In the displayed result the row sums equal the per-class support, i.e.
# rows are the true classes and columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)
cm 

array([[10578,  2712],
       [ 1806, 11483]], dtype=int64)

In [27]:
# Per-class precision / recall / F1 on the test split.
# sep='' stops print() from inserting a space after the newline, which would
# push the report's header row one column out of line with the values below.
print('Classification report:\n', classification_report(y_test, y_pred), sep='')

Classification report:
               precision    recall  f1-score   support

           0       0.85      0.80      0.82     13290
           1       0.81      0.86      0.84     13289

    accuracy                           0.83     26579
   macro avg       0.83      0.83      0.83     26579
weighted avg       0.83      0.83      0.83     26579

