In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import math

In [2]:
def checkSpecial(url):
    """Returns number of special characters in string"""
    regex = re.compile('[@_!#$%^&*()<>?|}{~]')
    return len([c for c in url if regex.search(c)])

def getNums(url):
    """Returns number of digits in string"""
    return len([c for c in url if c.isdigit()])

def entropy(url):
    """Returns entropy of string"""
    s = url.strip()
    prob = [float(s.count(c)) / len(s) for c in dict.fromkeys(list(s))]
    ent = sum([(p * math.log(p) / math.log(2.0)) for p in prob])
    return ent

def numSubDomains(url):
    """Returns number of subdomains in the given URL"""
    subdomains = url.split('http')[-1].split('//')[-1].split('/')
    return len(subdomains)-1

def feature_transform(df):
    """Featurizes the URL string into the data frame"""
    df.insert(2, 'len_url', [len(url) for url in df['URL']])
    df.insert(2, 'numerical', [getNums(url) for url in df['URL']])
    df.insert(2, 'special', [checkSpecial(url) for url in df['URL']])
    df.insert(2, 'hasPercent', [1 if ('%' in url) else 0 for url in df['URL']])
    df.insert(2, 'entropy', [entropy(url) for url in df['URL']])
    df.insert(2, 'numSD', [numSubDomains(url) for url in df['URL']])
    del df['URL']

In [27]:
df = pd.read_csv('Phishing_Dataset.csv')
feature_transform(df)

y = df.pop('Label')

x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=0)

def norm(x):
  return (x - x.mean())/x.std()

normed_train = norm(x_train)

normed_test = norm(x_test)

In [28]:
df.head()

Unnamed: 0,create_age(months),expiry_age(months),numSD,entropy,hasPercent,special,numerical,len_url,update_age(days)
0,-1,-1,1,-4.612783,0,0,21,70,-1
1,212,16,1,-3.819114,0,0,0,36,663
2,-1,-1,5,-4.370385,0,0,21,73,-1
3,198,6,1,-3.695501,0,0,0,31,186
4,240,24,1,-3.760017,0,0,0,31,1684


In [17]:
x_train.iloc[34].values

array([ 22.       ,   2.       ,   4.       ,  -4.4475274,   0.       ,
         0.       ,  17.       ,  64.       , 304.       ])

In [15]:
y_train

70      0
3586    0
1122    0
4569    1
34      1
       ..
1033    0
3264    0
1653    0
2607    1
2732    1
Name: Label, Length: 3839, dtype: int64

In [4]:
def build_model():
  model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu', kernel_initializer='he_normal', input_shape=[len(x_train.columns)]),
     tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(25, activation='relu', kernel_initializer='he_normal'),
     tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  return model

In [5]:
model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               1000      
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_2 (Dense)              (None, 25)                1275      
_________________________________________________________________
dropout_1 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 26        
Total params: 7,351
Trainable params: 7,351
Non-trainable params: 0
______________________________________________________

In [6]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.001, patience=5)

history = model.fit(normed_train, y_train, epochs=150, batch_size=32, validation_split=0.2, callbacks=[callback])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150


In [7]:
score = model.evaluate(normed_test, y_test)

# test_predictions = model.predict(normed_test).flatten()
# predictions = [int(round(value)) for value in test_predictions]
# print(predictions)



In [13]:
model.predict([[163.        ,   5.        ,   1.        ,  -3.36888407,
         0.        ,   0.        ,   0.        ,  25.        ,
       248.        ]])

array([[0.]], dtype=float32)

In [18]:
model.predict([[ 22.       ,   2.       ,   4.       ,  -4.4475274,   0.       ,
         0.       ,  17.       ,  64.       , 304.       ]])

array([[1.]], dtype=float32)