In [1]:
import pandas as pd
import numpy as np
from numpy import array
from keras import layers
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Embedding,Dense
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from nltk.tokenize import word_tokenize
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv(r"urls_dataset.csv")

In [3]:
print(data)

                                                     url  label
0                  http://user57245.vs.speednames.com/-/      1
1      http://morningdiaries.com/css/account-limited/...      1
2                 http://keviso.xyz/ScreenDrop/index.php      1
3      http://hosseinabyaran.ir/wp-includes/in/update...      1
4      https://sites.google.com/site/eror404safety/?p...      1
...                                                  ...    ...
20123  https://stainupurworejo.ac.id/wp-includes/css/...      1
20124             http://ceftus.org/tag/paradise-papers/      0
20125    http://transaction-authorization.droppages.com/      1
20126  http://fujoho.jp/index.php?p=shop_repo_list&id...      0
20127                    http://jayamahehe.blogspot.com/      1

[20128 rows x 2 columns]


In [4]:
urls = data.iloc[:,0]
print(urls)

0                    http://user57245.vs.speednames.com/-/
1        http://morningdiaries.com/css/account-limited/...
2                   http://keviso.xyz/ScreenDrop/index.php
3        http://hosseinabyaran.ir/wp-includes/in/update...
4        https://sites.google.com/site/eror404safety/?p...
                               ...                        
20123    https://stainupurworejo.ac.id/wp-includes/css/...
20124               http://ceftus.org/tag/paradise-papers/
20125      http://transaction-authorization.droppages.com/
20126    http://fujoho.jp/index.php?p=shop_repo_list&id...
20127                      http://jayamahehe.blogspot.com/
Name: url, Length: 20128, dtype: object


In [5]:
labels = data.iloc[:,1]
print(labels)

0        1
1        1
2        1
3        1
4        1
        ..
20123    1
20124    0
20125    1
20126    0
20127    1
Name: label, Length: 20128, dtype: int64


In [6]:
 x_train, x_test, y_train, y_test = train_test_split(urls, labels, test_size=0.2)

In [7]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((16102,), (16102,), (4026,), (4026,))

In [8]:
def embedding_urls(urls):
    vocab_size = 70
    encoded_urls = [one_hot(url,vocab_size) for url in urls]
    print(f'Encoded URLs: {encoded_urls}')
    max_length = 256
    padded_urls = pad_sequences(encoded_urls,maxlen=max_length,padding='post')
    return padded_urls

In [9]:
x_train_padded = embedding_urls(x_train)
print(x_train_padded)
x_test_padded = embedding_urls(x_test)
print(x_test_padded)

Encoded URLs: [[27, 64, 32, 28, 3, 67, 57], [27, 10, 50, 23, 6, 28, 39], [27, 24, 23, 56, 38, 30, 43, 21, 2, 28, 59, 18], [27, 10, 23, 38, 11, 53, 66, 66, 22, 66, 36, 59], [27, 11, 23, 9, 19, 7, 45, 9, 19, 48, 15, 29, 67, 46, 60, 64, 67, 62, 64], [27, 65, 8, 48], [27, 20, 23, 25, 57, 57, 57, 57, 36, 57, 43, 28, 36, 23, 2, 50, 64, 50, 65], [27, 8, 36, 61, 51, 58], [27, 37, 31, 47, 2, 49, 10, 29, 27, 18, 51, 64], [27, 34, 10, 64, 16, 26, 62, 52], [27, 1, 21, 10], [27, 68, 45, 23, 33, 54, 68, 54, 37, 65, 24, 67], [27, 9, 23, 38, 11, 28], [27, 61, 28, 23, 44, 10], [27, 32, 23, 56, 9, 36, 11, 9], [27, 49, 38, 23, 29, 57, 48, 37, 37, 14, 68, 47, 22], [27, 31, 10, 9, 57, 11, 18, 58, 47, 37, 69, 67, 53, 9, 36, 22, 10, 46, 4, 64, 5, 18, 53, 60, 28], [27, 46, 4, 23], [27, 41, 57, 34, 44, 9, 36, 17, 11, 7, 49], [27, 49, 34, 63, 60, 28, 57, 27, 28, 51, 62, 30, 18, 30], [27, 15, 23, 52, 52, 47, 44, 5, 64], [27, 52, 35, 61, 58], [27, 26, 23, 38, 30, 40, 9, 62, 8, 28, 62, 49, 31, 15, 66, 55, 54, 15, 

In [10]:
model = Sequential()
embedding_layer = Embedding(input_dim=70,output_dim=16,input_length=256)
model.add(embedding_layer)
model.add(Reshape((128, 16, 2), input_shape=(256,)))
model.add(layers.Conv2D(8, (65, 16), activation="relu"))
model.add(layers.MaxPooling2D((2,2), padding='same'))
model.add(layers.Conv2D(16, (17,1), activation="relu"))
model.add(layers.MaxPooling2D((2,2), padding='same'))
model.add(layers.Conv2D(32, (1,1), activation="relu"))
model.add(layers.MaxPooling2D((8,8), padding='same'))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation="relu"))
model.add(layers.Dense(1,  activation="sigmoid"))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 16)           1120      
_________________________________________________________________
reshape (Reshape)            (None, 128, 16, 2)        0         
_________________________________________________________________
conv2d (Conv2D)              (None, 64, 1, 8)          16648     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 32, 1, 8)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 1, 16)         2192      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 1, 16)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 8, 1, 32)          5

In [11]:
model.fit(x_train_padded,y_train,batch_size = 64,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x2bf59e59700>

In [12]:
loss, acc = model.evaluate(x_train_padded, y_train)
print(acc)

0.9374611973762512


In [13]:
loss, acc = model.evaluate(x_test_padded, y_test)
print(acc)

0.8338301181793213
