In [None]:
import pandas as pd
import numpy as np
from numpy import array
from keras import layers
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Embedding,Dense
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from nltk.tokenize import word_tokenize
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


In [None]:
from google.colab import files
uploaded = files.upload()

Saving urls_dataset.csv to urls_dataset.csv


In [None]:
data = pd.read_csv(r"urls_dataset.csv")
print(data)

                                                     url  label
0                  http://user57245.vs.speednames.com/-/      1
1      http://morningdiaries.com/css/account-limited/...      1
2                 http://keviso.xyz/ScreenDrop/index.php      1
3      http://hosseinabyaran.ir/wp-includes/in/update...      1
4      https://sites.google.com/site/eror404safety/?p...      1
...                                                  ...    ...
20123  https://stainupurworejo.ac.id/wp-includes/css/...      1
20124             http://ceftus.org/tag/paradise-papers/      0
20125    http://transaction-authorization.droppages.com/      1
20126  http://fujoho.jp/index.php?p=shop_repo_list&id...      0
20127                    http://jayamahehe.blogspot.com/      1

[20128 rows x 2 columns]


In [None]:
urls = data.iloc[:,0]
print(urls)

0                    http://user57245.vs.speednames.com/-/
1        http://morningdiaries.com/css/account-limited/...
2                   http://keviso.xyz/ScreenDrop/index.php
3        http://hosseinabyaran.ir/wp-includes/in/update...
4        https://sites.google.com/site/eror404safety/?p...
                               ...                        
20123    https://stainupurworejo.ac.id/wp-includes/css/...
20124               http://ceftus.org/tag/paradise-papers/
20125      http://transaction-authorization.droppages.com/
20126    http://fujoho.jp/index.php?p=shop_repo_list&id...
20127                      http://jayamahehe.blogspot.com/
Name: url, Length: 20128, dtype: object


In [None]:
labels = data.iloc[:,1]
print(labels)

0        1
1        1
2        1
3        1
4        1
        ..
20123    1
20124    0
20125    1
20126    0
20127    1
Name: label, Length: 20128, dtype: int64


In [None]:
model = Sequential()
embedding_layer = Embedding(input_dim=70,output_dim=16,input_length=256)
model.add(embedding_layer)
model.add(Reshape((1, 256, 16)))
model.add(layers.Conv2D(64, (1, 8), activation="relu", input_shape = (128, 16, 1)))
model.add(layers.MaxPooling2D((2,2), padding='same'))
model.add(layers.Conv2D(16, (1,16), activation="relu"))
model.add(layers.MaxPooling2D((2,2), padding='same'))
model.add(layers.Conv2D(8, (1,32), activation="relu"))
model.add(layers.MaxPooling2D((2,2), padding='same'))

model.add(layers.Flatten())
model.add(layers.Dense(10, activation="relu"))
model.add(layers.Dense(1,  activation="sigmoid"))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 256, 16)           1120      
                                                                 
 reshape (Reshape)           (None, 1, 256, 16)        0         
                                                                 
 conv2d (Conv2D)             (None, 1, 249, 64)        8256      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, 125, 64)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 1, 110, 16)        16400     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 1, 55, 16)        0         
 2D)                                                    

In [None]:
def embedding_urls(urls):
    vocab_size = 70
    encoded_urls = [one_hot(url,vocab_size) for url in urls]
#     print(f'Encoded URLs: {encoded_urls}')
    max_length = 256
    padded_urls = pad_sequences(encoded_urls,maxlen=max_length,padding='post')
    return padded_urls

In [None]:
def evaluate_model(X_test_padded,y_test):
    loss, acc = model.evaluate(X_test_padded,y_test)
    return acc

In [None]:
kf = KFold(n_splits=5)
accuracies = []

for train_index, test_index in kf.split(urls):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = urls[train_index], urls[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    X_train_padded = embedding_urls(X_train)
    X_test_padded = embedding_urls(X_test)
    model.fit(X_train_padded,y_train,batch_size = 64,epochs=100)
    accuracy = evaluate_model(X_test_padded,y_test)
    print(accuracy)
    accuracies.append(accuracy)

TRAIN: [ 4026  4027  4028 ... 20125 20126 20127] TEST: [   0    1    2 ... 4023 4024 4025]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71

In [None]:
print(accuracies)

[0.8539493083953857, 0.9234972596168518, 0.9418777823448181, 0.9515528082847595, 0.9619875550270081]


In [None]:
print(sum(accuracies)/len(accuracies))

0.9265729427337647
