# Libraries

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
import os

# Data

In [3]:
dataset = pd.read_csv('featureset.csv')

In [4]:
testset = pd.read_csv('testset.csv')

In [5]:
dataset.head()

Unnamed: 0,URL,Label,Domain Registration Length,Search Traffic,Bounce Rate,Total Sites Linking in,Alexa Rank,Daily Views per Visitor,Daily Time on Site,Length,Is IP,Count @,Count Double Slash,Count Hyphen,Count Dots,Count Delimeters,Count Subdirectory,Count Queries
0,https://nih.gov,0,-1,72.9,57.3,134142,0.008849558,3.17,237,15,0,0,1,0,1,0,2,1
1,http://www.perfumemart.com/,0,-1,-1.0,50.0,-1,1.925435e-07,-1.0,-1,27,0,0,1,0,2,0,3,1
2,http://chronopost-service-enligne.net/56123s/r...,1,-1,-1.0,-1.0,-1,-1.0,-1.0,-1,116,0,0,1,2,1,0,8,1
3,http://wvw.micloudappel.security-updates-myacc...,1,-1,-1.0,-1.0,-1,-1.0,-1.0,-1,75,0,0,1,2,3,0,5,1
4,http://colach8x.beget.tech/likon/mqnger/drwx/P...,1,4,5.1,45.7,7352,5.198586e-05,5.76,697,139,0,0,1,0,3,3,11,1


In [6]:
testset.head()

Unnamed: 0,URL,Domain Registration Length,Search Traffic,Bounce Rate,Total Sites Linking in,Alexa Rank,Daily Views per Visitor,Daily Time on Site,Length,Is IP,Count @,Count Double Slash,Count Hyphen,Count Dots,Count Delimeters,Count Subdirectory,Count Queries,Label
0,https://www.office.com,20,22.6,58.6,7285,0.019608,2.52,201,22,0,0,1,0,2,0,2,1,0
1,https://www.google.de,-1,5.3,37.3,52993,0.016129,5.27,240,21,0,0,1,0,2,0,2,1,0
2,https://www.tribunnews.com,9,75.0,49.7,39300,0.027027,2.65,261,26,0,0,1,0,2,0,2,1,0
3,https://www.microsoftonline.com,18,1.2,47.2,1106,0.030303,1.91,73,31,0,0,1,0,2,0,2,1,0
4,https://www.imdb.com,27,57.9,41.5,214337,0.02,4.07,209,20,0,0,1,0,2,0,2,1,0


In [7]:
print(dataset.shape, testset.shape)

(8000, 18) (99, 18)


In [8]:
X = dataset.drop(['URL', 'Label'], axis=1)
y = dataset['Label']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2887, random_state=101)

In [11]:
X_test = testset.drop(['URL','Label'], axis=1)
y_test = testset['Label']

# Machine Learning

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
model = LogisticRegression()
accuracies = cross_val_score(estimator=model, X=X, y=y, cv=10)
print("{} +- ({})".format(accuracies.mean(), accuracies.std()))

LR = LogisticRegression()
LR.fit(X, y)
y_pred = LR.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.885 +- (0.010796411440844587)
[[45  4]
 [ 1 49]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95        49
           1       0.92      0.98      0.95        50

   micro avg       0.95      0.95      0.95        99
   macro avg       0.95      0.95      0.95        99
weighted avg       0.95      0.95      0.95        99



## Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
model = RandomForestClassifier()
accuracies = cross_val_score(estimator=model, X=X, y=y, cv=10)
print("{} +- ({})".format(accuracies.mean(), accuracies.std()))

RFC = RandomForestClassifier()
RFC.fit(X, y)
y_pred = RFC.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9548749999999998 +- (0.006261239893184092)
[[48  1]
 [ 1 49]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        49
           1       0.98      0.98      0.98        50

   micro avg       0.98      0.98      0.98        99
   macro avg       0.98      0.98      0.98        99
weighted avg       0.98      0.98      0.98        99



# Model Pickeling

In [17]:
import pickle

In [18]:
DIR = 'E:/Models/URL-Classfier'
os.mkdir(DIR)

In [19]:
pickle.dump(LR, open(f'{DIR}/LR.pickle', 'wb'))

In [20]:
pickle.dump(RFC, open(f'{DIR}/RFC.pickle', 'wb'))

# Deeplearning

**Architecture**

16->80->400->20->1

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X)
X_scaled_test = scaler.transform(X_test)
pickle.dump(scaler, open(f'{DIR}/Scaler.pickle', 'wb'))

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [24]:
es = EarlyStopping(monitor='loss', mode='min', verbose=1)
filepath = f'{DIR}/model.h5'
ckpt = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [25]:
def build_network():
    model = Sequential()
    model.add(Dense(80,input_dim=16, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(400, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [26]:
model = build_network()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 80)                1360      
_________________________________________________________________
dropout_1 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 400)               32400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                8020      
_________________________________________________________________
dropout_3 (Dropout)  

In [27]:
model.fit(X_scaled_train, y, validation_split=0.2887, epochs=25, callbacks=[es, ckpt])

Instructions for updating:
Use tf.cast instead.
Train on 5690 samples, validate on 2310 samples
Epoch 1/25

Epoch 00001: loss improved from inf to 0.25861, saving model to E:/Models/URL-Classfier/model.h5
Epoch 2/25

Epoch 00002: loss improved from 0.25861 to 0.19882, saving model to E:/Models/URL-Classfier/model.h5
Epoch 3/25

Epoch 00003: loss improved from 0.19882 to 0.18667, saving model to E:/Models/URL-Classfier/model.h5
Epoch 4/25

Epoch 00004: loss improved from 0.18667 to 0.17661, saving model to E:/Models/URL-Classfier/model.h5
Epoch 5/25

Epoch 00005: loss improved from 0.17661 to 0.17191, saving model to E:/Models/URL-Classfier/model.h5
Epoch 6/25

Epoch 00006: loss improved from 0.17191 to 0.15936, saving model to E:/Models/URL-Classfier/model.h5
Epoch 7/25

Epoch 00007: loss did not improve from 0.15936
Epoch 00007: early stopping


<keras.callbacks.History at 0x2783bf02cc0>

In [28]:
y_pred = model.predict_classes(X_scaled_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[48  1]
 [ 1 49]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        49
           1       0.98      0.98      0.98        50

   micro avg       0.98      0.98      0.98        99
   macro avg       0.98      0.98      0.98        99
weighted avg       0.98      0.98      0.98        99

