# Libraries

In [1]:
import numpy as np
import pandas as pd

import warnings
# Install a process-wide "ignore" filter for every warning category.
# NOTE(review): because this is a blanket filter, any deprecation or
# convergence warning raised by later cells would be hidden as well --
# confirm that is intended, or narrow the filter to specific categories.
warnings.filterwarnings("ignore")

# Data

In [2]:
# Labelled feature table that the classifiers below are fitted on.
dataset = pd.read_csv(filepath_or_buffer='featureset.csv')

In [3]:
# Separately stored labelled sample that the fitted models are scored against.
testset = pd.read_csv(filepath_or_buffer='testset.csv')

In [4]:
# Preview the first five rows of the training table.
dataset.head(n=5)

Unnamed: 0,URL,Label,Domain Registration Length,Search Traffic,Bounce Rate,Total Sites Linking in,Alexa Rank,Daily Views per Visitor,Daily Time on Site,Length,Is IP,Count @,Count Double Slash,Count Hyphen,Count Dots,Count Delimeters,Count Subdirectory,Count Queries
0,https://nih.gov,0,-1,72.9,57.3,134142,0.008849558,3.17,237,15,0,0,1,0,1,0,2,1
1,http://www.perfumemart.com/,0,-1,-1.0,50.0,-1,1.925435e-07,-1.0,-1,27,0,0,1,0,2,0,3,1
2,http://chronopost-service-enligne.net/56123s/r...,1,-1,-1.0,-1.0,-1,-1.0,-1.0,-1,116,0,0,1,2,1,0,8,1
3,http://wvw.micloudappel.security-updates-myacc...,1,-1,-1.0,-1.0,-1,-1.0,-1.0,-1,75,0,0,1,2,3,0,5,1
4,http://colach8x.beget.tech/likon/mqnger/drwx/P...,1,4,5.1,45.7,7352,5.198586e-05,5.76,697,139,0,0,1,0,3,3,11,1


In [5]:
# Preview the first five rows of the test table.
testset.head(n=5)

Unnamed: 0,URL,Domain Registration Length,Search Traffic,Bounce Rate,Total Sites Linking in,Alexa Rank,Daily Views per Visitor,Daily Time on Site,Length,Is IP,Count @,Count Double Slash,Count Hyphen,Count Dots,Count Delimeters,Count Subdirectory,Count Queries,Label
0,https://www.office.com,20,22.6,58.6,7285,0.019608,2.52,201,22,0,0,1,0,2,0,2,1,0
1,https://www.google.de,-1,5.3,37.3,52993,0.016129,5.27,240,21,0,0,1,0,2,0,2,1,0
2,https://www.tribunnews.com,9,75.0,49.7,39300,0.027027,2.65,261,26,0,0,1,0,2,0,2,1,0
3,https://www.microsoftonline.com,18,1.2,47.2,1106,0.030303,1.91,73,31,0,0,1,0,2,0,2,1,0
4,https://www.imdb.com,27,57.9,41.5,214337,0.02,4.07,209,20,0,0,1,0,2,0,2,1,0


In [6]:
# Report (rows, columns) for both frames on a single line.
print(*(frame.shape for frame in (dataset, testset)))

(8000, 18) (99, 18)


In [7]:
# Feature matrix: every column except the raw URL string and the target.
X = dataset.drop(columns=['URL', 'Label'])
# Target vector taken from the 'Label' column.
y = dataset.loc[:, 'Label']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Carve a held-out evaluation slice out of the labelled data; the fixed
# random_state keeps the partition identical from run to run.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.2887,
)

In [10]:
# Build the external test matrix from exactly the training feature columns,
# in the training order. testset.csv keeps 'Label' in a different position
# than featureset.csv, so the two layouts are not guaranteed to agree; the
# features are therefore selected by name (X.columns) instead of relying on
# whatever order is left after dropping. A feature missing from the test file
# raises KeyError here rather than producing silently misaligned predictions
# (scikit-learn releases that do not validate feature names match by position).
X_test = testset.drop(['URL', 'Label'], axis=1)[X.columns]
# Ground-truth labels for the external test rows.
y_test = testset['Label']

# Machine Learning

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
# 10-fold cross-validated accuracy over the full labelled set.
model = LogisticRegression()
accuracies = cross_val_score(model, X, y, cv=10)
print("{} +- ({})".format(np.mean(accuracies), np.std(accuracies)))

# Refit a fresh estimator on every labelled row, then score the separate
# test set: confusion matrix first, per-class report second.
LR = LogisticRegression().fit(X, y)
y_pred = LR.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

0.885 +- (0.010796411440844587)
[[45  4]
 [ 1 49]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95        49
           1       0.92      0.98      0.95        50

   micro avg       0.95      0.95      0.95        99
   macro avg       0.95      0.95      0.95        99
weighted avg       0.95      0.95      0.95        99



### Random Forest Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Seed the forest so the cross-validation score, the test-set report and the
# model pickled further down are reproducible across kernel restarts; an
# unseeded forest draws different bootstrap samples / feature subsets on
# every run. 101 is the seed this notebook already uses for train_test_split.
model = RandomForestClassifier(random_state=101)
# 10-fold cross-validated accuracy over the full labelled set.
accuracies = cross_val_score(estimator=model, X=X, y=y, cv=10)
print("{} +- ({})".format(accuracies.mean(), accuracies.std()))

# Refit an identically configured forest on every labelled row, then score
# the separate test set.
RFC = RandomForestClassifier(random_state=101)
RFC.fit(X, y)
y_pred = RFC.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9553749999999999 +- (0.004712019206242674)
[[48  1]
 [ 1 49]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        49
           1       0.98      0.98      0.98        50

   micro avg       0.98      0.98      0.98        99
   macro avg       0.98      0.98      0.98        99
weighted avg       0.98      0.98      0.98        99



# Model Pickling

In [16]:
import pickle

In [17]:
# Serialize the fitted logistic-regression model to disk. The context manager
# guarantees the file handle is flushed and closed even if dump() raises,
# instead of leaving an anonymous open() handle to the garbage collector.
with open('LR.pickle', 'wb') as model_file:
    pickle.dump(LR, model_file)

In [18]:
# Serialize the fitted random-forest model to disk. The context manager
# guarantees the file handle is flushed and closed even if dump() raises,
# instead of leaving an anonymous open() handle to the garbage collector.
with open('RFC.pickle', 'wb') as model_file:
    pickle.dump(RFC, model_file)