# Libraries

In [1]:
import numpy as np
import pandas as pd

# Data

In [2]:
dataset = pd.read_csv('featureset.csv')

In [3]:
testset = pd.read_csv('testset.csv')

In [4]:
dataset.head()

Unnamed: 0,URL,length,isIp,countAt,countDoubleSlash,countHyphen,countDots,countDelimeters,countSubDirectory,countQueries,domainRegistrationLength,alexaGlobalRank,bounceRate,dailyPageViewsPerVisitor,dailyTimeOnSite,searchVisits,totalSitesLinkingIn,label
0,https://pages.tmall.com,23,0,0,1,0,2,0,2,1,23,0.03125,-1.0,-1.0,-1,-1.0,-1.0,0
1,http://habrahabr.ru,19,0,0,1,0,1,0,2,1,12,-1.0,-1.0,-1.0,-1,-1.0,-1.0,0
2,https://ebay.de,15,0,0,1,0,1,0,2,1,-1,0.007092,-1.0,-1.0,-1,-1.0,-1.0,0
3,http://account-chek-police.000webhostapp.com/p...,131,0,0,1,4,3,11,3,3,5,-1.0,-1.0,-1.0,-1,-1.0,-1.0,1
4,http://aprecnotis.clan.su/recovery-login.htm,44,0,0,1,1,3,0,3,1,12,-1.0,-1.0,-1.0,-1,-1.0,-1.0,1


In [5]:
testset.head()

Unnamed: 0,length,isIp,countAt,countDoubleSlash,countHyphen,countDots,countDelimeters,countSubDirectory,countQueries,domainRegistrationLength,alexaGlobalRank,bounceRate,dailyPageViewsPerVisitor,dailyTimeOnSite,searchVisits,totalSitesLinkingIn,label
0,39.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
1,31.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
2,34.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
3,37.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
4,33.0,0.0,0.0,1.0,0.0,2.0,0.0,2.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1


In [6]:
X = dataset.drop(['URL', 'label'], axis=1)
y = dataset['label']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2887, random_state=101)

In [9]:
X_test = testset.drop(['label'], axis=1)
y_test = testset['label']

# Scaled Features

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
scaler = MinMaxScaler()
scaler.fit(X)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [12]:
Xs = scaler.transform(X)

In [13]:
Xs_train, Xs_eval, ys_train, ys_eval = train_test_split(Xs, y, test_size=0.2887, random_state=101)

In [14]:
Xs_test = scaler.transform(X_test)

In [15]:
ys_test = y_test

# Machine Learning

In [16]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
model = LogisticRegression()
accuracies = cross_val_score(estimator=model, X=Xs, y=y, cv=10)
print("{} +- ({})".format(accuracies.mean(), accuracies.std()))

LR1 = LogisticRegression()
LR1.fit(X, y)
y_pred = LR1.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9419762481263693 +- (0.00618535524332067)
[[38 11]
 [ 0 50]]
             precision    recall  f1-score   support

          0       1.00      0.78      0.87        49
          1       0.82      1.00      0.90        50

avg / total       0.91      0.89      0.89        99



In [19]:
model = LogisticRegression()
accuracies = cross_val_score(estimator=model, X=Xs, y=y, cv=10)
print("{} +- ({})".format(accuracies.mean(), accuracies.std()))

LRS1 = LogisticRegression()
LRS1.fit(Xs, y)
y_pred = LRS1.predict(Xs_test)
print(confusion_matrix(ys_test, y_pred))
print(classification_report(ys_test, y_pred))

0.9419762481263693 +- (0.00618535524332067)
[[49  0]
 [19 31]]
             precision    recall  f1-score   support

          0       0.72      1.00      0.84        49
          1       1.00      0.62      0.77        50

avg / total       0.86      0.81      0.80        99



### Random Forest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
model = RandomForestClassifier()
accuracies = cross_val_score(estimator=model, X=Xs, y=y, cv=10)
print("{} +- ({})".format(accuracies.mean(), accuracies.std()))

RFC1 = RandomForestClassifier()
RFC1.fit(X, y)
y_pred = RFC1.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9514758445751182 +- (0.00653546592167242)
[[49  0]
 [15 35]]
             precision    recall  f1-score   support

          0       0.77      1.00      0.87        49
          1       1.00      0.70      0.82        50

avg / total       0.88      0.85      0.85        99



# Model Pickeling

In [22]:
import pickle

In [23]:
pickle.dump(LR1, open('LR.pickle', 'wb'))

In [24]:
pickle.dump(LRS1, open('LRS.pickle', 'wb'))

In [25]:
pickle.dump(RFC1, open('RFC.pickle', 'wb'))

In [26]:
pickle.dump(scaler, open('scaler.pickle', 'wb'))