In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm , preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
def mean_score(scoring):
    return {i:j.mean() for i,j in scoring.items()}

In [5]:
# To load dataset

df = pd.read_csv("https://raw.githubusercontent.com/fafal-abnir/phishing_detection/master/dataset.csv",index_col=0)
df = sklearn.utils.shuffle(df)
X = df.drop("Result",axis=1).values
X = preprocessing.scale(X)
y = df['Result'].values
df.head()

Unnamed: 0_level_0,having_IPhaving_IP_Address,URLURL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
129,1,-1,1,1,1,1,0,-1,1,1,...,1,1,1,1,-1,-1,1,1,1,1
1622,1,-1,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,0,-1,1,0,1,-1
3818,1,-1,1,1,1,1,0,1,-1,1,...,1,1,-1,1,0,-1,1,0,1,1
9033,-1,-1,1,1,1,-1,0,-1,-1,1,...,-1,1,1,1,1,-1,1,1,1,1
8193,1,-1,-1,1,-1,-1,-1,-1,1,1,...,1,1,1,1,-1,-1,1,1,1,-1


In [6]:
#To evaluate the metrics using 10 fold-cross validation

scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'f1': 'f1'}
fold_count=10

In [7]:
# Support Vector Machine

# Linear
linear_clf = svm.SVC(kernel='linear')
cross_val_scores = cross_validate(linear_clf, X, y, cv=fold_count, scoring=scoring)
linear_svc_clf_score = mean_score(cross_val_scores)
print(linear_svc_clf_score)

#Polynomial
poly_clf = svm.SVC(kernel='poly')
cross_val_scores = cross_validate(poly_clf, X, y, cv=fold_count, scoring=scoring)
poly_svc_clf_score = mean_score(cross_val_scores)
print(poly_svc_clf_score)

#Radial Basis Function
rbf_clf = svm.SVC(kernel='rbf')
cross_val_scores = cross_validate(rbf_clf, X, y, cv=fold_count, scoring=scoring)
rbf_svc_clf_score = mean_score(cross_val_scores)
print(rbf_svc_clf_score)

#Sigmoid
sigmoid_clf = svm.SVC(kernel='sigmoid')
cross_val_scores = cross_validate(sigmoid_clf, X, y, cv=fold_count, scoring=scoring)
sigmoid_svc_clf_score = mean_score(cross_val_scores)
print(sigmoid_svc_clf_score)

{'fit_time': 2.665318822860718, 'score_time': 0.11417562961578369, 'test_accuracy': 0.9275442056082414, 'test_recall': 0.9459146341463415, 'test_precision': 0.9257383592763111, 'test_f1': 0.935690597998386}
{'fit_time': 2.189408779144287, 'score_time': 0.17542879581451415, 'test_accuracy': 0.9507015620269529, 'test_recall': 0.9710899060289304, 'test_precision': 0.9422282405947001, 'test_f1': 0.9564267525408949}
{'fit_time': 2.564968800544739, 'score_time': 0.3564657688140869, 'test_accuracy': 0.9520580461980312, 'test_recall': 0.9688158589378102, 'test_precision': 0.9464062370694087, 'test_f1': 0.95747213610874}
{'fit_time': 2.3182525634765625, 'score_time': 0.1924929141998291, 'test_accuracy': 0.8283149092158771, 'test_recall': 0.8471684616196811, 'test_precision': 0.8451870537575509, 'test_f1': 0.8460842041921323}


In [8]:
#Logistic Regression

logistic_clf=LogisticRegression(random_state=1)
cross_val_scores = cross_validate(logistic_clf, X, y, cv=fold_count, scoring=scoring)
logistic_clf_score = mean_score(cross_val_scores)
print(logistic_clf_score)

{'fit_time': 0.04031848907470703, 'score_time': 0.003963780403137207, 'test_accuracy': 0.9269103123235662, 'test_recall': 0.9438021328265229, 'test_precision': 0.9264829339155428, 'test_f1': 0.9350247046371776}


In [9]:
#Neural Network

neural_clf=MLPClassifier(hidden_layer_sizes=(33,),max_iter=500)
cross_val_scores = cross_validate(neural_clf, X, y, cv=fold_count, scoring=scoring)
neural_clf_score = mean_score(cross_val_scores)
print(neural_clf_score)

{'fit_time': 11.703775358200073, 'score_time': 0.004485607147216797, 'test_accuracy': 0.9703304885732287, 'test_recall': 0.97888554534896, 'test_precision': 0.9682226161801186, 'test_f1': 0.9735145336889334}


In [10]:
#Random Forest

rforest_clf=RandomForestClassifier()
cross_val_scores = cross_validate(rforest_clf, X, y, cv=fold_count, scoring=scoring)
rforest_clf_score = mean_score(cross_val_scores)
print(rforest_clf_score)

{'fit_time': 0.5757969856262207, 'score_time': 0.027023959159851074, 'test_accuracy': 0.9729537774213872, 'test_recall': 0.9816452856087002, 'test_precision': 0.9701968498155462, 'test_f1': 0.9758604184981138}


In [11]:
#Ada-Boost

adaboost_clf=AdaBoostClassifier()
cross_val_scores = cross_validate(adaboost_clf, X, y, cv=fold_count, scoring=scoring)
adaboost_clf_score = mean_score(cross_val_scores)
print(adaboost_clf_score)

{'fit_time': 0.31429741382598875, 'score_time': 0.013724303245544434, 'test_accuracy': 0.9366796494644596, 'test_recall': 0.954522753669095, 'test_precision': 0.9333627671864626, 'test_f1': 0.9438085434991758}


In [12]:
#Decision Tree

dtree_clf=DecisionTreeClassifier()
cross_val_scores = cross_validate(dtree_clf, X, y, cv=fold_count, scoring=scoring)
dtree_score = mean_score(cross_val_scores)
print(dtree_score)

{'fit_time': 0.021470141410827637, 'score_time': 0.0031986474990844727, 'test_accuracy': 0.9650845654717584, 'test_recall': 0.9683280540597614, 'test_precision': 0.9690260835542478, 'test_f1': 0.9686417678491177}


In [13]:
#K Nearest Neighbours

KNeighbors_clf=KNeighborsClassifier(3)
cross_val_scores = cross_validate(KNeighbors_clf, X, y, cv=fold_count, scoring=scoring)
KNeighbors_clf_score = mean_score(cross_val_scores)
print(KNeighbors_clf_score)

{'fit_time': 0.0020217418670654295, 'score_time': 0.24689629077911376, 'test_accuracy': 0.9491644915025408, 'test_recall': 0.9589077182979622, 'test_precision': 0.9503933502196666, 'test_f1': 0.9545916002232705}
