In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

In [2]:
data=pd.read_csv('PhishingDataset.csv')

data.head()

data.tail()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
11050,1,-1,1,-1,1,1,1,1,-1,-1,...,-1,-1,1,1,-1,-1,1,1,1,1
11051,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,1,1,1,1,1,1,-1,1,-1
11052,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,1,-1,1,0,1,-1
11053,-1,-1,1,1,1,-1,-1,-1,1,-1,...,-1,1,1,1,1,-1,1,1,1,-1
11054,-1,-1,1,1,1,-1,-1,-1,1,1,...,1,1,-1,1,-1,-1,-1,1,-1,-1


In [3]:
data['Result'].value_counts()


 1    6157
-1    4898
Name: Result, dtype: int64

In [5]:
phish=data[data.Result==1]
legit=data[data.Result==-1]

In [6]:
print(phish.shape)
print(legit.shape)

legit_sample=legit.sample(n=6157,replace=True)

data2=pd.concat([legit_sample,phish],axis=0)

data2['Result'].value_counts()

(6157, 31)
(4898, 31)


-1    6157
 1    6157
Name: Result, dtype: int64

In [7]:
X = data2.drop('Result', axis=1)
y = data2['Result']

In [9]:

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [10]:
# Decision Tree with Hyperparameter Tuning
tree_params = {'max_depth': [10, 20, 30]}
tree = DecisionTreeClassifier()
tree_grid = GridSearchCV(tree, tree_params, cv=5)
tree_grid.fit(X_train, y_train)
best_tree = tree_grid.best_estimator_

In [11]:
# SVM with Hyperparameter Tuning
svm_params = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf', 'linear']}
svm = SVC(probability=True)
svm_grid = GridSearchCV(svm, svm_params, cv=5)
svm_grid.fit(X_train, y_train)
best_svm = svm_grid.best_estimator_

In [13]:
# Logistic Regression
lrc = LogisticRegression(max_iter=250)
linear_model_result = cross_validate(lrc, X_train, y_train, cv=5)
lrc.fit(X_train, y_train)


In [14]:
# Ensemble Model
ensemble = VotingClassifier(estimators=[('Logistic', lrc), ('Decision', best_tree), ('SVC', best_svm)], voting='soft')
ensemble.fit(X_train, y_train)

In [15]:
# Evaluation
acc_train_ensemble = accuracy_score(y_train, ensemble.predict(X_train))
acc_test_ensemble = accuracy_score(y_test, ensemble.predict(X_test))

In [16]:
print("Ensemble Model: Accuracy on training Data: {:.3f}".format(acc_train_ensemble))
print("Ensemble Model: Accuracy on test Data: {:.3f}".format(acc_test_ensemble))

Ensemble Model: Accuracy on training Data: 0.987
Ensemble Model: Accuracy on test Data: 0.976
