In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the phishing dataset and discard every row that has a missing value,
# then display the resulting frame as the cell output.
df = pd.read_csv('phishData.csv').dropna()
df

Unnamed: 0,id,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,11051,1,-1,1,-1,1,1,1,1,-1,...,-1,-1,1,1,-1,-1,1,1,1,1
11051,11052,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,1,1,1,1,1,1,-1,1,-1
11052,11053,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,-1,1,0,1,-1
11053,11054,-1,-1,1,1,1,-1,-1,-1,1,...,-1,1,1,1,1,-1,1,1,1,-1


# Multivariate Linear Regression

In [8]:
# Predictors: three URL-based indicators as an (n, 3) array.
# Target: the "Result" column as an (n, 1) column vector.
x2 = df[['having_IP_Address', 'URL_Length', 'Shortining_Service']].to_numpy().reshape(-1, 3)
y = df["Result"].to_numpy().reshape(-1, 1)

# Sequential hold-out split: the first 7000 rows train the model and
# rows 7000-10999 evaluate it (anything past position 11000 is left out).
X_train, X_test = x2[:7000], x2[7000:11000]
Y_train, Y_test = y[:7000], y[7000:11000]

# Fit ordinary least squares on the training slice only.
model = LinearRegression().fit(X_train, Y_train)

# Predictions for every row (train, test and the unused tail alike).
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)

print("train score:", train_score)
print("test score:", test_score)

train score: 0.014558194891399978
test score: 0.03777587198895416


This model, trained with three variables ("IP address," "URL length," and "shortening service"), has low training and test scores. That means the R2 is low for both, so even though the test score is higher than the training score, it is likely that both can be improved.

In [9]:


# Single predictor ("having_IP_Address") as an (n, 1) array; target as (n, 1).
x2 = df[['having_IP_Address']].to_numpy().reshape(-1, 1)
y = df["Result"].to_numpy().reshape(-1, 1)

# Sequential hold-out split: rows 0-6999 for training, rows 7000-10999 for testing.
X_train, X_test = x2[:7000], x2[7000:11000]
Y_train, Y_test = y[:7000], y[7000:11000]

# Fit ordinary least squares on the training slice only.
model = LinearRegression().fit(X_train, Y_train)

# Predictions for every row of the feature matrix.
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)

print("train score:", train_score)
print("test score:", test_score)

train score: 0.0013365335363625386
test score: 0.013367359767574083


Training with only one variable ("IP address") against "phishing" lowers both the train and test scores.

In [10]:


# Single predictor ("SSLfinal_State") as an (n, 1) array; target as (n, 1).
x2 = df[['SSLfinal_State']].to_numpy().reshape(-1, 1)
y = df["Result"].to_numpy().reshape(-1, 1)

# Sequential hold-out split: rows 0-6999 for training, rows 7000-10999 for testing.
X_train, X_test = x2[:7000], x2[7000:11000]
Y_train, Y_test = y[:7000], y[7000:11000]

# Fit ordinary least squares on the training slice only.
model = LinearRegression().fit(X_train, Y_train)

# Predictions for every row of the feature matrix.
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)

print("train score:", train_score)
print("test score:", test_score)

train score: 0.5132554812346473
test score: 0.5060999454201263


Training with just the "SSL final state" variable against "phishing" brings the train and test score to around 50%, which is a big jump from the previous iterations.

In [11]:


# Predictors: SSL state plus the three URL-based indicators, as an (n, 4) array.
# Target: the "Result" column as an (n, 1) column vector.
x2 = df[['SSLfinal_State', 'having_IP_Address', 'URL_Length', 'Shortining_Service']].to_numpy().reshape(-1, 4)
y = df["Result"].to_numpy().reshape(-1, 1)

# Sequential hold-out split: rows 0-6999 for training, rows 7000-10999 for testing.
X_train, X_test = x2[:7000], x2[7000:11000]
Y_train, Y_test = y[:7000], y[7000:11000]

# Fit ordinary least squares on the training slice only.
model = LinearRegression().fit(X_train, Y_train)

# Predictions for every row of the feature matrix.
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)

print("train score:", train_score)
print("test score:", test_score)

train score: 0.5148485905842715
test score: 0.5129576710641851


Training with "SSL final state," "IP Address," "URL length," and "shortening service" against "phishing" did not increase the R2 significantly, although it did make the test score even more similar to the train score.

In [12]:
# Predictors: the four earlier columns plus "having_Sub_Domain", as an (n, 5) array.
# Target: the "Result" column as an (n, 1) column vector.
x2 = df[
    ['SSLfinal_State', 'having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_Sub_Domain']
].to_numpy().reshape(-1, 5)
y = df["Result"].to_numpy().reshape(-1, 1)

# Sequential hold-out split: rows 0-6999 for training, rows 7000-10999 for testing.
X_train, X_test = x2[:7000], x2[7000:11000]
Y_train, Y_test = y[:7000], y[7000:11000]

# Fit ordinary least squares on the training slice only.
model = LinearRegression().fit(X_train, Y_train)

# Predictions for every row of the feature matrix.
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)

print("train score:", train_score)
print("test score:", test_score)

train score: 0.5185354455274083
test score: 0.5361569402773461


Training with "SSL final state," "IP Address," "URL length," "shortening service," and "sub domain" against "phishing" did not increase the R2 significantly. Unlike the previous model, the gap between the two scores widened slightly: the test score (about 0.54) is now a little higher than the train score (about 0.52).

In [13]:


# Feature matrix: every predictor column in the frame (including `id`).
# The array width follows the column list directly, so there is no
# hand-maintained column count to keep in sync.
# NOTE(review): `id` looks like a row identifier rather than a property of
# the site - confirm it is meant to be used as a predictor.
x2 = df[[
    'id',
    'having_IP_Address',
    'URL_Length',
    'Shortining_Service',
    'having_At_Symbol',
    'double_slash_redirecting',
    'Prefix_Suffix',
    'having_Sub_Domain',
    'SSLfinal_State',
    'Domain_registeration_length',
    'Favicon',
    'port',
    'HTTPS_token',
    'Request_URL',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Submitting_to_email',
    'Abnormal_URL',
    'Redirect',
    'on_mouseover',
    'RightClick',
    'popUpWidnow',
    'Iframe',
    'age_of_domain',
    'DNSRecord',
    'web_traffic',
    'Page_Rank',
    'Google_Index',
    'Links_pointing_to_page',
    'Statistical_report',
]].to_numpy()

# Target: the "Result" column as an (n, 1) column vector.
y = df["Result"].to_numpy().reshape(-1, 1)

# Create a fresh estimator so this cell does not depend on a `model`
# object left over from an earlier cell.
model = LinearRegression()

# Sequential hold-out split: rows 0-6999 for training, rows 7000-10999 for testing.
X_train = x2[0:7000]
X_test = x2[7000:11000]

Y_train = y[0:7000]
Y_test = y[7000:11000]

# Fit on the training slice, then predict every row of the feature matrix.
model.fit(X_train, Y_train)
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
print("train score:", train_score)

test_score = model.score(X_test, Y_test)
print("test score:", test_score)

train score: 0.7111581199573556
test score: 0.6556720731683917


Using every variable against "phishing" brought the train and test scores up. It is important to note that the train score is about 6 percentage points higher than the test score (0.71 vs. 0.66), which could imply overfitting. This is generally the best model, though.

In [14]:
# Same as the all-features model, but without 'Prefix_Suffix'.

# Feature matrix: the array width follows the column list directly, so
# there is no hand-maintained column count to keep in sync.
# NOTE(review): `id` looks like a row identifier rather than a property of
# the site - confirm it is meant to be used as a predictor.
x2 = df[[
    'id',
    'having_IP_Address',
    'URL_Length',
    'Shortining_Service',
    'having_At_Symbol',
    'double_slash_redirecting',
    'having_Sub_Domain',
    'SSLfinal_State',
    'Domain_registeration_length',
    'Favicon',
    'port',
    'HTTPS_token',
    'Request_URL',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Submitting_to_email',
    'Abnormal_URL',
    'Redirect',
    'on_mouseover',
    'RightClick',
    'popUpWidnow',
    'Iframe',
    'age_of_domain',
    'DNSRecord',
    'web_traffic',
    'Page_Rank',
    'Google_Index',
    'Links_pointing_to_page',
    'Statistical_report',
]].to_numpy()

# Target: the "Result" column as an (n, 1) column vector.
y = df["Result"].to_numpy().reshape(-1, 1)

# Create a fresh estimator so this cell does not depend on a `model`
# object left over from an earlier cell.
model = LinearRegression()

# Sequential hold-out split: rows 0-6999 for training, rows 7000-10999 for testing.
X_train = x2[0:7000]
X_test = x2[7000:11000]

Y_train = y[0:7000]
Y_test = y[7000:11000]

# Fit on the training slice, then predict every row of the feature matrix.
model.fit(X_train, Y_train)
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
print("train score:", train_score)

test_score = model.score(X_test, Y_test)
print("test score:", test_score)

train score: 0.7067000672808683
test score: 0.6485006930688735


Doing all variables except for "prefix-suffix" only decreased the R2 slightly.

In [15]:
# Same as the all-features model, but without 'Prefix_Suffix',
# 'having_At_Symbol', 'Abnormal_URL' and 'Page_Rank'.

# Feature matrix: the array width follows the column list directly, so
# there is no hand-maintained column count to keep in sync.
# NOTE(review): `id` looks like a row identifier rather than a property of
# the site - confirm it is meant to be used as a predictor.
x2 = df[[
    'id',
    'having_IP_Address',
    'URL_Length',
    'Shortining_Service',
    'double_slash_redirecting',
    'having_Sub_Domain',
    'SSLfinal_State',
    'Domain_registeration_length',
    'Favicon',
    'port',
    'HTTPS_token',
    'Request_URL',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Submitting_to_email',
    'Redirect',
    'on_mouseover',
    'RightClick',
    'popUpWidnow',
    'Iframe',
    'age_of_domain',
    'DNSRecord',
    'web_traffic',
    'Google_Index',
    'Links_pointing_to_page',
    'Statistical_report',
]].to_numpy()

# Target: the "Result" column as an (n, 1) column vector.
y = df["Result"].to_numpy().reshape(-1, 1)

# Create a fresh estimator so this cell does not depend on a `model`
# object left over from an earlier cell.
model = LinearRegression()

# Sequential hold-out split: rows 0-6999 for training, rows 7000-10999 for testing.
X_train = x2[0:7000]
X_test = x2[7000:11000]

Y_train = y[0:7000]
Y_test = y[7000:11000]

# Fit on the training slice, then predict every row of the feature matrix.
model.fit(X_train, Y_train)
y_pred = model.predict(x2)

# R^2 on each slice.
train_score = model.score(X_train, Y_train)
print("train score:", train_score)

test_score = model.score(X_test, Y_test)
print("test score:", test_score)

train score: 0.7039528479171844
test score: 0.6446899852449279


Doing all variables except for "prefix-suffix," "at symbol," "abnormal URL," and "page rank" only decreased the R2 slightly. In summary, doing all variables leads to the best model. Whenever I removed a variable to decrease overfitting, the test score decreased even more.