In [2]:
# Importing libraries
import numpy as np
import pandas as pd
import os

# Visualisation
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Extracting URL informaation
from urllib.parse import urlparse
import tldextract

# Preprocessing
from IPython.display import display 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import category_encoders as ce
from sklearn.metrics import confusion_matrix, classification_report

# ANN
from tensorflow.keras import Sequential, metrics
from tensorflow.keras.layers import Dense

ModuleNotFoundError: No module named 'plotly'

In [17]:
# # Data files
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [18]:
# Loading data
dataset = pd.read_csv('archive/Malicious And Benign URLs.csv')

In [19]:
# Loading OpenPhish Testing Data
# The dataset contains malicious sites only.
openphish_sites = pd.read_csv('other/openphish.csv', names=['url'])

In [20]:
dataset.head()

Unnamed: 0,url,label
0,https://www.google.com,benign
1,https://www.youtube.com,benign
2,https://www.facebook.com,benign
3,https://www.baidu.com,benign
4,https://www.wikipedia.org,benign


In [66]:
test = pd.read_csv('test.csv')

In [67]:
test.head()

Unnamed: 0,url
0,http://testphp.vulnweb.com/
1,https://www.youtube.com/
2,http://172.19.17.212/dvwa/vulnerabilities/xss_...


# Data Analysis

In [23]:
fig = go.Figure([go.Pie(labels=['Benign', 'Malicious'], values=dataset.label.value_counts())])
fig.update_layout(title='Percentage of Class (Benign and Malicious)')
fig.show()

In [24]:
# Dataset information
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   label   450176 non-null  object
dtypes: object(2)
memory usage: 6.9+ MB


In [25]:
# Checking NULL values
dataset.isnull().sum()

url      0
label    0
dtype: int64

# Preprocessing
#### Features to be extracted
<ul>
    <li>SubDomain</li>
    <li>Domain</li>
    <li>Suffix</li>
    <li>Scheme length</li>
    <li>Path length</li>
    <li>Parameter length</li>
    <li>Query length</li>
    <li>Fragment length</li>
    <li>Count of '-'</li>
    <li>Count of '&'</li>
    <li>Count of '?'</li>
    <li>Count of '%'</li>
    <li>Count of '.'</li>
    <li>Count of digits</li>
    <li>Count of alphabets</li>
</ul>

In [26]:
# Sample Website URL information
site ="https://www.google.co.in/?gws_rd=ssl"
print(tldextract.extract(site))
print(urlparse(site))

ExtractResult(subdomain='www', domain='google', suffix='co.in')
ParseResult(scheme='https', netloc='www.google.co.in', path='/', params='', query='gws_rd=ssl', fragment='')


## Functions

In [27]:
# Feature Extraction Functions
def extract_domains(row):
#     domain_info contains ['subdomain', 'domain', 'suffix']
#     len_info contains ['scheme_len', 'url_len', 'path_len', 'param_len', 'query_len', 'frag_len']
    domain_info = list('NA' if extracted == '' else extracted for extracted in tldextract.extract(row['url']))
    len_info = list(map(len, urlparse(row['url'])[:]))
    symbols = ['-', '@', '?', '%', '.']
    count_info = []
    for sym in symbols:
        count_info.append(row['url'].count(sym))
    count_info.append(sum(i.isdigit() for i in row['url']))
    count_info.append(sum(i.isalpha() for i in row['url']))
    return domain_info + len_info + count_info

def extract_url(data):
#     result_type='expand' causes return type to be pd.Dataframe instead of pd.Series
    url_component = data.apply(extract_domains, axis='columns', result_type='expand')
#     Changing column names
    extracted_columns = ['subdomain', 'domain', 'suffix', 
                         'scheme_len', 'url_len', 'path_len', 'param_len', 'query_len', 'frag_len', 
                         'count-', 'count@', 'count?', 'count%', 'count.', 'count_digit', 'count_alpha']
    url_component.columns = extracted_columns
#     display(url_component.head())
#     better than concat, because on multiple execution of new_extract_url(), 
#     multiple cols of same type won't we added
    for col in extracted_columns:
        data[col] = url_component[col]
    return data

In [28]:
# Graph Plotting Functions
def get_frequent_group(data, n_group):
    # get the most frequent
    data = data.value_counts().reset_index(name='values')
    
    # scale log base 10
    data['values'] = np.log10(data['values'])
    
    # calculate total values
    # x_column (subdomain / domain / domain_suffix)
    x_column = data.columns[1]
    data['total_values'] = data[x_column].map(data.groupby(x_column)['values'].sum().to_dict())
    
    # get n_group data order by highest values
    data_group = data.sort_values('total_values', ascending=False).iloc[:, 1].unique()[:n_group]
    data = data[data.iloc[:, 1].isin(data_group)]
    data = data.sort_values('total_values', ascending=False)
    return data

def plot(data, n_group, title):
    data = get_frequent_group(data, n_group)
    fig = px.bar(data, x=data.columns[1], y='values', color='label')
    fig.update_layout(title=title)
    fig.show()

In [29]:
%%time
# Extracting information about URLs
data = extract_url(dataset)

Wall time: 59.1 s


In [30]:
# Extracted Data 
data.head()

Unnamed: 0,url,label,subdomain,domain,suffix,scheme_len,url_len,path_len,param_len,query_len,frag_len,count-,count@,count?,count%,count.,count_digit,count_alpha
0,https://www.google.com,benign,www,google,com,5,14,0,0,0,0,0,0,0,0,2,0,17
1,https://www.youtube.com,benign,www,youtube,com,5,15,0,0,0,0,0,0,0,0,2,0,18
2,https://www.facebook.com,benign,www,facebook,com,5,16,0,0,0,0,0,0,0,0,2,0,19
3,https://www.baidu.com,benign,www,baidu,com,5,13,0,0,0,0,0,0,0,0,2,0,16
4,https://www.wikipedia.org,benign,www,wikipedia,org,5,17,0,0,0,0,0,0,0,0,2,0,20


## Data Visualisation

In [31]:
# Number of unique Domains, Sub-Domains, Domain-suffix
fig = go.Figure(
    data=[
        go.Bar(
        x=['Domain', 'SubDomain', 'Suffix'],
        y=[data['domain'].nunique(), data['subdomain'].nunique(), data['suffix'].nunique()])
    ],
    layout=go.Layout(
        title="Number of unique Domains, Sub-Domains, Domain-suffix")
    )
fig.show()

In [32]:
plot(
    data=data.groupby('label')['domain'], 
    n_group=20, 
    title='Top 20 Domains Grouped By Labels (Logarithmic Scale)'
)

In [33]:
plot(
    data=data.groupby('label')['subdomain'], 
    n_group=20, 
    title='Top 20 Sub Domains Grouped By Labels (Logarithmic Scale)'
)

In [34]:
plot(
    data=data.groupby('label')['suffix'], 
    n_group=20, 
    title='Top 20 Domains Suffix Grouped By Labels (Logarithmic Scale)'
)

In [35]:
# Separating dependent and independent varible
X = data.drop(columns=['label'], inplace=False)
y = data.loc[:, 'label']

In [36]:
# Encoding labels
# Malicious: 1 and Benign: 0
y = (y == 'malicious').astype('int')

In [37]:
# Splitting data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

In [38]:
X_train

Unnamed: 0,url,subdomain,domain,suffix,scheme_len,url_len,path_len,param_len,query_len,frag_len,count-,count@,count?,count%,count.,count_digit,count_alpha
337108,https://www.youtube.com/watch?v=vrz-oJLZaLY,www,youtube,com,5,15,6,0,13,0,1,0,1,0,2,0,34
91816,https://www.radaris.com/p/Therese/Monplaisir/,www,radaris,com,5,15,22,0,0,0,0,0,0,0,2,0,36
2230,https://www.thisav.com,www,thisav,com,5,14,0,0,0,0,0,0,0,0,2,0,17
95099,https://www.s222.photobucket.com/albums/dd218/...,www.s222,photobucket,com,5,24,39,0,39,0,0,0,1,0,4,6,88
377328,http://cclpgms.com/js?_Acess_Tooken792e070ef1e...,,cclpgms,com,4,11,3,0,150,0,1,0,1,0,5,41,105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73349,https://www.monstercockfreaks.com/,www,monstercockfreaks,com,5,25,1,0,0,0,0,0,0,0,2,0,28
371403,http://shredcases.com.br/Adobe/,,shredcases,com.br,4,17,7,0,0,0,0,0,0,0,2,0,24
312201,https://www.tvguide.com/celebrities/curt-conwa...,www,tvguide,com,5,15,31,0,0,0,1,0,0,0,2,6,39
267336,https://www.ocpm.edu/?page=home-links,www,ocpm,edu,5,12,1,0,15,0,1,0,1,0,2,0,28


In [39]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(270105, 17)
(270105,)
(180071, 17)
(180071,)


In [40]:
# Handling categorical features
count_encoder = ce.CountEncoder(handle_unknown=0)
X_train.iloc[:, 1:4] = count_encoder.fit_transform(X_train.iloc[:, 1:4])
X_test.iloc[:, 1:4] = count_encoder.transform(X_test.iloc[:, 1:4])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [41]:
# Scaling of data
cols = X_train.columns
scaler_x = StandardScaler()
X_train = pd.DataFrame(scaler_x.fit_transform(X_train.iloc[:, 1:]), columns=cols[1:])
X_test = pd.DataFrame(scaler_x.transform(X_test.iloc[:, 1:]), columns=cols[1:])
del cols

In [42]:
# Correlation between dependent variables
fig = go.Figure(go.Heatmap(x = X_train.columns, 
                 y = X_train.columns,
                 z = X_train.corr()))
fig.update_layout(title="Correlation between dependent variables")
fig.show()

In [43]:
# Preprocessing OpenPhish Data (unseen data)
openphish_data = extract_url(openphish_sites)
display(openphish_data.head())
openphish_data.iloc[:, 1:4] = count_encoder.transform(openphish_data.iloc[:, 1:4])
# display(openphish_data.info())
openphish_data = pd.DataFrame(scaler_x.transform(openphish_data.iloc[:, 1:]), columns=openphish_data.columns[1:])
print(openphish_data)
openphish_data_label = np.ones(shape=(len(openphish_data), 1), dtype='int')

Unnamed: 0,url,subdomain,domain,suffix,scheme_len,url_len,path_len,param_len,query_len,frag_len,count-,count@,count?,count%,count.,count_digit,count_alpha
0,http://watshap-join.pubgproductions.com/login.php,watshap-join,pubgproductions,com,4,32,10,0,0,0,1,0,0,0,3,0,41
1,http://condescending-hamilton-b3e79b.netlify.app/,condescending-hamilton-b3e79b,netlify,app,4,41,1,0,0,0,2,0,0,0,2,3,38
2,https://kind-mayer-de4be4.netlify.app/,kind-mayer-de4be4,netlify,app,5,29,1,0,0,0,2,0,0,0,2,2,28
3,https://vigilant-austin-37062a.netlify.app/,vigilant-austin-37062a,netlify,app,5,34,1,0,0,0,2,0,0,0,2,5,30
4,http://wagrupp.group18online.com/login.php,wagrupp,group18online,com,4,25,10,0,0,0,0,0,0,0,3,2,33


      subdomain    domain    suffix  scheme_len   url_len  path_len  \
0     -1.475193 -0.427109  0.651421   -1.904798  1.854009 -0.701173   
1     -1.475193 -0.382614 -1.635974   -1.904798  3.172285 -1.063395   
2     -1.475193 -0.382614 -1.635974    0.524325  1.414584 -1.063395   
3     -1.475193 -0.382614 -1.635974    0.524325  2.146959 -1.063395   
4     -1.475193 -0.427109  0.651421   -1.904798  0.828684 -0.701173   
...         ...       ...       ...         ...       ...       ...   
6302   0.783332 -0.427109  0.651421    0.524325  2.000484 -0.701173   
6303   0.783332 -0.427109  0.651421    0.524325  1.854009 -0.701173   
6304   0.783332 -0.427109  0.651421    0.524325  1.414584 -0.701173   
6305  -0.908239 -0.427109  0.651421    0.524325  0.535734 -0.701173   
6306  -0.908239 -0.427109  0.651421    0.524325 -0.489592 -1.063395   

      param_len  query_len  frag_len    count-    count@    count?    count%  \
0     -0.011467  -0.201028 -0.018305 -0.097519 -0.071638 -0.385288 

# Model Training

In [44]:
%%time
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

Wall time: 2.19 s


LogisticRegression()

In [45]:
%%time
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=250, verbose=1, n_jobs=-1, random_state=14)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   44.0s


Wall time: 56.7 s


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   56.3s finished


RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=14, verbose=1)

In [46]:
%%time
# Support Vector Machine
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

Wall time: 5min


SVC()

## Model Testing

In [47]:
%%time
# Logistic Regression model testing
lr_y_pred = lr.predict(X_test)
print("--Logistic Regression--")
print("Confusion Matrix: \n", confusion_matrix(y_test,lr_y_pred))
lr_class_report = classification_report(y_test, lr_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_lr_class_report = classification_report(openphish_data_label, 
                                                  lr.predict(openphish_data), 
                                                  output_dict=True, zero_division=0)

--Logistic Regression--
Confusion Matrix: 
 [[138086    251]
 [  1872  39862]]
Wall time: 577 ms


In [48]:
%%time
# Random Forest model testing
rf_y_pred = rf.predict(X_test)
print("--Random Forest--")
print("Confusion Matrix: \n", confusion_matrix(y_test, rf_y_pred))
rf_class_report = classification_report(y_test, rf_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_rf_class_report = classification_report(openphish_data_label, 
                                                  rf.predict(openphish_data), 
                                                  output_dict=True, zero_division=0)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    4.2s finished


--Random Forest--
Confusion Matrix: 
 [[138239     98]
 [   440  41294]]


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    0.1s finished


Wall time: 5.13 s


In [49]:
%%time
# Support Vector Machine model testing
svc_y_pred = svc.predict(X_test)
print("--Support Vector Machine--")
print("Confusion Matrix: \n", confusion_matrix(y_test, svc_y_pred))
svc_class_report = classification_report(y_test, svc_y_pred, output_dict=True)

# Classification report on Openphish Data
openphish_svc_class_report = classification_report(openphish_data_label, 
                                                   svc.predict(openphish_data), 
                                                   output_dict=True, zero_division=0)

--Support Vector Machine--
Confusion Matrix: 
 [[138054    283]
 [   823  40911]]
Wall time: 2min 11s


## Model Comparision

In [51]:
# Comparing model performance on testing data
models = ['Logistic Regresiion', 'Random Forest', 'Support Vector Machine']
class_reports = [lr_class_report, rf_class_report, svc_class_report]
model_compare = []
for index, report in enumerate(class_reports):
    model_compare.append([models[index], report['accuracy']])
    model_compare[index].extend(list(report['1'].values())[:-1])
# Converting list into dataframe
model_compare = pd.DataFrame(model_compare, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F-Score'])
model_compare.set_index('Model', inplace=True)
print("Classification Model comparison on Dataset")
display(model_compare)

Classification Model comparison on Dataset


Unnamed: 0_level_0,Accuracy,Precision,Recall,F-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regresiion,0.98821,0.993743,0.955144,0.974061
Random Forest,0.997012,0.997632,0.989457,0.993528
Support Vector Machine,0.993858,0.99313,0.98028,0.986663


In [53]:
# Comparing model performance on openphish data
models = ['Logistic Regresiion', 'Random Forest', 'Support Vector Machine']
openphish_class_reports = [openphish_lr_class_report, openphish_rf_class_report, openphish_svc_class_report]
openphish_model_compare = []
for index, report in enumerate(openphish_class_reports):
    openphish_model_compare.append([models[index], report['accuracy']])
    openphish_model_compare[index].extend(list(report['1'].values())[:-1])
# Converting list into dataframe
openphish_model_compare = pd.DataFrame(openphish_model_compare, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F-Score'])
openphish_model_compare.set_index('Model', inplace=True)
print("Classification Model comparison on OpenPhish Dataset")
display(openphish_model_compare)

Classification Model comparison on OpenPhish Dataset


Unnamed: 0_level_0,Accuracy,Precision,Recall,F-Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regresiion,0.651023,1.0,0.651023,0.78863
Random Forest,0.924528,1.0,0.924528,0.960784
Support Vector Machine,0.771841,1.0,0.771841,0.87123


In [54]:
fig = go.Figure()
for col in model_compare.columns:
    fig.add_trace(
        go.Bar(
            x=model_compare.index,
            y=model_compare[col],
            name=col,))
fig.update_layout(title='Classification Model Comparision on Testing Data', 
                  xaxis={'title': 'Models'}, yaxis={'title': 'Score'})
fig.update_yaxes(range=[0.93, 1])
fig.show()

In [55]:
fig = go.Figure()
for col in openphish_model_compare.columns:
    fig.add_trace(
        go.Bar(
            x=openphish_model_compare.index,
            y=openphish_model_compare[col],
            name=col,))
fig.update_layout(title='Classification Model Comparision on Openphish Data', 
                  xaxis={'title': 'Models'}, yaxis={'title': 'Score'})
fig.update_yaxes(range=[0.4, 1])
fig.show()

##### Testing with some vulnerable sites

In [68]:
data = extract_url(test)

In [69]:
data

Unnamed: 0,url,subdomain,domain,suffix,scheme_len,url_len,path_len,param_len,query_len,frag_len,count-,count@,count?,count%,count.,count_digit,count_alpha
0,http://testphp.vulnweb.com/,testphp,vulnweb,com,4,19,1,0,0,0,0,0,0,0,2,0,21
1,https://www.youtube.com/,www,youtube,com,5,15,1,0,0,0,0,0,0,0,2,0,18
2,http://172.19.17.212/dvwa/vulnerabilities/xss_...,,172.19.17.212,,4,13,28,0,5,0,0,0,1,0,3,10,31


In [70]:
test_data = extract_url(test)
display(test_data.head())
test_data.iloc[:, 1:4] = count_encoder.transform(test_data.iloc[:, 1:4])
# display(openphish_data.info())
test_data = pd.DataFrame(scaler_x.transform(test_data.iloc[:, 1:]), columns=test_data.columns[1:])
print(test_data)
test_data_label = np.ones(shape=(len(test_data), 1), dtype='int')

Unnamed: 0,url,subdomain,domain,suffix,scheme_len,url_len,path_len,param_len,query_len,frag_len,count-,count@,count?,count%,count.,count_digit,count_alpha
0,http://testphp.vulnweb.com/,testphp,vulnweb,com,4,19,1,0,0,0,0,0,0,0,2,0,21
1,https://www.youtube.com/,www,youtube,com,5,15,1,0,0,0,0,0,0,0,2,0,18
2,http://172.19.17.212/dvwa/vulnerabilities/xss_...,,172.19.17.212,,4,13,28,0,5,0,0,0,1,0,3,10,31


   subdomain    domain    suffix  scheme_len   url_len  path_len  param_len  \
0  -1.475193 -0.427109  0.651421   -1.904798 -0.050167 -1.063395  -0.011467   
1   0.783332  2.596822  0.651421    0.524325 -0.636067 -1.063395  -0.011467   
2  -0.908239 -0.427109 -1.605613   -1.904798 -0.929017  0.023271  -0.011467   

   query_len  frag_len    count-    count@    count?    count%    count.  \
0  -0.201028 -0.018305 -0.486745 -0.071638 -0.385288 -0.076434 -0.547624   
1  -0.201028 -0.018305 -0.486745 -0.071638 -0.385288 -0.076434 -0.547624   
2  -0.015527 -0.018305 -0.486745 -0.071638  2.154139 -0.076434  0.333059   

   count_digit  count_alpha  
0    -0.463560    -0.923303  
1    -0.463560    -1.036576  
2     0.646576    -0.545727  


In [71]:
newpred = lr.predict(test_data)

In [72]:
print(newpred)

[1 0 1]


In [75]:
for i in range(len(newpred)):
    if newpred[i] == 1:
        print("Malicious")
    else:
        print("Benign")

Malicious
Benign
Malicious
