Import the required libraries

In [17]:
import pandas as pd
import sklearn
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm , preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## Function to calculate the scores

In [18]:
def mean_score(scoring):
    return {i:j.mean() for i,j in scoring.items()}

# Loading data.
- Load the data and clean it for unique values, later shuffle it with specific seeding

In [19]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset.csv")
df = pd.read_csv(filename)

## Preprocessing data.

In [20]:
df.columns 

Index(['Length of URL', 'Has IP address', 'Shortening Service',
       'Having @ Symbol', 'Double Slash Redirecting', 'Prefix-Suffix',
       'Standard Port', 'CTLD', 'HTTPS in Domain', 'Sensitive Words',
       'Has Tilde', 'Has Port'],
      dtype='object')

As the column names have leading white spaces, we will rename the columns

In [21]:
df.rename(columns={' Has IP address':'Has_IP_address', ' Shortening Service':'Shortening_Service', ' Having @ Symbol':'Having_@_Symbol', ' Double Slash Redirecting':'Double_Slash_Redirecting', ' Prefix-Suffix':'Prefix-Suffix', ' Standard Port':'Standard_Port', ' CTLD':'CTLD', ' HTTPS in Domain':'HTTPS_in_Domain', ' Sensitive Words':'Sensitive_Words', ' Has Tilde':'Has_Tilde', ' Has Port':'Has_Port', ' Result':'Result'}, inplace=True)

As we had merged two csv files to create a dataset, we need to check the unique values present in each column

In [22]:
# Investigate unique values in the each column
unique_IP = df['Has_IP_address'].unique()
unique_SS = df['Shortening_Service'].unique()
unique_HAS = df['Having_@_Symbol'].unique()
unique_DSR = df['Double_Slash_Redirecting'].unique()
unique_PS = df['Prefix-Suffix'].unique()
unique_SP = df['Standard_Port'].unique()
unique_CTLD = df['CTLD'].unique()
unique_HID = df['HTTPS_in_Domain'].unique()
unique_SW = df['Sensitive_Words'].unique()
unique_HT = df['Has_Tilde'].unique()
unique_HP = df['Has_Port'].unique()
unique_R = df['Result'].unique()

KeyError: 'Has_IP_address'

In [None]:
print(unique_IP)
print(unique_SS)
print(unique_HAS)
print(unique_DSR)
print(unique_PS)
print(unique_SP)
print(unique_CTLD)
print(unique_HID)
print(unique_SW)
print(unique_HT)
print(unique_HP)
print(unique_R)

[' 1             ' ' -1            ' ' Has IP address']
[' -1                ' ' 1                 ' ' Shortening Service']
[' 1              ' ' -1             ' ' Having @ Symbol']
[' 1                       ' ' -1                      '
 ' Double Slash Redirecting']
[' -1           ' ' 1            ' ' Prefix-Suffix']
[' 0            ' ' -1           ' ' Standard Port']
[' 1   ' ' -1  ' ' 0   ' ' CTLD']
[' -1             ' ' HTTPS in Domain']
[' 1              ' ' -1             ' ' Sensitive Words']
[' 1        ' ' -1       ' ' Has Tilde']
[' -1      ' ' 1       ' ' Has Port']
[' 1' ' Result' ' -1']


Remove all the values other than {-1, 0, 1} from all the values of the dataset

In [None]:
validValues = {-1, 0, 1}

for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df = df[df[column].isin(validValues)]

# Convert all columns to integers (again after the filtering)
df = df.astype(int)

In [None]:
# Now you can use the DataFrame with valid integer values (1, -1, and 0)
print(df)

      Length of URL  Has_IP_address  Shortening_Service  Having_@_Symbol  \
0                 1               1                  -1                1   
1                 1               1                   1                1   
2                 1               1                   1                1   
3                 1               1                   1                1   
4                 1               1                   1                1   
...             ...             ...                 ...              ...   
3211             -1               1                   1                1   
3212              1               1                   1                1   
3213             -1               1                   1                1   
3214             -1               1                   1                1   
3215              1               1                   1                1   

      Double_Slash_Redirecting  Prefix-Suffix  Standard_Port  CTLD  \
0                

Again check the unique values present in each column

In [None]:
# Investigate unique values in the each column
unique_IP_2 = df['Has_IP_address'].unique()
unique_SS_2 = df['Shortening_Service'].unique()
unique_HAS_2 = df['Having_@_Symbol'].unique()
unique_DSR_2 = df['Double_Slash_Redirecting'].unique()
unique_PS_2 = df['Prefix-Suffix'].unique()
unique_SP_2 = df['Standard_Port'].unique()
unique_CTLD_2 = df['CTLD'].unique()
unique_HID_2 = df['HTTPS_in_Domain'].unique()
unique_SW_2 = df['Sensitive_Words'].unique()
unique_HT_2 = df['Has_Tilde'].unique()
unique_HP_2 = df['Has_Port'].unique()
unique_R_2 = df['Result'].unique()

In [None]:
print(unique_IP_2)
print(unique_SS_2)
print(unique_HAS_2)
print(unique_DSR_2)
print(unique_PS_2)
print(unique_SP_2)
print(unique_CTLD_2)
print(unique_HID_2)
print(unique_SW_2)
print(unique_HT_2)
print(unique_HP_2)
print(unique_R_2)

[ 1 -1]
[-1  1]
[ 1 -1]
[ 1 -1]
[-1  1]
[ 0 -1]
[ 1 -1  0]
[-1]
[ 1 -1]
[ 1 -1]
[-1  1]
[ 1 -1]


In [None]:
df.shape

(3215, 13)

In [None]:
print("number of 1",len(df[df["Result"]==1]))
print("number of -1",len(df[df["Result"]==-1]))

number of 1 1675
number of -1 1540


### Data pre-processing. 
- Shuffle the data and split it into train and test sets

In [None]:
df = sklearn.utils.shuffle(df)
X = df.drop("Result",axis=1).values
X = preprocessing.scale(X)
y = df['Result'].values
df.head()

Unnamed: 0,Length of URL,Has_IP_address,Shortening_Service,Having_@_Symbol,Double_Slash_Redirecting,Prefix-Suffix,Standard_Port,CTLD,HTTPS_in_Domain,Sensitive_Words,Has_Tilde,Has_Port,Result
516,1,1,1,1,1,1,0,1,-1,1,1,-1,1
1008,1,1,-1,1,1,1,0,-1,-1,1,1,-1,1
694,0,1,1,1,1,1,0,1,-1,1,1,-1,1
2091,1,1,1,1,1,1,0,-1,-1,1,1,-1,-1
110,-1,1,1,1,1,1,0,0,-1,1,1,-1,1


# Evalution metrics 
 - Specifying evaluation metrics for classification models
 - Using 10 fold-cross-validation for evaluting 

- Accuracy = (Number of Correct Predictions) / (Total Number of Predictions)
- Recall = (True Positives) / (True Positives + False Negatives)
- Precision = (True Positives) / (True Positives + False Positives)
- F1 score = 2 * (Precision * Recall) / (Precision + Recall)

In [None]:
scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'f1': 'f1'}
fold_count=10

# Decision Tree 

In [None]:
dtree_clf=DecisionTreeClassifier()
cross_val_scores = cross_validate(dtree_clf, X, y, cv=fold_count, scoring=scoring)
dtree_score = mean_score(cross_val_scores)
print(f"fit time = {dtree_score['fit_time']}")
print(f"score time = {dtree_score['score_time']}")
print(f"accuracy = {dtree_score['test_accuracy']}")
print(f"recall = {dtree_score['test_recall']}")
print(f"precision = {dtree_score['test_precision']}")
print(f"f1 = {dtree_score['test_f1']}")
print("")

fit time = 0.002201676368713379
score time = 0.004827141761779785
accuracy = 0.6239275555813549
recall = 0.7246970345024237
precision = 0.6194377262592194
f1 = 0.6673615542791367

