In [1]:
import pandas as pd
import sklearn
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm , preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



In [2]:
def mean_score(scoring):
    return {i:j.mean() for i,j in scoring.items()}

In [3]:
# Get the parent directory of the current folder
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Specify the file path for dataset.csv in the parent folder
filename = os.path.join(parent_dir, "dataset.csv")
df = pd.read_csv(filename)

In [4]:
df.columns 

Index(['Length of URL', 'Has IP address', 'Shortening Service',
       'Having @ Symbol', 'Double Slash Redirecting', 'Prefix-Suffix', 'CTLD',
       'HTTPS in Domain', 'Sensitive Words', 'Has Tilde', 'Has Port',
       'Result'],
      dtype='object')

In [8]:
df.rename(columns={'Length of URL':'Length of URL', 'Has IP address':'Has_IP_address', 'Shortening Service':'Shortening_Service', 'Having @ Symbol':'Having_@_Symbol', 'Double Slash Redirecting':'Double_Slash_Redirecting', 'Prefix-Suffix':'Prefix-Suffix', 'CTLD':'CTLD', 'HTTPS in Domain':'HTTPS_in_Domain', 'Sensitive Words':'Sensitive_Words', 'Has Tilde':'Has_Tilde', 'Has Port':'Has_Port', 'Result':'Result'}, inplace=True)

In [9]:
# Investigate unique values in the each column
unique_IP = df['Has_IP_address'].unique()
unique_SS = df['Shortening_Service'].unique()
unique_HAS = df['Having_@_Symbol'].unique()
unique_DSR = df['Double_Slash_Redirecting'].unique()
unique_PS = df['Prefix-Suffix'].unique()
unique_CTLD = df['CTLD'].unique()
unique_HID = df['HTTPS_in_Domain'].unique()
unique_SW = df['Sensitive_Words'].unique()
unique_HT = df['Has_Tilde'].unique()
unique_HP = df['Has_Port'].unique()
unique_R = df['Result'].unique()

print(unique_IP)
print(unique_SS)
print(unique_HAS)
print(unique_DSR)
print(unique_PS)
print(unique_CTLD)
print(unique_HID)
print(unique_SW)
print(unique_HT)
print(unique_HP)
print(unique_R)

['1' '-1' 'Has IP address']
['-1' '1' 'Shortening Service']
['1' '-1' 'Having @ Symbol']
['1' '-1' 'Double Slash Redirecting']
['-1' '1' 'Prefix-Suffix']
['1' '-1' '0' 'CTLD']
['-1' 'HTTPS in Domain']
['1' '-1' 'Sensitive Words']
['1' '-1' 'Has Tilde']
['-1' '1' 'Has Port']
['1' 'Result' '-1']


In [10]:
validValues = {-1, 0, 1}

for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df = df[df[column].isin(validValues)]

# Convert all columns to integers (again after the filtering)
df = df.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = pd.to_numeric(df[column], errors='coerce')


In [11]:
# Now you can use the DataFrame with valid integer values (1, -1, and 0)
print(df)

      Length of URL  Has_IP_address  Shortening_Service  Having_@_Symbol  \
0                 1               1                  -1                1   
1                 1               1                   1                1   
2                 1               1                   1                1   
3                 1               1                   1                1   
4                 1               1                   1                1   
...             ...             ...                 ...              ...   
6047              1               1                   1                1   
6048              1               1                   1                1   
6049              1               1                   1                1   
6050              1               1                   1                1   
6051              1               1                   1                1   

      Double_Slash_Redirecting  Prefix-Suffix  CTLD  HTTPS_in_Domain  \
0              

In [12]:
df.shape

(6051, 12)

In [13]:
df = sklearn.utils.shuffle(df)
X = df.drop("Result",axis=1).values
X = preprocessing.scale(X)
y = df['Result'].values
df.head()

Unnamed: 0,Length of URL,Has_IP_address,Shortening_Service,Having_@_Symbol,Double_Slash_Redirecting,Prefix-Suffix,CTLD,HTTPS_in_Domain,Sensitive_Words,Has_Tilde,Has_Port,Result
4815,-1,1,1,1,1,-1,1,-1,1,1,-1,-1
3264,1,1,1,1,1,1,1,-1,1,1,-1,1
3709,1,1,1,1,1,1,-1,-1,1,1,-1,-1
2969,1,1,1,1,1,1,1,-1,1,1,-1,1
1744,-1,1,1,-1,-1,1,0,-1,1,1,-1,1


In [14]:
scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'f1': 'f1'}
fold_count=10

## SVM-Linear

In [15]:
linear_clf = svm.SVC(kernel='linear')
cross_val_scores = cross_validate(linear_clf, X, y, cv=fold_count, scoring=scoring)
linear_svc_clf_score = mean_score(cross_val_scores)

print(f"fit time = {linear_svc_clf_score['fit_time']}")
print(f"score time = {linear_svc_clf_score['score_time']}")
print(f"accuracy = {linear_svc_clf_score['test_accuracy']}")
print(f"recall = {linear_svc_clf_score['test_recall']}")
print(f"precision = {linear_svc_clf_score['test_precision']}")
print(f"f1 = {linear_svc_clf_score['test_f1']}")

fit time = 2.853457283973694
score time = 0.2233288049697876
accuracy = 0.7478062351689715
recall = 0.8222949503412403
precision = 0.7530901110152706
f1 = 0.7843801761594632


## SVM-Polynomial

In [16]:
poly_clf = svm.SVC(kernel='poly')
cross_val_scores = cross_validate(poly_clf, X, y, cv=fold_count, scoring=scoring)
poly_svc_clf_score = mean_score(cross_val_scores)

print(f"fit time = {poly_svc_clf_score['fit_time']}")
print(f"score time = {poly_svc_clf_score['score_time']}")
print(f"accuracy = {poly_svc_clf_score['test_accuracy']}")
print(f"recall = {poly_svc_clf_score['test_recall']}")
print(f"precision = {poly_svc_clf_score['test_precision']}")
print(f"f1 = {poly_svc_clf_score['test_f1']}")

fit time = 3.229771447181702
score time = 0.22788379192352295
accuracy = 0.7527624580639882
recall = 0.8647850447714301
precision = 0.7384360248293016
f1 = 0.7965829932570124


## SVM-RBF

In [17]:
rbf_clf = svm.SVC(kernel='rbf')
cross_val_scores = cross_validate(rbf_clf, X, y, cv=fold_count, scoring=scoring)
rbf_svc_clf_score = mean_score(cross_val_scores)

print(f"fit time = {rbf_svc_clf_score['fit_time']}")
print(f"score time = {rbf_svc_clf_score['score_time']}")
print(f"accuracy = {rbf_svc_clf_score['test_accuracy']}")
print(f"recall = {rbf_svc_clf_score['test_recall']}")
print(f"precision = {rbf_svc_clf_score['test_precision']}")
print(f"f1 = {rbf_svc_clf_score['test_f1']}")


fit time = 2.9670342683792112
score time = 0.3779703140258789
accuracy = 0.7527632763276327
recall = 0.863897470806933
precision = 0.7388498306363908
f1 = 0.7964312746385038


## SVM-Sigmoid

In [18]:
sigmoid_clf = svm.SVC(kernel='sigmoid')
cross_val_scores = cross_validate(sigmoid_clf, X, y, cv=fold_count, scoring=scoring)
sigmoid_svc_clf_score = mean_score(cross_val_scores)

print(f"fit time = {sigmoid_svc_clf_score['fit_time']}")
print(f"score time = {sigmoid_svc_clf_score['score_time']}")
print(f"accuracy = {sigmoid_svc_clf_score['test_accuracy']}")
print(f"recall = {sigmoid_svc_clf_score['test_recall']}")
print(f"precision = {sigmoid_svc_clf_score['test_precision']}")
print(f"f1 = {sigmoid_svc_clf_score['test_f1']}")

fit time = 5.551967692375183
score time = 0.3864502668380737
accuracy = 0.6398938984807572
recall = 0.659882005899705
precision = 0.6853142227504203
f1 = 0.672174185903238
