## Importing the libraries

In [1]:
# TODO-LATER: 
# 1. Check if adding simple imputer or other kind of values in the domain age column has any difference on the accuracy

In [2]:
import pandas as pd
import numpy as np

## Importing the dataset

In [3]:
df = pd.read_csv('final_dataset.csv')

## Preview

In [4]:
df.head()

Unnamed: 0,tld,country_code,url_length,path_rest_length,num_spcs_chars,domain_entropy,domain_age,created_year,updated_year,expires_year,word_count,tld_in_path_rest,Label
0,org,-1,98,84,0,3.03,-1,-1,-1,-1,6,0,bad
1,com,-1,70,57,0,3.19,-1,-1,-1,-1,1,0,bad
2,ru,-1,77,60,0,3.2,-1,-1,-1,-1,4,3,bad
3,com,-1,70,57,0,3.19,-1,-1,-1,-1,1,0,bad
4,net,-1,203,163,0,4.38,-1,-1,-1,-1,4,0,bad


In [5]:
df.shape

(667, 13)

In [6]:
df[['tld', 'country_code']] = df[['tld', 'country_code']].astype("string")

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tld               667 non-null    string 
 1   country_code      667 non-null    string 
 2   url_length        667 non-null    int64  
 3   path_rest_length  667 non-null    int64  
 4   num_spcs_chars    667 non-null    int64  
 5   domain_entropy    667 non-null    float64
 6   domain_age        667 non-null    int64  
 7   created_year      667 non-null    int64  
 8   updated_year      667 non-null    int64  
 9   expires_year      667 non-null    int64  
 10  word_count        667 non-null    int64  
 11  tld_in_path_rest  667 non-null    int64  
 12  Label             667 non-null    object 
dtypes: float64(1), int64(9), object(1), string(2)
memory usage: 67.9+ KB


In [8]:
df.isnull().sum()

tld                 0
country_code        0
url_length          0
path_rest_length    0
num_spcs_chars      0
domain_entropy      0
domain_age          0
created_year        0
updated_year        0
expires_year        0
word_count          0
tld_in_path_rest    0
Label               0
dtype: int64

## Encoding Categorical Data

### Label Encoding - Label Column

In [9]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label Encoding for the 'Label' column
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

### One Hot Encoding - tld & country_code Column

#### Replacing Values in tld Column Having Frequency Less Than 15 with 'other'

In [10]:
df['tld'].value_counts()

tld
com     333
org      49
net      49
ru       34
biz      27
cl       14
au       14
uk       12
br       12
ir       10
pl        9
co        7
tk        7
ly        6
in        6
info      5
de        4
ro        4
ca        4
se        3
it        3
ua        3
nl        3
np        3
hu        3
kz        3
jp        2
mx        2
su        2
nz        2
fr        2
me        2
eu        2
tr        2
es        2
ng        2
vn        2
zw        1
gl        1
js?       1
il        1
cc        1
ie        1
pt        1
at        1
my        1
gr        1
tw        1
ch        1
name      1
lk        1
cd        1
hk        1
bi        1
ms        1
Name: count, dtype: Int64

In [11]:
df['tld'].nunique()

55

In [12]:
tld_counts = df['tld'].value_counts()
threshold = 15

In [13]:
repl = tld_counts[tld_counts <= threshold].index

In [14]:
df['tld'] = df['tld'].replace(repl, 'other')

#### Replacing Values in country_code Column Having Frequency Less Than 8 with 'XX'

In [15]:
df['country_code'].value_counts()

country_code
-1    379
US    179
CA     16
IS     12
FR      9
GB      9
NL      8
CN      5
AU      5
DE      5
BG      4
RU      3
LT      2
MY      2
JP      2
KZ      2
CZ      2
AT      2
IN      2
BR      2
DK      1
UA      1
PK      1
GE      1
ES      1
GL      1
TW      1
CW      1
CH      1
FI      1
LV      1
PY      1
IT      1
NG      1
CG      1
RE      1
NP      1
Name: count, dtype: Int64

In [16]:
df['country_code'].nunique()

37

In [17]:
country_counts = df['country_code'].value_counts()
threshold = 8

In [18]:
repl = country_counts[country_counts <= threshold].index

In [19]:
df['country_code'] = df['country_code'].replace(repl, 'XX')

## Splitting the dataset into the Training set and Test set

In [20]:
from sklearn.model_selection import train_test_split

X = df.drop('Label', axis=1)
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Implementing One Hot Encoder

In [21]:
from sklearn.preprocessing import LabelEncoder

# Perform Label Encoding on y_train
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Use the same encoder to encode y_test
y_test_encoded = label_encoder.transform(y_test)

In [22]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)
X_train_tld_country_code = ohe.fit_transform(X_train[['tld', 'country_code']])

X_test_tld_country_code = ohe.transform(X_test[['tld', 'country_code']])

X_train_tld_country_code.shape



(533, 13)

In [23]:
X_train = X_train.drop(columns=['tld', 'country_code'])

In [24]:
X_test = X_test.drop(columns=['tld', 'country_code'])

In [25]:
X_test

Unnamed: 0,url_length,path_rest_length,num_spcs_chars,domain_entropy,domain_age,created_year,updated_year,expires_year,word_count,tld_in_path_rest
660,40,22,0,3.57,-1,-1,-1,-1,3,0
158,78,61,7,3.45,-1,-1,-1,-1,0,0
634,230,217,0,3.25,3915,2012,2020,2024,7,1
235,32,18,0,3.03,13401,1986,2023,2024,1,0
635,91,67,0,3.41,38,2023,2023,2024,3,1
...,...,...,...,...,...,...,...,...,...,...
536,90,76,0,3.39,-1,-1,-1,-1,5,0
417,68,39,0,3.78,-1,-1,-1,-1,1,1
465,30,8,0,3.59,-1,-1,-1,-1,0,0
211,77,55,0,3.40,8473,2000,2023,2025,3,1


In [26]:
X_train_transformed = np.concatenate((X_train_tld_country_code, X_train), axis=1)

In [27]:
X_test_transformed = np.concatenate((X_test_tld_country_code, X_test), axis=1)

In [28]:
X_train_transformed.shape

(533, 23)

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Create a Decision Tree Classifier instance
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 89.55%
