In [1]:
import numpy as np
import pandas as pd
import random
import re

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
urldata=pd.read_csv('cleaned_dataset.csv')
print(urldata)

                                                  domain  label
0      nobell.it/70ffb52d079109dca5664cce6f317373782/...      1
1      www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...      1
2      serviciosbys.com/paypal.cgi.bin.get-into.herf....      1
3      mail.printakid.com/www.online.americanexpress....      1
4      thewhiskeydregs.com/wp-content/themes/widescre...      1
...                                                  ...    ...
95906            xbox360.ign.com/objects/850/850402.html      0
95907       games.teamxbox.com/xbox-360/1860/Dead-Space/      0
95908         www.gamespot.com/xbox360/action/deadspace/      0
95909      en.wikipedia.org/wiki/Dead_Space_(video_game)      0
95910          www.angelfire.com/goth/devilmaycrytonite/      0

[95911 rows x 2 columns]


In [4]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')	# make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')	# make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [5]:
y=urldata['label']

In [6]:
url_list=urldata['domain']
print(url_list)

0        nobell.it/70ffb52d079109dca5664cce6f317373782/...
1        www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...
2        serviciosbys.com/paypal.cgi.bin.get-into.herf....
3        mail.printakid.com/www.online.americanexpress....
4        thewhiskeydregs.com/wp-content/themes/widescre...
                               ...                        
95906              xbox360.ign.com/objects/850/850402.html
95907         games.teamxbox.com/xbox-360/1860/Dead-Space/
95908           www.gamespot.com/xbox360/action/deadspace/
95909        en.wikipedia.org/wiki/Dead_Space_(video_game)
95910            www.angelfire.com/goth/devilmaycrytonite/
Name: domain, Length: 95911, dtype: object


In [7]:
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [8]:
X = vectorizer.fit_transform(url_list)

In [9]:
print(X)

  (0, 135326)	0.25018572664439087
  (0, 100975)	0.09586009647510704
  (0, 145085)	0.2538498869938846
  (0, 9933)	0.18794547662820124
  (0, 119031)	0.24701168657647968
  (0, 138078)	0.12784957739598748
  (0, 184976)	0.22773588222844793
  (0, 123849)	0.2054332208926776
  (0, 193167)	0.17507171382736572
  (0, 198204)	0.15685980439722166
  (0, 94887)	0.09818175345028217
  (0, 116169)	0.09265521298828976
  (0, 22015)	0.2538498869938846
  (0, 12617)	0.2538498869938846
  (0, 147167)	0.25018572664439087
  (0, 2235)	0.2538498869938846
  (0, 134783)	0.07951294066568701
  (0, 53007)	0.25018572664439087
  (0, 53008)	0.25018572664439087
  (0, 12620)	0.2799638917593214
  (0, 145328)	0.0757735377516374
  (0, 164551)	0.2180979800544321
  (0, 145086)	0.2538498869938846
  (1, 103403)	0.14207021903598643
  (1, 66059)	0.061735612190191184
  :	:
  (95907, 109422)	0.41655586924868127
  (95907, 7820)	0.333081622161234
  (95907, 43639)	0.27382404006005717
  (95907, 4222)	0.44359238802784895
  (95907, 186202)	

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)


DecisionTreeClassifier()

In [12]:
# Make predictions
y_pred = clf.predict(X_test)

In [13]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9441693165823907


In [14]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      9523
           1       0.95      0.94      0.94      9660

    accuracy                           0.94     19183
   macro avg       0.94      0.94      0.94     19183
weighted avg       0.94      0.94      0.94     19183



In [21]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm

array([[9029,  494],
       [ 577, 9083]], dtype=int64)

In [16]:
tn,fp,fn,tp=cm.ravel()#true negative,false positive,false negative,true positive
print(tn)
print(fp)
print(fn)
print(tp)

9029
494
577
9083


In [17]:
precision_1=tp/(tp+fp) 
precision_0=tn/(tn+fn)
print(precision_1)
print(precision_0)

0.9484180849953012
0.9399333749739746


In [18]:
recall_1=(tp)/(tp+fn)
print(recall_1)
recall_0=(tn)/(tn+fp)
print(recall_0)

0.9402691511387163
0.9481255906752074


In [19]:
accuracy=(tp+tn)/(tp+tn+fp+fn)
accuracy

0.9441693165823907

In [20]:
f1score_0=2*((precision_0*recall_0)/(precision_0+recall_0))
print(f1score_0)
f1score_1=2*((precision_1*recall_1)/(precision_1+recall_1))
print(f1score_1)

0.9440117099691567
0.9443260383635701


In [22]:
support_1=9660
support_0=9523

In [23]:
macro_avg=(f1score_0+f1score_1)/2
weighted_avg=(f1score_0*support_0+f1score_1*support_1)/(support_1+support_0)
print(macro_avg)
print(weighted_avg)

0.9441688741663634
0.94416999659221
