In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the dataset
urldata = pd.read_csv('cleaned_dataset.csv').sample(30000)

In [4]:
y=urldata['label']

In [5]:
url_list=urldata['domain']
print(url_list)

71694        www.ociweb.com/cnb/CORBANewsBrief-200301.html
32882       www.nembi.com.br/components/com_mailto/paypal/
29155    'freebutterflyphotos.com/Remax-ion/Secure Logi...
43212    www.lv.balticspark.org/wp-admin/images/ae/ws31...
41610    '9d345009-a-62cb3a1a-s-sites.googlegroups.com/...
                               ...                        
12341    'www.kdaochsrwa.com/~hailagra/signin/enter_log...
14081                            sing-in-e5bayloginss.com/
8111               fun-dive.com/gps/?check=1i29901056.html
52543                      math.illinoisstate.edu/actuary/
29466    paypa.l.com.c.gi.bin.websicur.cmd.login.submit...
Name: domain, Length: 30000, dtype: object


In [6]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')	# make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')	# make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [7]:
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [8]:
X = vectorizer.fit_transform(url_list)

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
classifier = SVC(kernel='linear',random_state=0)
classifier.fit(X_train,y_train)

SVC(kernel='linear', random_state=0)

In [11]:
y_pred = classifier.predict(X_test)
y_pred

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [12]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9615

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3068
           1       0.97      0.95      0.96      2932

    accuracy                           0.96      6000
   macro avg       0.96      0.96      0.96      6000
weighted avg       0.96      0.96      0.96      6000



In [20]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm

array([[2986,   82],
       [ 149, 2783]], dtype=int64)

In [15]:
tn,fp,fn,tp=cm.ravel()#true negative,false positive,false negative,true positive
print(tn)
print(fp)
print(fn)
print(tp)

2986
82
149
2783


In [16]:
precision_1=tp/(tp+fp) 
precision_0=tn/(tn+fn)
print(precision_1)
print(precision_0)

0.9713787085514834
0.9524720893141946


In [17]:
recall_1=(tp)/(tp+fn)
print(recall_1)
recall_0=(tn)/(tn+fp)
print(recall_0)

0.949181446111869
0.9732724902216427


In [18]:
accuracy=(tp+tn)/(tp+tn+fp+fn)
accuracy

0.9615

In [19]:
f1score_0=2*((precision_0*recall_0)/(precision_0+recall_0))
print(f1score_0)
f1score_1=2*((precision_1*recall_1)/(precision_1+recall_1))
print(f1score_1)

0.9627599548605513
0.9601518026565465


In [21]:
support_0=3068
support_1=2932

In [22]:
macro_avg=(f1score_0+f1score_1)/2
weighted_avg=(f1score_0*support_0+f1score_1*support_1)/(support_1+support_0)
print(macro_avg)
print(weighted_avg)

0.961455878758549
0.9614854378168611
