In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt


In [3]:
urldata = pd.read_csv('cleaned_dataset.csv').sample(10000)

In [4]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')	# make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')	# make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [5]:
y = urldata['label']
url_list=urldata['domain']

In [6]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [7]:
X = vectorizer.fit_transform(url_list)

In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
model.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=42)

In [10]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]


In [11]:
accuracy = accuracy_score(y_test, y_pred)  # Accuracy
accuracy

0.912

In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1030
           1       0.91      0.91      0.91       970

    accuracy                           0.91      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.91      0.91      0.91      2000



In [13]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm

array([[946,  84],
       [ 92, 878]], dtype=int64)

In [14]:
tn,fp,fn,tp=cm.ravel()#true negative,false positive,false negative,true positive
print(tn)
print(fp)
print(fn)
print(tp)

946
84
92
878


In [15]:
precision_1=tp/(tp+fp) 
precision_0=tn/(tn+fn)
print(precision_1)
print(precision_0)

0.9126819126819127
0.9113680154142582


In [16]:
recall_1=(tp)/(tp+fn)
print(recall_1)
recall_0=(tn)/(tn+fp)
print(recall_0)

0.9051546391752577
0.9184466019417475


In [17]:
accuracy=(tp+tn)/(tp+tn+fp+fn)
accuracy

0.912

In [18]:
f1score_0=2*((precision_0*recall_0)/(precision_0+recall_0))
print(f1score_0)
f1score_1=2*((precision_1*recall_1)/(precision_1+recall_1))
print(f1score_1)

0.9148936170212766
0.9089026915113871


In [19]:
support_0=1030
support_1=970

In [20]:
macro_avg=(f1score_0+f1score_1)/2
weighted_avg=(f1score_0*support_0+f1score_1*support_1)/(support_1+support_0)
print(macro_avg)
print(weighted_avg)

0.9118981542663318
0.9119880181489802
