In [30]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [31]:
urldata = pd.read_csv('cleaned_dataset.csv')

In [32]:
urldata.shape

(95911, 2)

In [33]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')	# make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')	# make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [34]:
y=urldata['label']

In [35]:
url_list=urldata['domain']
print(url_list)

0        nobell.it/70ffb52d079109dca5664cce6f317373782/...
1        www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...
2        serviciosbys.com/paypal.cgi.bin.get-into.herf....
3        mail.printakid.com/www.online.americanexpress....
4        thewhiskeydregs.com/wp-content/themes/widescre...
                               ...                        
95906              xbox360.ign.com/objects/850/850402.html
95907         games.teamxbox.com/xbox-360/1860/Dead-Space/
95908           www.gamespot.com/xbox360/action/deadspace/
95909        en.wikipedia.org/wiki/Dead_Space_(video_game)
95910            www.angelfire.com/goth/devilmaycrytonite/
Name: domain, Length: 95911, dtype: object


In [36]:
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [37]:
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)

In [38]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
y_test.value_counts()

1    9660
0    9523
Name: label, dtype: int64

In [40]:
logit = LogisticRegression()	
logit.fit(X_train, y_train)

LogisticRegression()

In [41]:
y_pred=logit.predict(X_test)

In [42]:
print("Accuracy ",accuracy_score(y_test,y_pred))

Accuracy  0.957253818485117


In [43]:
X_predict = ["google.com/search=jcharistech",
"google.com/search=faizanahmad",
"pakistanifacebookforever.com/getpassword.php/", 
"www.radsport-voggel.de/wp-admin/includes/log.exe", 
"ahrenhei.without-transfer.ru/nethost.exe ",
"www.itidea.it/centroesteticosothys/img/_notes/gum.exe"]

In [44]:
X_predict = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict)

In [45]:
print(New_predict)

[1 1 0 1 1 1]


In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      9523
           1       0.96      0.95      0.96      9660

    accuracy                           0.96     19183
   macro avg       0.96      0.96      0.96     19183
weighted avg       0.96      0.96      0.96     19183



In [66]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm

array([[9184,  339],
       [ 481, 9179]], dtype=int64)

In [53]:
tn,fp,fn,tp=cm.ravel()#true negative,false positive,false negative,true positive
print(tn)
print(fp)
print(fn)
print(tp)

9184
339
481
9179


In [55]:
precision_1=tp/(tp+fp) 
precision_0=tn/(tn+fn)
print(precision_1)
print(precision_0)

0.9643832737970162
0.9502327987584066


In [58]:
recall_1=(tp)/(tp+fn)
print(recall_1)
recall_0=(tn)/(tn+fp)
print(recall_0)

0.9502070393374741
0.9644019741678043


In [59]:
accuracy=(tp+tn)/(tp+tn+fp+fn)
accuracy

0.957253818485117

In [64]:
f1score_0=2*((precision_0*recall_0)/(precision_0+recall_0))
print(f1score_0)

0.9572649572649572

In [65]:
f1score_1=2*((precision_1*recall_1)/(precision_1+recall_1))
print(f1score_1)

0.9572426738971738

In [70]:
support_0=9523
support_1=9660

In [75]:
macro_avg=(f1score_0+f1score_1)/2
weighted_avg=(f1score_0*support_0+f1score_1*support_1)/(support_1+support_0)
print(macro_avg)
print(weighted_avg)

0.9572538155810655
0.9572537360100551
