In [None]:
import pandas as pd
import numpy as np
import random

from sklearn.linear_model import LogisticRegression
# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled URL dataset (Colab path) and shuffle the rows.
# A fixed random_state makes the shuffle -- and therefore the later
# train/test split, which picks rows by position -- reproducible across
# kernel restarts.
ds = pd.read_csv('/content/malicious_phish.csv')
urls_data = ds.sample(frac=1, random_state=434)

In [None]:
# A notebook cell only renders its final expression, so two bare calls
# would silently drop the head() preview. Stack both ends of the shuffled
# frame into a single table instead.
pd.concat([urls_data.head(), urls_data.tail()])

Unnamed: 0,url,type
453118,http://www.ijsbaanapeldoorn.nl/fotos-videos.ht...,defacement
172234,ogj.com/articles/print/volume-104/issue-43/gen...,benign
35409,brokencontrollers.com/big-bang-exhibition-at-m...,benign
419148,http://bestblackhatforum.com/Thread-%E2%9D%B6%...,benign
307041,aviationpros.com/press_release/10390359/new-we...,benign


In [None]:
def makeTokens(f):
    """Tokenize a URL for use as a vectorizer ``tokenizer`` callable.

    The URL is split on '/', each piece is split on '-', and every dash
    piece is additionally split on '.'. Both the dash-level pieces and
    their dot-level sub-pieces are kept, duplicates are dropped, and the
    very common token 'com' is excluded.

    Parameters
    ----------
    f : str or bytes
        The URL. ``bytes`` are decoded as UTF-8; any other object is
        converted with ``str()``.

    Returns
    -------
    list of str
        Unique tokens in unspecified order (may include the empty string
        produced by consecutive or trailing separators).
    """
    # Work on real text. The previous str(f.encode('utf-8')) produced the
    # bytes *repr* ("b'...'"), which glued "b'" onto the first token, "'"
    # onto the last one, and turned non-ASCII characters into "\x.." text.
    if isinstance(f, bytes):
        text = f.decode('utf-8', errors='replace')
    else:
        text = str(f)

    unique_tokens = set()
    for slash_piece in text.split('/'):         # split by slash
        for dash_piece in slash_piece.split('-'):   # then by dash
            unique_tokens.add(dash_piece)
            unique_tokens.update(dash_piece.split('.'))  # then by dot

    # 'com' occurs in a large share of URLs and should not be a feature.
    # discard() is a no-op when it is absent. The old follow-up check for
    # the token 'https://www.' could never match (tokens contain no '/')
    # and would have raised ValueError by removing 'com' twice, so it is
    # gone.
    unique_tokens.discard('com')

    return list(unique_tokens)

In [None]:
# Model inputs (raw URL strings) and targets (the "type" label column).
url_list, y = urls_data["url"], urls_data["type"]

In [None]:
# TF-IDF features built from the custom URL tokenizer. token_pattern is
# only consulted when no tokenizer is given; setting it to None states that
# explicitly and avoids scikit-learn's "token_pattern will not be used"
# warning.
vectorizer = TfidfVectorizer(tokenizer=makeTokens, token_pattern=None)

In [None]:
# Split the raw URLs *before* vectorizing. Fitting TF-IDF on the whole
# corpus lets the held-out URLs shape the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported accuracy.
# The seed and test_size are unchanged, so the same rows land in each split
# as when the pre-vectorized matrix was split.
urls_train, urls_test, y_train, y_test = train_test_split(
    url_list, y, test_size=0.2, random_state=434
)

# Learn vocabulary/IDF from the training URLs only, then reuse that fitted
# transform for the test URLs (tokens unseen in training are ignored).
# The full-corpus matrix `X` is intentionally no longer built.
X_train = vectorizer.fit_transform(urls_train)
X_test = vectorizer.transform(urls_test)

In [None]:
# Print the sparse-matrix summary (shape, dtype, number of stored values)
# rather than a truncated dump of individual (row, col) coordinates.
print(repr(X_train))

  (0, 279290)	0.3106117799651131
  (0, 279281)	0.307881282347145
  (0, 682663)	0.26747107350840715
  (0, 964333)	0.3243794045389652
  (0, 672985)	0.5621239475749047
  (0, 682753)	0.5621239475749047
  (1, 528949)	0.33279689897253434
  (1, 367279)	0.3693132792595551
  (1, 367280)	0.3693132792595551
  (1, 80239)	0.7851507037246879
  (2, 0)	0.09750961888670918
  (2, 316230)	0.10266037826650727
  (2, 108593)	0.29507399110729615
  (2, 51996)	0.3732242214921766
  (2, 52701)	0.4171201075172005
  (2, 988716)	0.42011417620370095
  (2, 68288)	0.367653379324439
  (2, 108658)	0.5176230566263073
  (3, 0)	0.07998552872050192
  (3, 316230)	0.08421061150729753
  (3, 691071)	0.11285738019948456
  (3, 1076528)	0.11227113105027868
  (3, 580596)	0.16282201304398028
  (3, 626205)	0.31264213585593975
  (3, 812772)	0.35281940044839744
  :	:
  (520948, 0)	0.08997964841789556
  (520948, 8533)	0.10246473788357835
  (520948, 1074608)	0.21618236802693877
  (520948, 578061)	0.29766798690803303
  (520948, 264747)	0.

In [None]:
# Show the training labels; the index values are the original row labels
# carried over from the shuffled frame.
print(y_train)

231701        benign
397789        benign
537132       malware
83422     defacement
67928     defacement
             ...    
406688        benign
560778        benign
416304        benign
108651        benign
576075        benign
Name: type, Length: 520952, dtype: object


In [None]:
# Model building: multiclass logistic regression on the TF-IDF features.
# max_iter is only an upper bound on solver iterations: if the solver
# already converges within the default 100 the fitted model is unchanged,
# otherwise this avoids stopping early with a ConvergenceWarning.
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train, y_train)

In [None]:
# Mean accuracy of the logistic-regression model on the held-out split.
logit_accuracy = logit.score(X_test, y_test)
print("Accuracy ", logit_accuracy)

In [None]:
# Support-vector classifier (SVC's default kernel) evaluated on the same
# split as the logistic-regression model.
# NOTE(review): kernel SVC training cost grows at least quadratically with
# the number of samples; with the ~520k training rows shown in the y_train
# output this cell may be impractically slow -- confirm it finishes, or
# consider a linear model / subsample.
svm_model = SVC(
    C=79,
    gamma='scale',
    random_state=12345,
)
svm_model.fit(X_train, y_train)
svm_accuracy = svm_model.score(X_test, y_test)
print(svm_accuracy)

In [None]:
# Displays the SVM's mean accuracy on the test split as the cell result.
# NOTE(review): this recomputes the same score the previous cell already
# printed; scoring runs prediction over the whole test set again.
svm_model.score(X_test,y_test)