In [67]:
import pandas as pd
import numpy as np
import random

# Machine Learning Packages
from sklearn.feature_extraction.text import TfidfVectorizer #Convert a collection of raw documents to a matrix of TF-IDF features.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score # for Random forest
from sklearn.model_selection import train_test_split

In [68]:
# Load Url Data 
urls_data = pd.read_csv("urldata.csv")
urls_data.head()# return top n (5 by default) rows of a data frame

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


### Data Vectorization Using TfidVectorizer
#### Create A tokenizer
 + Split ,Remove Repetitions and "Com"

In [69]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')	# make tokens after splitting by slash
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')	# make tokens after splitting by dash
        tkns_ByDot = []
        for j in range(0,len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')	# make tokens after splitting by dot
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))	#remove redundant tokens
    if 'com' in total_Tokens:
        total_Tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_Tokens

In [70]:
# Labels
y = urls_data["label"]
# Features
url_list = urls_data["url"]

In [71]:
# Using Default Tokenizer
#TfidfVectorizer is to Convert a collection of raw documents to a matrix of TF-IDF features. (TFIDF: -weighting factor- is a numerical statistic that reflect how important a word is to a document )

# Using Custom Tokenizer
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [72]:
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list) #fit_transform(raw_documents[, y]) Learn the vocabulary dictionary and return document-term matrix.

#### Split into training and testing dataset 80/20 ratio

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)	#train_test_split() Split arrays or matrices into random train and test subsets

Model building

In [81]:
# Model Building using logistic regression
logit = LogisticRegression(max_iter=1000)	
logit.fit(X_train, y_train)


LogisticRegression(max_iter=1000)

In [75]:
# Model Building using Random Forest
#model= RandomForestClassifier()
#model.fit(X_train, y_train)

In [76]:
# Accuracy of logistic regression Model
print("Accuracy in logistic regression Model: ",logit.score(X_test, y_test))

Accuracy in logistic regression Model:  0.96161392743748


In [77]:
# Accuracy of Random Forest Model
#make predections
#yhat = model.predict(X_test)
#Evaluate predections
#print("Accuracy in Random Forest Model: ",accuracy_score(y_test,yhat))

### Predicting the Model

In [78]:
# The following are new phishing websites, 2021 https://db.aa419.org/fakebankslist.php
X_predict = ["https://www.kacst.edu.sa/","https://www.hightimesweedshop.com",
"www.greenlifepharmcacy.com",
"www.atfbinc.com"]

In [79]:
X_predict = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict)
print(New_predict)

['good' 'bad' 'bad' 'bad']


In [80]:
# Thanks to @ JCharisTech