In [None]:
%%capture
!pip install mlxtend
!pip install tabulate

In [2]:
import sys
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate

# Register joblib under the legacy `sklearn.externals.joblib` module path. This is
# kept ahead of the mlxtend imports - presumably the installed mlxtend release
# still looks joblib up under that name (TODO confirm and drop if unnecessary).
sys.modules['sklearn.externals.joblib'] = joblib
import mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [6]:
# Load the phishing URL dataset from the notebook's data location.
# NOTE(review): the absolute /content path looks Colab-specific - confirm before running elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/Phishing.csv")

In [7]:
# Peek at the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,2,12,5.5,8,4.083334,2,15,7,0,...,-1,-1,-1,0.676804,0.860529,-1.0,-1.0,-1.0,-1.0,benign
1,0,3,12,5.0,10,3.583333,3,12,8,2,...,1,0,-1,0.715629,0.776796,0.693127,0.738315,1.0,-1.0,benign
2,2,2,11,4.0,5,4.75,2,16,11,0,...,2,0,1,0.677701,1.0,0.677704,0.916667,0.0,0.898227,benign
3,0,2,7,4.5,7,5.714286,2,15,10,0,...,0,0,-1,0.696067,0.879588,0.818007,0.753585,0.0,-1.0,benign
4,19,2,10,6.0,9,2.25,2,9,5,0,...,5,4,3,0.747202,0.8337,0.655459,0.829535,0.83615,0.823008,benign


In [8]:
# Missing-value count per column, shown for the last ten columns only.
df.isna().sum().tail(10)

SymbolCount_FileName        0
SymbolCount_Extension       0
SymbolCount_Afterpath       0
Entropy_URL                 0
Entropy_Domain              0
Entropy_DirectoryName    1826
Entropy_Filename          190
Entropy_Extension           3
Entropy_Afterpath           3
URL_Type_obf_Type           0
dtype: int64

In [9]:
# Columns removed before modelling. "Entropy_DirectoryName" has by far the most
# missing values in the null-count summary; "NumberRate_Extension" is not shown
# there (NOTE(review): presumably dropped for the same reason - confirm).
to_drop = ["NumberRate_Extension", "Entropy_DirectoryName"]
# `errors="ignore"` keeps this cell idempotent: because `df` is overwritten, a
# second run would otherwise raise KeyError for the already-removed columns.
df = df.drop(columns=to_drop, errors="ignore")

In [10]:
# Class balance among the rows that have no missing values left.
df.dropna()["URL_Type_obf_Type"].value_counts()

benign      7613
phishing    7276
Name: URL_Type_obf_Type, dtype: int64

In [11]:
# Seed NumPy's global RNG: the two `sample` calls below pass no `random_state`
# and therefore draw from it, which keeps the shuffle and the subsample repeatable.
np.random.seed(123)
# Shuffle every row, then discard the rows that still contain missing values.
df2 = df.sample(len(df)).dropna()
# Encode the target as integers (benign -> 0, phishing -> 1). `map` plus an
# explicit cast guarantees an int64 column; `replace` on an object column relies
# on implicit downcasting that newer pandas releases deprecate, which would leave
# object-dtype labels that scikit-learn rejects. An unexpected label becomes NaN
# and makes the cast fail loudly instead of slipping through.
df2["URL_Type_obf_Type"] = df2.URL_Type_obf_Type.map({"benign": 0, "phishing": 1}).astype("int64")
# 1000-row subsample used by the feature-selection cells further down.
df_small = df2.sample(1000)
# Full feature matrix and target vector.
X = df2.drop("URL_Type_obf_Type", axis=1)
y = df2.URL_Type_obf_Type

In [12]:
# Estimator and small feature/target pair handed to the sequential feature selector.
rf = RandomForestClassifier(random_state=23)
small_features = df_small.drop(columns="URL_Type_obf_Type")
X_small = small_features.reset_index(drop=True)
y_small = df_small["URL_Type_obf_Type"].reset_index(drop=True)

In [None]:
# Forward sequential selection scored by 3-fold cross-validated accuracy;
# "parsimonious" is passed through to mlxtend as the k_features setting.
sfs = SequentialFeatureSelector(
    rf,
    k_features="parsimonious",
    forward=True,
    scoring='accuracy',
    cv=3,
)
# Keep whatever `fit` returns under the name the later cells read from.
selected_features = sfs.fit(X=X_small, y=y_small)

In [None]:
# Display the selected feature names together with how many there are.
(selected_features.k_feature_names_, len(selected_features.k_feature_names_))

(('domain_token_count',
  'avgpathtokenlen',
  'ldl_url',
  'ldl_path',
  'urlLen',
  'this.fileExtLen',
  'argDomanRatio',
  'argPathRatio',
  'NumberofDotsinURL',
  'CharacterContinuityRate',
  'URL_DigitCount',
  'Directory_LetterCount',
  'Query_LetterCount',
  'LongestPathTokenLength',
  'delimeter_Domain',
  'delimeter_path',
  'SymbolCount_URL',
  'Entropy_Domain',
  'Entropy_Afterpath'),
 19)

In [None]:
# Score the selector recorded for the chosen feature subset (the selector was
# configured with scoring='accuracy' and cv=3).
sfs.k_score_

0.9769979560398722

In [13]:
# Feature subset reported by the sequential selector, hard-coded so the selector
# does not have to be re-run to execute the cells from here on.
# Dynamic equivalent: list(selected_features.k_feature_names_)
selected_columns = [
    "domain_token_count",
    "avgpathtokenlen",
    "ldl_url",
    "ldl_path",
    "urlLen",
    "this.fileExtLen",
    "argDomanRatio",
    "argPathRatio",
    "NumberofDotsinURL",
    "CharacterContinuityRate",
    "URL_DigitCount",
    "Directory_LetterCount",
    "Query_LetterCount",
    "LongestPathTokenLength",
    "delimeter_Domain",
    "delimeter_path",
    "SymbolCount_URL",
    "Entropy_Domain",
    "Entropy_Afterpath",
]
X = X[selected_columns]
# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)

In [14]:
def scoring(y_true, y_pred, model_name, verbose=True):
    """Compute classification metrics for one model and optionally print them.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    model_name : label used in the printed report and as the first element of
        the returned row.
    verbose : when True, print the metrics followed by the confusion matrix.

    Returns
    -------
    list
        ``[model_name, accuracy, f1, precision, recall]``.
    """
    metric_functions = (accuracy_score, f1_score, precision_score, recall_score)
    accuracy, f1, precision, recall = [metric(y_true, y_pred) for metric in metric_functions]
    # The confusion matrix only appears in the printed report, not in the returned row.
    c = confusion_matrix(y_true, y_pred)

    if verbose:
        print(f"Scores for {model_name}: Accuracy: {accuracy}, f1: {f1}, precision: {precision}, recall: {recall}\nconfusion matrix:\n{c}")

    return [model_name, accuracy, f1, precision, recall]

In [15]:
# Random forest trained on the unscaled training features.
rfc = RandomForestClassifier(random_state=23)
rfc.fit(X=X_train, y=y_train)

In [16]:
# Test-split metrics are printed; train-split metrics are only collected.
rfc_test_pred = rfc.predict(X_test)
rfc_scores = scoring(y_test, rfc_test_pred, "Random Forest")
rfc_train_pred = rfc.predict(X_train)
rfc_train_scores = scoring(y_train, rfc_train_pred, "Random Forest", verbose=False)

Scores for Random Forest: Accuracy: 0.9848891873740766, f1: 0.9847094801223242, precision: 0.9870572207084468, recall: 0.9823728813559321
confusion matrix:
[[1484   19]
 [  26 1449]]


In [17]:
# Single decision tree trained on the unscaled training features.
dt = DecisionTreeClassifier(random_state=21)
dt.fit(X=X_train, y=y_train)

In [18]:
# Test-split metrics are printed; train-split metrics are only collected.
dt_test_pred = dt.predict(X_test)
dt_scores = scoring(y_test, dt_test_pred, "Decision Tree")
dt_train_pred = dt.predict(X_train)
dt_train_scores = scoring(y_train, dt_train_pred, "Decision Tree", verbose=False)

Scores for Decision Tree: Accuracy: 0.9714573539288113, f1: 0.9710785981626403, precision: 0.9747267759562842, recall: 0.967457627118644
confusion matrix:
[[1466   37]
 [  48 1427]]


In [19]:
# Fit the scaler on the training split only, then reuse its statistics for the test split.
S = StandardScaler()
X_train_scaled = S.fit_transform(X=X_train)
X_test_scaled = S.transform(X=X_test)

In [20]:
# Logistic regression trained on the standardised training features.
lr = LogisticRegression(random_state=21)
lr.fit(X=X_train_scaled, y=y_train)

In [21]:
# Test-split metrics are printed; train-split metrics are only collected.
lr_test_pred = lr.predict(X_test_scaled)
lr_scores = scoring(y_test, lr_test_pred, "Logistic Regression")
lr_train_pred = lr.predict(X_train_scaled)
lr_train_scores = scoring(y_train, lr_train_pred, "Logistic Regression", verbose=False)

Scores for Logistic Regression: Accuracy: 0.935191403626595, f1: 0.9347311464321949, precision: 0.9325236167341431, recall: 0.9369491525423729
confusion matrix:
[[1403  100]
 [  93 1382]]


In [22]:
# The target is a binary class label, so use the k-NN classifier rather than the
# regressor: it predicts class labels directly instead of a neighbour average
# that has to be rounded afterwards. Trained on the standardised features.
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

In [23]:
# Predictions are rounded and cast to int before scoring so they are valid class labels.
# Test-split metrics are printed; train-split metrics are only collected.
knn_test_pred = knn.predict(X_test_scaled).round().astype('int')
knn_scores = scoring(y_test, knn_test_pred, "KNN")
knn_train_pred = knn.predict(X_train_scaled).round().astype('int')
knn_train_scores = scoring(y_train, knn_train_pred, "KNN", verbose=False)

Scores for KNN: Accuracy: 0.9721289456010745, f1: 0.9716433208062862, precision: 0.9793388429752066, recall: 0.9640677966101695
confusion matrix:
[[1473   30]
 [  53 1422]]


In [24]:
# Side-by-side table of the test-split metrics for all four models.
test_score_headers = ['S/N', 'Model', 'Accuracy', 'F1', 'Precision', 'Recall']
test_score_rows = [rfc_scores, dt_scores, lr_scores, knn_scores]
print("-------------Test Scores-----------")
print(tabulate(test_score_rows, headers=test_score_headers, showindex=True))

-------------Test Scores-----------
  S/N  Model                  Accuracy        F1    Precision    Recall
-----  -------------------  ----------  --------  -----------  --------
    0  Random Forest          0.984889  0.984709     0.987057  0.982373
    1  Decision Tree          0.971457  0.971079     0.974727  0.967458
    2  Logistic Regression    0.935191  0.934731     0.932524  0.936949
    3  KNN                    0.972129  0.971643     0.979339  0.964068


In [25]:
# Side-by-side table of the train-split metrics for all four models.
train_score_headers = ['S/N', 'Model', 'Accuracy', 'F1', 'Precision', 'Recall']
train_score_rows = [rfc_train_scores, dt_train_scores, lr_train_scores, knn_train_scores]
print("-------------Train Scores-----------")
print(tabulate(train_score_rows, headers=train_score_headers, showindex=True))

-------------Train Scores-----------
  S/N  Model                  Accuracy        F1    Precision    Recall
-----  -------------------  ----------  --------  -----------  --------
    0  Random Forest          1         1            1         1
    1  Decision Tree          1         1            1         1
    2  Logistic Regression    0.93569   0.93417      0.931448  0.936907
    3  KNN                    0.979515  0.978823     0.985667  0.972074


In [26]:
# Persist the fitted random forest. It was trained on the unscaled feature
# subset, so no scaler is stored alongside it.
joblib.dump(value=rfc, filename="model.pkl")

['model.pkl']