In [1]:
%%capture
!pip install mlxtend

In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
import mlxtend
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector
import warnings

warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("/content/Phishing.csv")

In [4]:
df.head()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,2,12,5.5,8,4.083334,2,15,7,0,...,-1,-1,-1,0.676804,0.860529,-1.0,-1.0,-1.0,-1.0,benign
1,0,3,12,5.0,10,3.583333,3,12,8,2,...,1,0,-1,0.715629,0.776796,0.693127,0.738315,1.0,-1.0,benign
2,2,2,11,4.0,5,4.75,2,16,11,0,...,2,0,1,0.677701,1.0,0.677704,0.916667,0.0,0.898227,benign
3,0,2,7,4.5,7,5.714286,2,15,10,0,...,0,0,-1,0.696067,0.879588,0.818007,0.753585,0.0,-1.0,benign
4,19,2,10,6.0,9,2.25,2,9,5,0,...,5,4,3,0.747202,0.8337,0.655459,0.829535,0.83615,0.823008,benign


In [5]:
to_drop = ["NumberRate_Extension", "Entropy_DirectoryName"]
df = df.drop(to_drop, axis=1)

In [43]:
df.dropna().URL_Type_obf_Type.value_counts()

phishing    4014
benign      2709
Name: URL_Type_obf_Type, dtype: int64

In [52]:
df.isnull().sum()[70:81]

SymbolCount_FileName        0
SymbolCount_Extension       0
SymbolCount_Afterpath       0
Entropy_URL                 0
Entropy_Domain              0
Entropy_DirectoryName    1826
Entropy_Filename          190
Entropy_Extension           3
Entropy_Afterpath           3
URL_Type_obf_Type           0
dtype: int64

In [5]:
df_small = df[['Querylength', "domain_token_count", "URL_Type_obf_Type"]].sample(len(df))

In [6]:
df_small["URL_Type_obf_Type"] = df_small.URL_Type_obf_Type.replace({"benign":1, "phishing":0})

In [None]:
X = df_small.drop("URL_Type_obf_Type", axis=1)
y = df_small.URL_Type_obf_Type

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=23)

In [None]:
l = LogisticRegression()
l.fit(X_train, y_train)

LogisticRegression()

In [None]:
df_small.URL_Type_obf_Type.value_counts(normalize=True)

1    0.506345
0    0.493655
Name: URL_Type_obf_Type, dtype: float64

In [None]:
accuracy_score(y_test, l.predict(X_test)), accuracy_score(y_train, l.predict(X_train))

(0.7996096291476903, 0.8171317009680306)

In [None]:
f1_score(y_test, l.predict(X_test)), f1_score(y_train, l.predict(X_train))

0.8173190984578885

In [None]:
roc_auc_score(y_test, l.predict_proba(X_test)[:,1]), roc_auc_score(y_train, l.predict_proba(X_train)[:,1])

(0.8258435791612204, 0.8368585494043169)

In [None]:
confusion_matrix(y_test, l.predict(X_test))

array([[1080,  483],
       [ 133, 1378]])

In [6]:
np.random.seed(123)
df2 = df.sample(len(df)).dropna()
df2["URL_Type_obf_Type"] = df2.URL_Type_obf_Type.replace({"benign":0, "phishing":1})
df_small = df2.sample(1000)
X = df2.drop("URL_Type_obf_Type", axis=1)
y = df2.URL_Type_obf_Type

In [7]:
rf = RandomForestClassifier(random_state=23)
X_small = df_small.drop("URL_Type_obf_Type", axis=1).reset_index(drop=True)
y_small = df_small.URL_Type_obf_Type.reset_index(drop=True)

In [8]:
sfs = SequentialFeatureSelector(rf,
                                k_features="parsimonious",
                                forward=True,
                                scoring='accuracy',
                                cv=3,
                                )
selected_features = sfs.fit(X_small, y_small)

In [9]:
selected_features.k_feature_names_,len(selected_features.k_feature_names_)

(('domain_token_count',
  'avgpathtokenlen',
  'ldl_url',
  'ldl_path',
  'urlLen',
  'this.fileExtLen',
  'argDomanRatio',
  'argPathRatio',
  'NumberofDotsinURL',
  'CharacterContinuityRate',
  'URL_DigitCount',
  'Directory_LetterCount',
  'Query_LetterCount',
  'LongestPathTokenLength',
  'delimeter_Domain',
  'delimeter_path',
  'SymbolCount_URL',
  'Entropy_Domain',
  'Entropy_Afterpath'),
 19)

In [24]:
X = X[list(selected_features.k_feature_names_)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=3)

In [25]:
rfc = RandomForestClassifier(random_state=23)
rfc.fit(X_train, y_train)

In [29]:
def scoring(y_true, y_pred):

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"Accuracy: {accuracy}, f1: {f1}, precision: {precision}, recall: {recall}")

In [30]:
scoring(y_test, rfc.predict(X_test))

Accuracy: 0.9848891873740766, f1: 0.9847094801223242, precision: 0.9870572207084468, recall: 0.9823728813559321


In [31]:
scoring(y_train, rfc.predict(X_train))

Accuracy: 1.0, f1: 1.0, precision: 1.0, recall: 1.0


In [46]:
df2[['domain_token_count', 'avgpathtokenlen', 'ldl_url', 'ldl_path',
       'urlLen', 'this.fileExtLen', 'argDomanRatio', 'argPathRatio',
       'NumberofDotsinURL', 'CharacterContinuityRate', 'URL_DigitCount',
       'Directory_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
       'delimeter_Domain', 'delimeter_path', 'SymbolCount_URL',
       'Entropy_Domain', 'Entropy_Afterpath','URL_Type_obf_Type']].corr()

Unnamed: 0,domain_token_count,avgpathtokenlen,ldl_url,ldl_path,urlLen,this.fileExtLen,argDomanRatio,argPathRatio,NumberofDotsinURL,CharacterContinuityRate,URL_DigitCount,Directory_LetterCount,Query_LetterCount,LongestPathTokenLength,delimeter_Domain,delimeter_path,SymbolCount_URL,Entropy_Domain,Entropy_Afterpath,URL_Type_obf_Type
domain_token_count,1.0,0.093069,0.172421,0.113234,0.091688,-0.052561,-0.097838,-0.000902,0.52363,-0.70205,0.07109,0.056799,-0.061953,-0.180733,0.237385,-0.276724,0.258754,-0.512747,-0.089765,0.495367
avgpathtokenlen,0.093069,1.0,0.50135,0.506943,0.380419,-0.119786,-0.022322,-0.161606,0.104907,-0.179731,0.411423,0.146705,0.077186,0.300468,0.183053,-0.120092,-0.000266,-0.112524,-0.083031,0.119996
ldl_url,0.172421,0.50135,1.0,0.983149,0.623913,-0.097606,0.419513,0.213766,0.250297,-0.183431,0.832372,0.232313,0.401987,0.47092,0.160853,-0.088367,0.246274,-0.158176,0.181031,0.26728
ldl_path,0.113234,0.506943,0.983149,1.0,0.615676,-0.097162,0.432291,0.214687,0.225508,-0.148008,0.827712,0.244746,0.410537,0.481355,0.156408,-0.083168,0.242122,-0.142363,0.187233,0.260485
urlLen,0.091688,0.380419,0.623913,0.615676,1.0,-0.221057,0.479096,0.212634,0.279469,-0.087991,0.656298,0.499999,0.435828,0.768378,0.091394,0.476056,0.505552,-0.089687,0.261303,-0.084576
this.fileExtLen,-0.052561,-0.119786,-0.097606,-0.097162,-0.221057,1.0,-0.176559,-0.202443,0.189827,0.116701,-0.157046,0.052606,-0.094526,-0.229239,-0.055425,0.016727,-0.102156,-0.007596,-0.249206,0.148061
argDomanRatio,-0.097838,-0.022322,0.419513,0.432291,0.479096,-0.176559,1.0,0.839262,0.293697,0.041312,0.4377,0.258559,0.741794,0.404785,-0.056255,0.061732,0.629699,0.042557,0.729339,-0.054948
argPathRatio,-0.000902,-0.161606,0.213766,0.214687,0.212634,-0.202443,0.839262,1.0,0.277185,-0.013155,0.216199,0.109922,0.613311,0.167324,-0.042051,-0.106847,0.579984,-0.017955,0.868501,0.015951
NumberofDotsinURL,0.52363,0.104907,0.250297,0.225508,0.279469,0.189827,0.293697,0.277185,1.0,-0.369929,0.180677,0.511558,0.374765,-0.142533,0.098148,0.000625,0.633516,-0.318677,0.212802,0.368502
CharacterContinuityRate,-0.70205,-0.179731,-0.183431,-0.148008,-0.087991,0.116701,0.041312,-0.013155,-0.369929,1.0,-0.084444,-0.055204,0.051097,0.094729,-0.532419,0.227562,-0.212226,0.361565,0.054285,-0.402124


In [44]:
y_train.name

'URL_Type_obf_Type'

In [53]:
pd.DataFrame({'imp':rfc.feature_importances_},index=X_train.columns).sort_values(by='imp', ascending=False)

Unnamed: 0,imp
argDomanRatio,0.148293
domain_token_count,0.130895
LongestPathTokenLength,0.130831
NumberofDotsinURL,0.103838
delimeter_path,0.081286
urlLen,0.08122
Entropy_Domain,0.048153
CharacterContinuityRate,0.048137
ldl_path,0.036344
ldl_url,0.033276


In [54]:
X = X[list(selected_features.k_feature_names_)].drop(['Entropy_Afterpath','delimeter_Domain'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=30)

In [61]:
rfc2 = RandomForestClassifier(random_state=23)
rfc2.fit(X_train, y_train)

In [62]:
scoring(y_test, rfc2.predict(X_test))

Accuracy: 0.9811954331766286, f1: 0.9798561151079137, precision: 0.9819754866618601, recall: 0.9777458722182341


In [63]:
scoring(y_train, rfc2.predict(X_train))

Accuracy: 1.0, f1: 1.0, precision: 1.0, recall: 1.0


In [64]:
pd.DataFrame({'imp':rfc2.feature_importances_},index=X_train.columns).sort_values(by='imp', ascending=False)

Unnamed: 0,imp
domain_token_count,0.168535
argDomanRatio,0.142002
LongestPathTokenLength,0.138132
NumberofDotsinURL,0.103111
urlLen,0.080847
delimeter_path,0.067972
argPathRatio,0.044188
Entropy_Domain,0.041446
ldl_url,0.034289
CharacterContinuityRate,0.031693
