## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.compose import ColumnTransformer


## Importing the dataset

In [3]:
df = pd.read_csv('phishing_url_miniature.csv')

In [4]:
df.head()

Unnamed: 0,tld,country_code,url_length,path_rest_length,num_spcs_chars,domain_entropy,domain_age,created_year,updated_year,expires_year,word_count,tld_in_path_rest,Label
0,org,-1,98,84,0,3.03,-1,-1,-1,-1,6,0,bad
1,com,-1,70,57,0,3.19,-1,-1,-1,-1,1,0,bad
2,ru,-1,77,60,0,3.2,-1,-1,-1,-1,4,3,bad
3,com,-1,70,57,0,3.19,-1,-1,-1,-1,1,0,bad
4,net,-1,203,163,0,4.38,-1,-1,-1,-1,4,0,bad


In [5]:
df.shape

(667, 13)

In [6]:
column_to_print = df[df.columns[12]]

print(column_to_print)

0      bad
1      bad
2      bad
3      bad
4      bad
      ... 
662    bad
663    bad
664    bad
665    bad
666    bad
Name: Label, Length: 667, dtype: object


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tld               667 non-null    object 
 1   country_code      667 non-null    object 
 2   url_length        667 non-null    int64  
 3   path_rest_length  667 non-null    int64  
 4   num_spcs_chars    667 non-null    int64  
 5   domain_entropy    667 non-null    float64
 6   domain_age        667 non-null    int64  
 7   created_year      667 non-null    int64  
 8   updated_year      667 non-null    int64  
 9   expires_year      667 non-null    int64  
 10  word_count        667 non-null    int64  
 11  tld_in_path_rest  667 non-null    int64  
 12  Label             667 non-null    object 
dtypes: float64(1), int64(9), object(3)
memory usage: 67.9+ KB


In [8]:
df[['tld', 'country_code']] = df[['tld', 'country_code']].astype("string")

In [9]:
df.isnull().sum()

tld                 0
country_code        0
url_length          0
path_rest_length    0
num_spcs_chars      0
domain_entropy      0
domain_age          0
created_year        0
updated_year        0
expires_year        0
word_count          0
tld_in_path_rest    0
Label               0
dtype: int64

## Splitting the dataset into the Training set and Test set

In [10]:
from sklearn.model_selection import train_test_split

X = df.drop('Label', axis=1)
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Planning the Pipeline

In [11]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [12]:
from sklearn.preprocessing import LabelEncoder

# Perform Label Encoding on y_train
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Use the same encoder to encode y_test
y_test_encoded = label_encoder.transform(y_test)

In [13]:
from sklearn.preprocessing import OneHotEncoder

# One Hot Encoding Transformer
trf1 = ColumnTransformer([
    ('ohe_tld_country_code',OneHotEncoder(sparse=False,handle_unknown='ignore'),[0,1])
],remainder='passthrough')

In [14]:
from sklearn.preprocessing import MinMaxScaler

#OHE COLUMNS ARE FROM 0-12
# Scaling
trf2 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(14, 24))
], remainder='passthrough')

In [15]:
from sklearn.feature_selection import SelectKBest,chi2

# Feature selection
trf3 = SelectKBest(score_func=chi2, k=20)

In [16]:

from sklearn.tree import DecisionTreeClassifier

# train the model
trf4 = DecisionTreeClassifier()

## Create Pipeline

In [17]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4)
])


In [18]:
# train
pipe.fit(X_train, y_train)



## Explore the Pipeline

In [19]:
# Code here
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_tld_country_code',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [0, 1])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('scale', MinMaxScaler(), slice(14, 24, None))]),
 'trf3': SelectKBest(k=20, score_func=<function chi2 at 0x000002A4F9ACA340>),
 'trf4': DecisionTreeClassifier()}

In [20]:
# Predict
y_pred = pipe.predict(X_test)

In [21]:
y_pred

array(['bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad',
       'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'good', 'good', 'bad',
       'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good',
       'bad', 'good', 'bad', 'good', 'bad', 'good', 'good', 'bad', 'bad',
       'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'good', 'bad', 'bad',
       'bad', 'good', 'bad', 'good', 'good', 'bad', 'bad', 'good', 'bad',
       'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad',
       'bad', 'good', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'good',
       'bad', 'bad', 'good', 'good', 'bad', 'bad', 'bad', 'good', 'good',
       'bad', 'bad', 'good', 'good', 'good', 'bad', 'bad', 'bad', 'bad',
       'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
       'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'good',
       'good', 'bad', 'bad', 'good', 'good', 'bad', 'good', 'bad', 'good',
       'good', 'good', 'bad', 'bad', 'bad', 'good', 'good

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8955223880597015

## Cross Validation using Pipeline

In [23]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()



0.9136660201022746

## GridSearch using Pipeline

In [24]:
# gridsearchcv
params = {
    'trf4__max_depth':[1,2,3,4,5,None]
}

In [25]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)



In [26]:
grid.best_score_

0.9287779932992418

In [27]:
grid.best_params_

{'trf4__max_depth': 2}

## Exporting the Pipeline

In [None]:
# export 
import pickle
pickle.dump(pipe, open('pipe.pkl','wb'))