In [1]:
import numpy as np
import matplotlib as plt
import seaborn as sns
import pandas as pd
!unzip /content/72ptz43s9v-1.zip
df = pd.read_csv('/content/dataset_small.csv')

Archive:  /content/72ptz43s9v-1.zip
  inflating: dataset_small.csv       
  inflating: dataset_full.csv        


# **Data Cleaning**

In [17]:
df = pd.read_csv('/content/dataset_full.csv')

In [18]:
skipped_features = [
    'time_response',
    'domain_spf',
    'asn_ip',
    'qty_ip_resolved',
    'qty_nameservers',
    'qty_mx_servers',
    'ttl_hostname',
    'qty_redirects',
    'url_google_index',
    'domain_google_index', #remove url shorten and https one
    'time_domain_activation',
    'time_domain_expiration',
]


df = df.drop(skipped_features,axis=1)

In [19]:
class DataCleaning:

    def __init__(self, df, missing_threshold, corr_threshold):
        self.df = df
        self.missing_threshold = missing_threshold
        self.corr_threshold = corr_threshold

    def col_with_variance_0(self):
        columns_to_drop = []
        numerical_columns = [col for col in self.df.columns if self.df[col].dtype != 'O']
        for col in numerical_columns:
            if self.df[col].std() == 0:
                columns_to_drop.append(col)
        return columns_to_drop

    def get_redundant_cols(self):
        cols_missing_ratios = self.df.isna().sum().div(self.df.shape[0])
        cols_to_drop = list(cols_missing_ratios[cols_missing_ratios > self.missing_threshold].index)
        return cols_to_drop

    def dropping_columns_on_basis_of_correlation(self):
        columns_to_drop = set()
        relation = self.df.corr()
        for columns in range(len(relation.columns)):
            for rows in range(columns):
                if abs(relation.iloc[columns, rows]) > self.corr_threshold:
                    col_name = relation.columns[columns]
                    columns_to_drop.add(col_name)
        columns_to_drop = list(columns_to_drop)
        return columns_to_drop

    def feature_scaling_df(self):
        cols_to_drop_1 = self.get_redundant_cols()
        cols_to_drop_2 = self.col_with_variance_0()
        cols_to_drop_3 = self.dropping_columns_on_basis_of_correlation()
        columns_to_drop = cols_to_drop_1 + cols_to_drop_2 + cols_to_drop_3
        columns_to_drop = set(columns_to_drop)
        return columns_to_drop


clean = DataCleaning(df, 0.8, 0.8)
drop_columns = clean.feature_scaling_df()
print(drop_columns)
df2 = df.drop(columns=drop_columns)

{'qty_questionmark_domain', 'qty_percent_file', 'qty_dollar_directory', 'qty_and_url', 'qty_dollar_domain', 'qty_plus_directory', 'qty_plus_file', 'qty_slash_domain', 'qty_comma_domain', 'qty_space_domain', 'qty_and_directory', 'qty_space_params', 'qty_comma_file', 'qty_underline_file', 'qty_at_directory', 'qty_space_directory', 'qty_tilde_params', 'qty_and_domain', 'qty_dollar_file', 'qty_hashtag_params', 'qty_hashtag_file', 'domain_length', 'qty_dot_file', 'qty_plus_domain', 'qty_hyphen_file', 'qty_slash_file', 'qty_space_file', 'params_length', 'tld_present_params', 'qty_hashtag_domain', 'qty_comma_params', 'qty_equal_domain', 'qty_plus_params', 'qty_exclamation_file', 'qty_tilde_domain', 'qty_asterisk_directory', 'qty_exclamation_domain', 'qty_at_file', 'qty_and_file', 'qty_asterisk_domain', 'qty_asterisk_params', 'qty_equal_params', 'qty_exclamation_directory', 'qty_tilde_file', 'qty_and_params', 'qty_exclamation_params', 'qty_comma_directory', 'qty_dollar_params', 'qty_slash_dire

In [20]:
len(drop_columns)

59

In [21]:
len(list(df2.columns))

41

# **Data Transformation**

In [22]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [59]:
list(df2.columns)

['qty_dot_url',
 'qty_hyphen_url',
 'qty_underline_url',
 'qty_slash_url',
 'qty_questionmark_url',
 'qty_equal_url',
 'qty_at_url',
 'qty_exclamation_url',
 'qty_space_url',
 'qty_tilde_url',
 'qty_comma_url',
 'qty_plus_url',
 'qty_asterisk_url',
 'qty_hashtag_url',
 'qty_dollar_url',
 'qty_percent_url',
 'qty_tld_url',
 'length_url',
 'qty_dot_domain',
 'qty_hyphen_domain',
 'qty_underline_domain',
 'qty_at_domain',
 'qty_vowels_domain',
 'domain_in_ip',
 'server_client_domain',
 'qty_dot_directory',
 'qty_hyphen_directory',
 'qty_underline_directory',
 'qty_percent_directory',
 'directory_length',
 'file_length',
 'qty_dot_params',
 'qty_hyphen_params',
 'qty_underline_params',
 'qty_slash_params',
 'qty_questionmark_params',
 'qty_percent_params',
 'email_in_url',
 'tls_ssl_certificate',
 'url_shortened',
 'phishing']

In [25]:
X = df2.drop(columns='phishing',axis=1)
y = df2['phishing']

In [26]:
X.shape

(88647, 40)

In [27]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='all', k_neighbors=5)
X, Y = smote.fit_resample(X,y)


In [28]:
X.shape

(116000, 40)

In [30]:
Y.value_counts()

1    58000
0    58000
Name: phishing, dtype: int64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X ,Y, test_size=0.25, random_state=42)

In [32]:
pca = PCA(n_components=10)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

In [33]:
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_pca_train)
X_scaled_test = scaler.transform(X_pca_test)

In [58]:
import joblib

joblib.dump(pca, '/content/pca.joblib')
joblib.dump(scaler, '/content/scaler.joblib')

['/content/scaler.joblib']

# **HyperOPT**

In [13]:
!pip install hyperopt

# XGBClassifier

In [34]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials,space_eval

space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', range(3, 15)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'gamma': hp.uniform('gamma', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'n_estimators': hp.choice('n_estimators', range(50, 500)),
}

space

{'learning_rate': <hyperopt.pyll.base.Apply at 0x7fdd735d5b40>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x7fdd7359a290>,
 'min_child_weight': <hyperopt.pyll.base.Apply at 0x7fdd7353e3b0>,
 'subsample': <hyperopt.pyll.base.Apply at 0x7fdd7353e200>,
 'gamma': <hyperopt.pyll.base.Apply at 0x7fdd7353e650>,
 'colsample_bytree': <hyperopt.pyll.base.Apply at 0x7fdd7353dfc0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x7fdd7353dae0>}

In [35]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(space):
  model = XGBClassifier(
      learning_rate = space['learning_rate'],
      max_depth = space['max_depth'],
      min_child_weight = space['min_child_weight'],
      subsample = space['subsample'],
      gamma = space['gamma'],
      colsample_bytree = space['colsample_bytree'],
      n_estimators = space['n_estimators']
  )
  accuracy = cross_val_score(model, X_scaled_train, y_train, cv = 5).mean()

  # We aim to maximize accuracy, therefore we return it as a negative value
  return {'loss': accuracy, 'status': STATUS_OK }

In [36]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [14:39<00:00, 11.00s/trial, best loss: 0.9078620689655172]


{'colsample_bytree': 0.885851504127611,
 'gamma': 0.6411482852494903,
 'learning_rate': 0.034930371583044684,
 'max_depth': 0,
 'min_child_weight': 8.0,
 'n_estimators': 33,
 'subsample': 0.9191185596390956}

In [37]:
# Retrieve the best parameters
best_params = space_eval(space, best)
print("Best Hyperparameters:")
print(best_params)

Best Hyperparameters:
{'colsample_bytree': 0.885851504127611, 'gamma': 0.6411482852494903, 'learning_rate': 0.034930371583044684, 'max_depth': 3, 'min_child_weight': 8.0, 'n_estimators': 83, 'subsample': 0.9191185596390956}


In [46]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score , precision_score, fbeta_score, confusion_matrix

xgb_hyp = XGBClassifier(
    learning_rate=best_params['learning_rate'],
    n_estimators =best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_child_weight=best_params['min_child_weight'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree']
)

xgb_hyp.fit(X_scaled_train, y_train)

# Evaluate the performance of the XGBoost classifier
y_pred_xgb_hyp = xgb_hyp.predict(X_scaled_test)
score_xgb_hyp = accuracy_score(y_test, y_pred_xgb_hyp)

y_true = y_test
y_pred = y_pred_xgb_hyp

# Assuming y_true contains the true labels and y_pred contains the predicted labels
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)
print('Testing Data')
print('Accuracy of XGBClassifier using HyperOPT:', score_xgb_hyp)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

y_true = y_train
y_pred = xgb_hyp.predict(X_scaled_train)

# Assuming y_true contains the true labels and y_pred contains the predicted labels
accuracy = accuracy_score(y_true,y_pred)
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)
print('Trainind Data')
print('Accuracy of XGBClassifier using HyperOPT:', accuracy)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

Testing Data
Accuracy of XGBClassifier using HyperOPT: 0.9100689655172414
Precision: 0.8786370102159576
Recall: 0.9489525139664804
F2 Score: 0.9340032441646275
Trainind Data
Accuracy of XGBClassifier using HyperOPT: 0.9079080459770115
Precision: 0.8778389830508475
Recall: 0.948580586080586
F2 Score: 0.9335346070656092


In [47]:
df3 = pd.read_csv('/content/dataset_small.csv')
df3 = df3[list(df2.columns)]
X2 = df3.drop(columns='phishing',axis=1)
Y2 = df3['phishing']
X2 = pca.fit_transform(X2)
X2 = scaler.transform(X2)
ypred2 = xgb_hyp.predict(X2)
accuracy = accuracy_score(Y2,ypred2)
print('Accuracy  ' ,accuracy)
y_true = Y2
y_pred = ypred2
from sklearn.metrics import precision_score, fbeta_score, confusion_matrix

# Assuming y_true contains the true labels and y_pred contains the predicted labels
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

Accuracy   0.8419984653423139
Precision: 0.8364649230478708
Recall: 0.8671974418377003
F2 Score: 0.8608715932133117


In [55]:
import joblib

joblib.dump(xgb_hyp, '/content/xgb_hyp.joblib')


['/content/xgb_hyp.joblib']

# Random Forest Classifier

In [None]:
space = {
    'criterion': hp.choice('criterion', ['entropy', 'gini']),
    'max_depth': hp.quniform('max_depth', 10, 1200, 10),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
    'min_samples_split': hp.uniform('min_samples_split', 0, 1),
    'n_estimators': hp.choice('n_estimators', [10, 50, 300, 750, 1200, 1300, 1500])
}

In [None]:
from sklearn.ensemble import RandomForestClassifier

def objective(space):
    # Round 'max_depth' to an integer
    space['max_depth'] = int(space['max_depth'])

    model = RandomForestClassifier(
        criterion=space['criterion'],
        max_depth=space['max_depth'],
        max_features=space['max_features'],
        min_samples_leaf=space['min_samples_leaf'],
        min_samples_split=space['min_samples_split'],
        n_estimators=space['n_estimators']
    )

    accuracy = cross_val_score(model, X_pca_train, y_train, cv=5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
from sklearn.model_selection import cross_val_score

trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

  1%|▏         | 1/80 [00:01<02:32,  1.93s/trial, best loss: -0.5226337444092287]

  warn(

  warn(

  warn(



  2%|▎         | 2/80 [00:02<01:24,  1.08s/trial, best loss: -0.5226337444092287]

  warn(

  warn(



  5%|▌         | 4/80 [06:22<3:10:13, 150.18s/trial, best loss: -0.8311620093728468]

  warn(

  warn(

  warn(

  warn(

  warn(



 12%|█▎        | 10/80 [09:08<45:23, 38.91s/trial, best loss: -0.8311620093728468]

  warn(

  warn(

  warn(

  warn(

  warn(



 14%|█▍        | 11/80 [09:12<32:15, 28.05s/trial, best loss: -0.8311620093728468]

  warn(

  warn(

  warn(

  warn(

  warn(



 38%|███▊      | 30/80 [33:39<1:03:15, 75.91s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 39%|███▉      | 31/80 [36:55<1:31:24, 111.93s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 40%|████      | 32/80 [38:23<1:23:56, 104.92s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 41%|████▏     | 33/80 [38:57<1:05:20, 83.41s/trial, best loss: -0.849782830901208] 

  warn(

  warn(

  warn(

  warn(

  warn(



 42%|████▎     | 34/80 [42:10<1:29:11, 116.35s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 44%|████▍     | 35/80 [42:42<1:08:26, 91.26s/trial, best loss: -0.849782830901208] 

  warn(

  warn(

  warn(

  warn(

  warn(



 45%|████▌     | 36/80 [44:50<1:14:50, 102.07s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 48%|████▊     | 38/80 [48:52<1:21:15, 116.07s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 50%|█████     | 40/80 [49:46<46:41, 70.03s/trial, best loss: -0.849782830901208]  

  warn(

  warn(

  warn(

  warn(

  warn(



 51%|█████▏    | 41/80 [49:55<33:29, 51.53s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(



 52%|█████▎    | 42/80 [49:55<22:59, 36.30s/trial, best loss: -0.849782830901208]

  warn(



 55%|█████▌    | 44/80 [1:06:26<3:03:12, 305.35s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 59%|█████▉    | 47/80 [1:07:43<1:07:30, 122.75s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 64%|██████▍   | 51/80 [1:13:18<45:46, 94.70s/trial, best loss: -0.849782830901208]   

  warn(

  warn(

  warn(

  warn(

  warn(



 68%|██████▊   | 54/80 [1:15:17<23:13, 53.61s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 71%|███████▏  | 57/80 [1:21:00<36:12, 94.44s/trial, best loss: -0.849782830901208]

In [None]:
# Retrieve the best parameters
best_params = space_eval(space, best)
print("Best Hyperparameters:")
print(best_params)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_hyp = RandomForestClassifier(
    criterion=best_params['criterion'],
    max_depth =best_params['max_depth'],
    max_features=best_params['max_features'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    n_estimators=best_params['n_estimators']
)

rf_hyp.fit(X_pca_train, y_train)

# Evaluate the performance of the XGBoost classifier hyper
y_pred_rf_hyp = rf_hyp.predict(X_pca_test)
score_rf_hyp = accuracy_score(y_test, y_pred_rf_hyp)
print('Accuracy of Random Forest Classifier using HyperOPT:', score_rf_hyp)

# **TPOTClassifier**

In [None]:
!pip install tpot

# XGBClssifier

In [41]:
from tpot import TPOTClassifier

tpot_config = {
    'xgboost.XGBClassifier': {
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': range(50, 1000, 50),
        'max_depth': range(3, 15),
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'min_child_weight': range(1, 10)
    }
}


In [42]:
tpot = TPOTClassifier(generations=5, population_size=20, config_dict=tpot_config, verbosity=2, random_state=42, scoring='accuracy')
tpot.fit(X_scaled_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9385747126436783

Generation 2 - Current best internal CV score: 0.9385747126436783

Generation 3 - Current best internal CV score: 0.9385747126436783

Generation 4 - Current best internal CV score: 0.9385747126436783

Generation 5 - Current best internal CV score: 0.9385747126436783

Best pipeline: XGBClassifier(input_matrix, colsample_bytree=0.9, gamma=0.4, learning_rate=0.1, max_depth=10, min_child_weight=5, n_estimators=350, subsample=0.8)


In [43]:
tpot.get_params

<bound method BaseEstimator.get_params of TPOTClassifier(config_dict={'xgboost.XGBClassifier': {'colsample_bytree': [0.6,
                                                                           0.7,
                                                                           0.8,
                                                                           0.9,
                                                                           1.0],
                                                      'gamma': [0, 0.1, 0.2,
                                                                0.3, 0.4],
                                                      'learning_rate': [0.01,
                                                                        0.05,
                                                                        0.1,
                                                                        0.2,
                                                                        0.3],
                 

In [44]:
# Evaluate the performance of the XGBoost classifier
y_pred_xgb_tpot = tpot.predict(X_scaled_test)
score_xgb_tpot = accuracy_score(y_test, y_pred_xgb_tpot)


y_true = y_test
y_pred = y_pred_xgb_tpot

# Assuming y_true contains the true labels and y_pred contains the predicted labels
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)
print('Testing Data')
print('Accuracy of XGBClassifier using HyperOPT:', score_xgb_tpot)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

y_true = y_train
y_pred = tpot.predict(X_scaled_train)

# Assuming y_true contains the true labels and y_pred contains the predicted labels
accuracy = accuracy_score(y_true,y_pred)
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)
print('Trainind Data')
print('Accuracy of XGBClassifier using HyperOPT:', accuracy)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

Testing Data
Accuracy of XGBClassifier using HyperOPT: 0.9387241379310345
Precision: 0.938719832109129
Recall: 0.9370810055865921
F2 Score: 0.9374083129584352
Trainind Data
Accuracy of XGBClassifier using HyperOPT: 0.9613103448275863
Precision: 0.9688110521909015
Recall: 0.9536401098901099
F2 Score: 0.956636168219148


In [45]:
df3 = pd.read_csv('/content/dataset_small.csv')
df3 = df3[list(df2.columns)]
X2 = df3.drop(columns='phishing',axis=1)
Y2 = df3['phishing']
X2 = pca.fit_transform(X2)
X2 = scaler.transform(X2)
ypred2 = xgb_hyp.predict(X2)
accuracy = accuracy_score(Y2,ypred2)
print('Accuracy  ' ,accuracy)
y_true = Y2
y_pred = ypred2
from sklearn.metrics import precision_score, fbeta_score, confusion_matrix

# Assuming y_true contains the true labels and y_pred contains the predicted labels
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

Accuracy   0.8419984653423139
Precision: 0.8364649230478708
Recall: 0.8671974418377003
F2 Score: 0.8608715932133117


In [56]:
import joblib

# Save only the fitted pipeline of TPOT
joblib.dump(tpot.fitted_pipeline_, '/content/tpot_xgbclassifier_pipeline.joblib')


['/content/tpot_xgbclassifier_pipeline.joblib']

# Random Forest Classifier

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [None]:

from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param},
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_pca_train,y_train)

Optimization Progress:   0%|          | 0/84 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9015301835336046


In [None]:
accuracy = tpot_classifier.score(X_pca_test, y_test)
print(accuracy)

In [None]:
tpot_classifier.get_params

In [None]:
import joblib

# Save only the fitted pipeline of TPOT
joblib.dump(tpot_classifier.fitted_pipeline_, '/content/tpot_randomforestclassifier_pipeline.joblib')


# **Optuna**

# XGBClassifier

In [48]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.1 colorlog-6.8.0 optuna-3.5.0


In [49]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def objective(trial,X_scaled_train,y_train,X_scaled_test,y_test):
    train_x, valid_x, train_y, valid_y = X_scaled_train,X_scaled_test,y_train,y_test

    params = {
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int("n_estimators", 50, 500),
        'max_depth': trial.suggest_int("max_depth", 3, 10),
        'subsample': trial.suggest_categorical("subsample", [0.8, 0.9, 1.0]),
        'colsample_bytree': trial.suggest_categorical("colsample_bytree", [0.8, 0.9, 1.0]),
        'gamma': trial.suggest_categorical("gamma", [0, 0.1, 0.2, 0.3, 0.4]),
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 10)
    }

    # Create XGBoost classifier with the suggested hyperparameters
    clf = XGBClassifier(**params)

    # Train the classifier on the training set
    clf.fit(train_x, train_y)

    # Calculate accuracy on the validation set
    accuracy = accuracy_score(valid_y, clf.predict(valid_x))

    return 1.0 - accuracy  # Optimize for accuracy

In [51]:

# Create a study object and optimize the objective function
study = optuna.create_study(direction="minimize")  # Minimize because we want to minimize 1.0 - accuracy
study.optimize(lambda trial: objective(trial, X_scaled_train, y_train, X_scaled_test, y_test), n_trials=100)


[I 2024-01-22 11:29:49,174] A new study created in memory with name: no-name-3d6a74c4-5baf-4322-8d2a-c6ac0e6f95e7
[I 2024-01-22 11:29:54,396] Trial 0 finished with value: 0.9323448275862068 and parameters: {'learning_rate': 0.10093319691710702, 'n_estimators': 450, 'max_depth': 4, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'min_child_weight': 4}. Best is trial 0 with value: 0.9323448275862068.
[I 2024-01-22 11:29:57,454] Trial 1 finished with value: 0.9366551724137931 and parameters: {'learning_rate': 0.06442152251343282, 'n_estimators': 256, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 1.0, 'gamma': 0.2, 'min_child_weight': 10}. Best is trial 0 with value: 0.9323448275862068.
[I 2024-01-22 11:29:59,086] Trial 2 finished with value: 0.9378965517241379 and parameters: {'learning_rate': 0.22686106422332056, 'n_estimators': 234, 'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 0.8, 'gamma': 0.2, 'min_child_weight': 3}. Best is trial 0 with value: 0.9323448275862

In [52]:
# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 0.010044370709321186, 'n_estimators': 50, 'max_depth': 3, 'subsample': 1.0, 'colsample_bytree': 0.9, 'gamma': 0.4, 'min_child_weight': 1}


In [53]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score , precision_score, fbeta_score, confusion_matrix

xgb_opt = XGBClassifier(
    learning_rate=best_params['learning_rate'],
    n_estimators =best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_child_weight=best_params['min_child_weight'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree']
)

xgb_opt.fit(X_scaled_train, y_train)

# Evaluate the performance of the XGBoost classifier
y_pred_xgb_opt = xgb_opt.predict(X_scaled_test)
score_xgb_opt = accuracy_score(y_test, y_pred_xgb_opt)

y_true = y_test
y_pred = y_pred_xgb_opt

# Assuming y_true contains the true labels and y_pred contains the predicted labels
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)
print('Testing Data')
print('Accuracy of XGBClassifier using HyperOPT:', score_xgb_hyp)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

y_true = y_train
y_pred = xgb_opt.predict(X_scaled_train)

# Assuming y_true contains the true labels and y_pred contains the predicted labels
accuracy = accuracy_score(y_true,y_pred)
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)
print('Trainind Data')
print('Accuracy of XGBClassifier using HyperOPT:', accuracy)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

Testing Data
Accuracy of XGBClassifier using HyperOPT: 0.9100689655172414
Precision: 0.881242626818718
Recall: 0.9389664804469273
F2 Score: 0.9268245609197936
Trainind Data
Accuracy of XGBClassifier using HyperOPT: 0.9056206896551724
Precision: 0.8808220060555305
Recall: 0.9390796703296703
F2 Score: 0.9268196792429808


In [54]:
df3 = pd.read_csv('/content/dataset_small.csv')
df3 = df3[list(df2.columns)]
X2 = df3.drop(columns='phishing',axis=1)
Y2 = df3['phishing']
X2 = pca.fit_transform(X2)
X2 = scaler.transform(X2)
ypred2 = xgb_opt.predict(X2)
accuracy = accuracy_score(Y2,ypred2)
print('Accuracy  ' ,accuracy)
y_true = Y2
y_pred = ypred2
from sklearn.metrics import precision_score, fbeta_score, confusion_matrix

# Assuming y_true contains the true labels and y_pred contains the predicted labels
precision = precision_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
f2_score = fbeta_score(y_true, y_pred, beta=2)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F2 Score: {f2_score}')

Accuracy   0.8348196777218859
Precision: 0.8418357361863136
Recall: 0.8421378927790648
F2 Score: 0.8420774441094709


In [57]:
import joblib

joblib.dump(xgb_hyp, '/content/xgb_opt.joblib')


['/content/xgb_opt.joblib']

# **Data Extraction From Url**

In [60]:
import numpy as np
from urllib.parse import urlparse, parse_qs
import joblib
import socket

def is_ip_address(domain):
    try:
        socket.inet_aton(domain)
        return True
    except socket.error:
        return False

def extract_additional_url_features(url):
    parsed_url = urlparse(url)

    return {
        'qty_dot_url': url.count('.'),
        'qty_hyphen_url': url.count('-'),
        'qty_underline_url': url.count('_'),
        'qty_slash_url': url.count('/'),
        'qty_questionmark_url': url.count('?'),
        'qty_equal_url': url.count('='),
        'qty_at_url': url.count('@'),
        'qty_exclamation_url': url.count('!'),
        'qty_space_url': url.count(' '),
        'qty_tilde_url': url.count('~'),
        'qty_comma_url': url.count(','),
        'qty_plus_url': url.count('+'),
        'qty_asterisk_url': url.count('*'),
        'qty_hashtag_url': url.count('#'),
        'qty_dollar_url': url.count('$'),
        'qty_percent_url': url.count('%'),
        'qty_tld_url': len(parsed_url.netloc.split('.')[-1]),
        'length_url': len(url)
    }

def extract_additional_domain_features(url):
    # Parse the URL to get the domain
    domain = urlparse(url).netloc

    if not domain:
        return {
            'qty_dot_domain': -1,
            'qty_hyphen_domain': -1,
            'qty_underline_domain': -1,
            'qty_at_domain': -1,
            'qty_vowels_domain': -1,
            'domain_in_ip': -1,
            'server_client_domain': -1
        }

    return {
        'qty_dot_domain': domain.count('.'),
        'qty_hyphen_domain': domain.count('-'),
        'qty_underline_domain': domain.count('_'),
        'qty_at_domain': domain.count('@'),
        'qty_vowels_domain': sum(1 for char in domain if char.lower() in "aeiou"),
        'domain_in_ip': 1 if is_ip_address(domain) else 0,
        'server_client_domain': 1 if domain.startswith("www.") else 0
    }

def extract_additional_path_features(url):
    # Parse the URL to get the path
    path = urlparse(url).path

    if not path:
        return {
            'qty_dot_directory': -1,
            'qty_hyphen_directory': -1,
            'qty_underline_directory': -1,
            'qty_percent_directory': -1,
            'directory_length': -1
        }

    return {
        'qty_dot_directory': path.count('.'),
        'qty_hyphen_directory': path.count('-'),
        'qty_underline_directory': path.count('_'),
        'qty_percent_directory': path.count('%'),
        'directory_length': len(path)
    }

def extract_file_features(url):
    # Parse the URL to get the path
    path = urlparse(url).path

    if not path:
        return {
            'file_length': -1
        }

    # Extract the file name from the path
    file_name = path.split('/')[-1]

    # Attribute: Length of the file name
    file_length = len(file_name)

    return {
        'file_length': file_length
    }

def extract_additional_params_features(url):
    # Parse the URL to get the query parameters
    query_params = urlparse(url).query

    if not query_params:
        return {
            'qty_dot_params': -1,
            'qty_hyphen_params': -1,
            'qty_underline_params': -1,
            'qty_slash_params': -1,
            'qty_questionmark_params': -1,
            'qty_percent_params': -1
        }

    # Extract parameter names from the query string
    param_names = parse_qs(query_params).keys()

    return {
        'qty_dot_params': sum(param.count('.') for param in param_names),
        'qty_hyphen_params': sum(param.count('-') for param in param_names),
        'qty_underline_params': sum(param.count('_') for param in param_names),
        'qty_slash_params': sum(param.count('/') for param in param_names),
        'qty_questionmark_params': sum(param.count('?') for param in param_names),
        'qty_percent_params': sum(param.count('%') for param in param_names)
    }

def email_urlshorten(url):
    # Parse the URL
    parsed_url = urlparse(url)

    # Extract the domain from the URL
    domain = parsed_url.netloc

    if not domain:
        return {
            'email_in_url': -1,
            'tls_ssl_certificate' : -1,
            'url_shortened': -1
        }

    return {
        'email_in_url': 1 if '@' in url else 0,
        'tls_ssl_certificate' : 1 if url.startswith("https://") else 0,
        'url_shortened': 1 if domain in ['bit.ly', 'goo.gl', 'tinyurl.com', 'ow.ly'] else 0
    }

def extract_all_features(url):
    # Extract URL-based features
    url_features = extract_additional_url_features(url)

    # Extract Domain-based features
    domain_features = extract_additional_domain_features(url)

    # Extract Page-based features
    path_features = extract_additional_path_features(url)

    # Extract File-based feature
    file_feature = extract_file_features(url)

    # Extract Params-based features
    params_features = extract_additional_params_features(url)

    # Extract Additional Features
    additional_features = email_urlshorten(url)

    # Combine all features
    all_features = {**url_features, **domain_features, **path_features, **file_feature, **params_features, **additional_features}

    return all_features


# **Testing**

# HyperOPT XGBClassifier

In [62]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
# http://app.validchk.com/visitqr.aspx?vid=1073653
extracted_features = extract_all_features(url)

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(data)

# Assuming you have a scaler object
scaled_data = scaler.transform(pca_transformed_data)


# Use the trained XGBBoost for prediction
prediction = xgb_hyp.predict(scaled_data)
print(prediction)

[1]




# HyperOPT Random Forest Classifier

In [None]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
extracted_features = extract_all_features(url)

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a scaler object
scaled_data = scaler.transform(data)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(scaled_data)

# Use the trained XGBBoost for prediction
prediction = rf_hyp.predict(pca_transformed_data)
print(prediction)

array([[ 3.89625317, -2.77028969, -1.01534515, -1.5399523 , -1.37525167,
         3.87685379, -1.00835236,  4.63073484,  5.3159082 , -7.51699881,
        -4.32969553,  5.15898124, -0.03700143,  0.6175854 , -2.4718587 ,
        -4.37237251, -1.51808813, -3.84874513, -3.22719764,  2.09611946]])

# TPOTClassifier Xgboost

In [None]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
extracted_features = extract_all_features(url)
# Load the fitted pipeline
loaded_pipeline = joblib.load('/content/tpot_xgbclassifier_pipeline.joblib')

# Now you can use loaded_pipeline for predictions

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a scaler object
scaled_data = scaler.transform(data)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(scaled_data)

# Use the trained XGBBoost for prediction
#prediction = loaded_pipeline.predict(pca_transformed_data)
prediction = tpot.predict(pca_transformed_data)

print(prediction)

[1]




# TPOTClassifier Random Forest

In [None]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
extracted_features = extract_all_features(url)

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a scaler object
scaled_data = scaler.transform(data)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(scaled_data)

# Use the trained XGBBoost for prediction
prediction = xgb_hyp.predict(pca_transformed_data)
print(prediction)