In [1]:
import numpy as np
import matplotlib as plt
import seaborn as sns
import pandas as pd
!unzip /content/72ptz43s9v-1.zip
df = pd.read_csv('/content/dataset_small.csv')

Archive:  /content/72ptz43s9v-1.zip
  inflating: dataset_small.csv       
  inflating: dataset_full.csv        


# **Data Cleaning**

In [2]:
skipped_features = [
    'time_response',
    'domain_spf',
    'asn_ip',
    'qty_ip_resolved',
    'qty_nameservers',
    'qty_mx_servers',
    'ttl_hostname',
    'tls_ssl_certificate',
    'qty_redirects',
    'url_google_index',
    'domain_google_index',
    'url_shortened',
    'time_domain_activation',
    'time_domain_expiration',
]


df = df.drop(skipped_features,axis=1)

In [3]:
class DataCleaning:

    def __init__(self, df, missing_threshold, corr_threshold):
        self.df = df
        self.missing_threshold = missing_threshold
        self.corr_threshold = corr_threshold

    def col_with_variance_0(self):
        columns_to_drop = []
        numerical_columns = [col for col in self.df.columns if self.df[col].dtype != 'O']
        for col in numerical_columns:
            if self.df[col].std() == 0:
                columns_to_drop.append(col)
        return columns_to_drop

    def get_redundant_cols(self):
        cols_missing_ratios = self.df.isna().sum().div(self.df.shape[0])
        cols_to_drop = list(cols_missing_ratios[cols_missing_ratios > self.missing_threshold].index)
        return cols_to_drop

    def dropping_columns_on_basis_of_correlation(self):
        columns_to_drop = set()
        relation = self.df.corr()
        for columns in range(len(relation.columns)):
            for rows in range(columns):
                if abs(relation.iloc[columns, rows]) > self.corr_threshold:
                    col_name = relation.columns[columns]
                    columns_to_drop.add(col_name)
        columns_to_drop = list(columns_to_drop)
        return columns_to_drop

    def feature_scaling_df(self):
        cols_to_drop_1 = self.get_redundant_cols()
        cols_to_drop_2 = self.col_with_variance_0()
        cols_to_drop_3 = self.dropping_columns_on_basis_of_correlation()
        columns_to_drop = cols_to_drop_1 + cols_to_drop_2 + cols_to_drop_3
        columns_to_drop = set(columns_to_drop)
        return columns_to_drop


clean = DataCleaning(df, 0.8, 0.8)
drop_columns = clean.feature_scaling_df()
print(drop_columns)
df2 = df.drop(columns=drop_columns)

{'qty_comma_params', 'qty_asterisk_directory', 'params_length', 'qty_tilde_params', 'qty_equal_file', 'qty_equal_domain', 'qty_tilde_directory', 'qty_dot_file', 'qty_slash_file', 'qty_equal_params', 'qty_and_file', 'qty_dollar_directory', 'qty_questionmark_domain', 'qty_asterisk_domain', 'qty_at_file', 'qty_plus_file', 'qty_equal_directory', 'tld_present_params', 'qty_at_directory', 'qty_hashtag_directory', 'qty_underline_file', 'qty_tilde_file', 'qty_percent_domain', 'domain_length', 'qty_comma_file', 'qty_space_domain', 'qty_comma_directory', 'qty_percent_file', 'qty_percent_directory', 'qty_at_params', 'qty_plus_domain', 'qty_space_directory', 'qty_hashtag_params', 'qty_hashtag_domain', 'qty_exclamation_file', 'qty_params', 'qty_tilde_domain', 'qty_exclamation_params', 'qty_dollar_domain', 'qty_and_params', 'qty_plus_directory', 'qty_and_domain', 'qty_exclamation_domain', 'qty_and_directory', 'qty_slash_directory', 'qty_dollar_params', 'qty_and_url', 'qty_space_params', 'qty_space_f

In [4]:
len(drop_columns)

58

# **Data Transformation**

In [5]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [6]:
X = df2.drop(columns='phishing',axis=1)
y = df2['phishing']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size=0.25, random_state=42)

In [8]:
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [9]:
pca = PCA(n_components=20)
X_pca_train = pca.fit_transform(X_scaled_train)
X_pca_test = pca.transform(X_scaled_test)

# **HyperOPT**

In [None]:
!pip install hyperopt



# XGBClassifier

In [None]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials,space_eval

space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', range(3, 15)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'gamma': hp.uniform('gamma', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'n_estimators': hp.choice('n_estimators', range(50, 500)),
}

space

{'learning_rate': <hyperopt.pyll.base.Apply at 0x7a3da6288d00>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x7a3d9fdf2b90>,
 'min_child_weight': <hyperopt.pyll.base.Apply at 0x7a3d9fdf2ec0>,
 'subsample': <hyperopt.pyll.base.Apply at 0x7a3d9fdf2fe0>,
 'gamma': <hyperopt.pyll.base.Apply at 0x7a3d9fdf3100>,
 'colsample_bytree': <hyperopt.pyll.base.Apply at 0x7a3d9fdf3220>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x7a3d9fdf3340>}

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(space):
  model = XGBClassifier(
      learning_rate = space['learning_rate'],
      max_depth = space['max_depth'],
      min_child_weight = space['min_child_weight'],
      subsample = space['subsample'],
      gamma = space['gamma'],
      colsample_bytree = space['colsample_bytree'],
      n_estimators = space['n_estimators']
  )
  accuracy = cross_val_score(model, X_pca_train, y_train, cv = 5).mean()

  # We aim to maximize accuracy, therefore we return it as a negative value
  return {'loss': accuracy, 'status': STATUS_OK }

In [None]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [19:49<00:00, 14.87s/trial, best loss: 0.8543072866370016]


{'colsample_bytree': 0.9226435233167982,
 'gamma': 0.07428468227320731,
 'learning_rate': 0.010899447619528022,
 'max_depth': 0,
 'min_child_weight': 10.0,
 'n_estimators': 40,
 'subsample': 0.9956101034428355}

In [None]:
# Retrieve the best parameters
best_params = space_eval(space, best)
print("Best Hyperparameters:")
print(best_params)

Best Hyperparameters:
{'colsample_bytree': 0.9226435233167982, 'gamma': 0.07428468227320731, 'learning_rate': 0.010899447619528022, 'max_depth': 3, 'min_child_weight': 10.0, 'n_estimators': 90, 'subsample': 0.9956101034428355}


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb_hyp = XGBClassifier(
    learning_rate=best_params['learning_rate'],
    n_estimators =best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_child_weight=best_params['min_child_weight'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree']
)

xgb_hyp.fit(X_pca_train, y_train)

# Evaluate the performance of the XGBoost classifier
y_pred_xgb_hyp = xgb_hyp.predict(X_pca_test)
score_xgb_hyp = accuracy_score(y_test, y_pred_xgb_hyp)
print('Accuracy of XGBClassifier using HyperOPT:', score_xgb_hyp)

Accuracy of XGBClassifier using HyperOPT: 0.8518619560769336


# Random Forest Classifier

In [None]:
space = {
    'criterion': hp.choice('criterion', ['entropy', 'gini']),
    'max_depth': hp.quniform('max_depth', 10, 1200, 10),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
    'min_samples_split': hp.uniform('min_samples_split', 0, 1),
    'n_estimators': hp.choice('n_estimators', [10, 50, 300, 750, 1200, 1300, 1500])
}

In [None]:
from sklearn.ensemble import RandomForestClassifier

def objective(space):
    # Round 'max_depth' to an integer
    space['max_depth'] = int(space['max_depth'])

    model = RandomForestClassifier(
        criterion=space['criterion'],
        max_depth=space['max_depth'],
        max_features=space['max_features'],
        min_samples_leaf=space['min_samples_leaf'],
        min_samples_split=space['min_samples_split'],
        n_estimators=space['n_estimators']
    )

    accuracy = cross_val_score(model, X_pca_train, y_train, cv=5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
from sklearn.model_selection import cross_val_score

trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

  1%|▏         | 1/80 [00:01<02:32,  1.93s/trial, best loss: -0.5226337444092287]

  warn(

  warn(

  warn(



  2%|▎         | 2/80 [00:02<01:24,  1.08s/trial, best loss: -0.5226337444092287]

  warn(

  warn(



  5%|▌         | 4/80 [06:22<3:10:13, 150.18s/trial, best loss: -0.8311620093728468]

  warn(

  warn(

  warn(

  warn(

  warn(



 12%|█▎        | 10/80 [09:08<45:23, 38.91s/trial, best loss: -0.8311620093728468]

  warn(

  warn(

  warn(

  warn(

  warn(



 14%|█▍        | 11/80 [09:12<32:15, 28.05s/trial, best loss: -0.8311620093728468]

  warn(

  warn(

  warn(

  warn(

  warn(



 38%|███▊      | 30/80 [33:39<1:03:15, 75.91s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 39%|███▉      | 31/80 [36:55<1:31:24, 111.93s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 40%|████      | 32/80 [38:23<1:23:56, 104.92s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 41%|████▏     | 33/80 [38:57<1:05:20, 83.41s/trial, best loss: -0.849782830901208] 

  warn(

  warn(

  warn(

  warn(

  warn(



 42%|████▎     | 34/80 [42:10<1:29:11, 116.35s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 44%|████▍     | 35/80 [42:42<1:08:26, 91.26s/trial, best loss: -0.849782830901208] 

  warn(

  warn(

  warn(

  warn(

  warn(



 45%|████▌     | 36/80 [44:50<1:14:50, 102.07s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 48%|████▊     | 38/80 [48:52<1:21:15, 116.07s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 50%|█████     | 40/80 [49:46<46:41, 70.03s/trial, best loss: -0.849782830901208]  

  warn(

  warn(

  warn(

  warn(

  warn(



 51%|█████▏    | 41/80 [49:55<33:29, 51.53s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(



 52%|█████▎    | 42/80 [49:55<22:59, 36.30s/trial, best loss: -0.849782830901208]

  warn(



 55%|█████▌    | 44/80 [1:06:26<3:03:12, 305.35s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 59%|█████▉    | 47/80 [1:07:43<1:07:30, 122.75s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 64%|██████▍   | 51/80 [1:13:18<45:46, 94.70s/trial, best loss: -0.849782830901208]   

  warn(

  warn(

  warn(

  warn(

  warn(



 68%|██████▊   | 54/80 [1:15:17<23:13, 53.61s/trial, best loss: -0.849782830901208]

  warn(

  warn(

  warn(

  warn(

  warn(



 71%|███████▏  | 57/80 [1:21:00<36:12, 94.44s/trial, best loss: -0.849782830901208]

In [None]:
# Retrieve the best parameters
best_params = space_eval(space, best)
print("Best Hyperparameters:")
print(best_params)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_hyp = RandomForestClassifier(
    criterion=best_params['criterion'],
    max_depth =best_params['max_depth'],
    max_features=best_params['max_features'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    n_estimators=best_params['n_estimators']
)

rf_hyp.fit(X_pca_train, y_train)

# Evaluate the performance of the XGBoost classifier hyper
y_pred_rf_hyp = rf_hyp.predict(X_pca_test)
score_rf_hyp = accuracy_score(y_test, y_pred_rf_hyp)
print('Accuracy of Random Forest Classifier using HyperOPT:', score_rf_hyp)

# **TPOTClassifier**

In [10]:
!pip install tpot

Collecting tpot
  Downloading TPOT-0.12.1-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25l[?25hdone
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11937 sha256=67f31e0783050616cfb851e032a522345932bae4f23f4b396f7dabb24e8ededc
  Stored in directory: /r

# XGBClssifier

In [13]:
from tpot import TPOTClassifier

tpot_config = {
    'xgboost.XGBClassifier': {
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'n_estimators': range(50, 500),
        'max_depth': range(3, 10),
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'min_child_weight': range(1, 10)
    }
}

In [14]:
tpot = TPOTClassifier(generations=5, population_size=20, config_dict=tpot_config, verbosity=2, random_state=42, scoring='accuracy')
tpot.fit(X_pca_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9015074281958644

Generation 2 - Current best internal CV score: 0.9017347370736944

Generation 3 - Current best internal CV score: 0.9022349541109735

Generation 4 - Current best internal CV score: 0.9022349541109735

Generation 5 - Current best internal CV score: 0.9022349541109735

Best pipeline: XGBClassifier(input_matrix, colsample_bytree=1.0, gamma=0.1, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=356, subsample=0.9)


In [15]:
accuracy = tpot.score(X_pca_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9038330377847497


In [16]:
tpot.get_params

<bound method BaseEstimator.get_params of TPOTClassifier(config_dict={'xgboost.XGBClassifier': {'colsample_bytree': [0.8,
                                                                           0.9,
                                                                           1.0],
                                                      'gamma': [0, 0.1, 0.2,
                                                                0.3, 0.4],
                                                      'learning_rate': [0.01,
                                                                        0.1,
                                                                        0.2,
                                                                        0.3],
                                                      'max_depth': range(3, 10),
                                                      'min_child_weight': range(1, 10),
                                                      'n_estimators': range(50, 500),


In [30]:
import joblib

# Save only the fitted pipeline of TPOT
joblib.dump(tpot.fitted_pipeline_, '/content/tpot_xgbclassifier_pipeline.joblib')


['/content/tpot_xgbclassifier_pipeline.joblib']

# Random Forest Classifier

In [33]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [None]:

from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param},
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_pca_train,y_train)

Optimization Progress:   0%|          | 0/84 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9015301835336046


In [None]:
accuracy = tpot_classifier.score(X_pca_test, y_test)
print(accuracy)

In [None]:
tpot_classifier.get_params

In [None]:
import joblib

# Save only the fitted pipeline of TPOT
joblib.dump(tpot_classifier.fitted_pipeline_, '/content/tpot_randomforestclassifier_pipeline.joblib')


# **Optuna**

# XGBClassifier

In [None]:
!pip install optuna

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def objective(trial,X_pca_train,y_train,X_pca_test,y_test):
    train_x, valid_x, train_y, valid_y = X_pca_train,X_pca_test,y_train,y_test

    params = {
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int("n_estimators", 50, 500),
        'max_depth': trial.suggest_int("max_depth", 3, 10),
        'subsample': trial.suggest_categorical("subsample", [0.8, 0.9, 1.0]),
        'colsample_bytree': trial.suggest_categorical("colsample_bytree", [0.8, 0.9, 1.0]),
        'gamma': trial.suggest_categorical("gamma", [0, 0.1, 0.2, 0.3, 0.4]),
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 10)
    }

    # Create XGBoost classifier with the suggested hyperparameters
    clf = XGBClassifier(**params)

    # Train the classifier on the training set
    clf.fit(train_x, train_y)

    # Calculate accuracy on the validation set
    accuracy = accuracy_score(valid_y, clf.predict(valid_x))

    return 1.0 - accuracy  # Optimize for accuracy

In [None]:

# Create a study object and optimize the objective function
study = optuna.create_study(direction="minimize")  # Minimize because we want to minimize 1.0 - accuracy
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# **Data Extraction From Url**

In [18]:
import re
import socket

In [19]:
def extract_additional_url_features(url):
    parsed_url = urlparse(url)

    # Extracting features based on characters in the URL
    char_count_features = {
        'qty_dot_url': url.count('.'),
        'qty_hyphen_url': url.count('-'),
        'qty_underline_url': url.count('_'),
        'qty_slash_url': url.count('/'),
        'qty_questionmark_url': url.count('?'),
        'qty_equal_url': url.count('='),
        'qty_at_url': url.count('@'),
        'qty_exclamation_url': url.count('!'),
        'qty_space_url': url.count(' '),
        'qty_tilde_url': url.count('~'),
        'qty_comma_url': url.count(','),
        'qty_plus_url': url.count('+'),
        'qty_asterisk_url': url.count('*'),
        'qty_hashtag_url': url.count('#'),
        'qty_dollar_url': url.count('$'),
        'qty_percent_url': url.count('%'),
        'qty_tld_url': url.count('.') - 1,  # Subtracting 1 to exclude the dot in TLD
        'length_url': len(url)
    }

    return char_count_features

In [20]:

def extract_additional_domain_features(url):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    # Extracting features based on characters in the domain
    char_count_features = {
        'qty_dot_domain': domain.count('.'),
        'qty_hyphen_domain': domain.count('-'),
        'qty_underline_domain': domain.count('_'),
        'qty_at_domain': domain.count('@'),
        'qty_vowels_domain': sum(1 for char in domain if char.lower() in 'aeiou'),
    }

    # Check if the domain is an IP address
    try:
        ip_address = socket.gethostbyname(domain)
        domain_in_ip = 1
    except socket.error:
        domain_in_ip = 0

    # Check if "server" or "client" is present as a separate word in the domain
    server_client_domain = 1 if re.search(r'\b(server|client)\b', domain, flags=re.IGNORECASE) else 0

    char_count_features['domain_in_ip'] = domain_in_ip
    char_count_features['server_client_domain'] = server_client_domain

    return char_count_features


In [21]:
def extract_additional_path_features(url):
    parsed_url = urlparse(url)
    path = parsed_url.path

    # Extracting features based on characters in the directory
    directory_features = {
        'qty_dot_directory': path.count('.'),
        'qty_hyphen_directory': path.count('-'),
        'qty_underline_directory': path.count('_'),
        'qty_questionmark_directory': path.count('?'),
        'directory_length': len(path),
    }

    # Extracting features based on characters in the file
    file_features = {
        'qty_hyphen_file': parsed_url.path.rfind('-'),
        'file_length': len(parsed_url.path),
    }

    return {**directory_features, **file_features}

In [22]:
def extract_additional_params_features(url):
    parsed_url = urlparse(url)
    params = parse_qs(parsed_url.query)

    # Extracting features based on characters in the parameters
    params_features = {
        'qty_dot_params': sum(value[0].count('.') for value in params.values()),
        'qty_hyphen_params': sum(value[0].count('-') for value in params.values()),
        'qty_underline_params': sum(value[0].count('_') for value in params.values()),
        'qty_slash_params': sum(value[0].count('/') for value in params.values()),
        'qty_questionmark_params': sum(value[0].count('?') for value in params.values()),
        'qty_percent_params': sum(value[0].count('%') for value in params.values()),
    }

    return params_features

In [23]:
from urllib.parse import urlparse

# Function to check if an email is present in the URL
def email_in_url(url):
    return 1 if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', url) else 0

'''def url_shortened(url):
    return 1 if len(url) <= 25 else 0

def time_domain_activation(domain_info):
    if isinstance(domain_info.creation_date, list) and isinstance(domain_info.updated_date, list):
        # If creation_date and updated_date are lists, take the first element
        creation_date = domain_info.creation_date[0]
        updated_date = domain_info.updated_date[0]
    else:
        # Otherwise, use the original values
        creation_date = domain_info.creation_date
        updated_date = domain_info.updated_date

    try:
        return (creation_date - updated_date).days
    except TypeError:
        return None

# Function to get the time of domain expiration
def time_domain_expiration(domain):
    try:
        domain_info = whois.whois(domain)
        return (domain_info.expiration_date - domain_info.creation_date).days
    except whois.parser.PywhoisError:
        return None'''

'def url_shortened(url):\n    return 1 if len(url) <= 25 else 0\n\ndef time_domain_activation(domain_info):\n    if isinstance(domain_info.creation_date, list) and isinstance(domain_info.updated_date, list):\n        # If creation_date and updated_date are lists, take the first element\n        creation_date = domain_info.creation_date[0]\n        updated_date = domain_info.updated_date[0]\n    else:\n        # Otherwise, use the original values\n        creation_date = domain_info.creation_date\n        updated_date = domain_info.updated_date\n\n    try:\n        return (creation_date - updated_date).days\n    except TypeError:\n        return None\n\n# Function to get the time of domain expiration\ndef time_domain_expiration(domain):\n    try:\n        domain_info = whois.whois(domain)\n        return (domain_info.expiration_date - domain_info.creation_date).days\n    except whois.parser.PywhoisError:\n        return None'

In [24]:

from urllib.parse import urlparse, parse_qs

def extract_all_features(url):
    parsed_url = urlparse(url)
    # Extract URL-based features
    url_features = extract_additional_url_features(url)

    # Extract Domain-based features
    domain_features = extract_additional_domain_features(url)

    # Extract Page-based features
    path_features = extract_additional_path_features(url)

    # Extract Params-based features
    params_features = extract_additional_params_features(url)

    # Extract Additional Features
    additional_features = {
        'email_in_url': email_in_url(url),
        #'time_domain_activation': time_domain_activation(parsed_url.netloc),
        #'time_domain_expiration': time_domain_expiration(parsed_url.netloc),
        #'url_shortened': url_shortened(url),
    }

    # Combine all features
    all_features = {**url_features, **domain_features, **path_features, **params_features, **additional_features}

    return all_features

# Example usage:
url = "http://example.com/path/to/page?param1=value1&param2=value2"
all_extracted_features = extract_all_features(url)

# Display the extracted features
print(all_extracted_features)


{'qty_dot_url': 1, 'qty_hyphen_url': 0, 'qty_underline_url': 0, 'qty_slash_url': 5, 'qty_questionmark_url': 1, 'qty_equal_url': 2, 'qty_at_url': 0, 'qty_exclamation_url': 0, 'qty_space_url': 0, 'qty_tilde_url': 0, 'qty_comma_url': 0, 'qty_plus_url': 0, 'qty_asterisk_url': 0, 'qty_hashtag_url': 0, 'qty_dollar_url': 0, 'qty_percent_url': 0, 'qty_tld_url': 0, 'length_url': 59, 'qty_dot_domain': 1, 'qty_hyphen_domain': 0, 'qty_underline_domain': 0, 'qty_at_domain': 0, 'qty_vowels_domain': 4, 'domain_in_ip': 1, 'server_client_domain': 0, 'qty_dot_directory': 0, 'qty_hyphen_directory': 0, 'qty_underline_directory': 0, 'qty_questionmark_directory': 0, 'directory_length': 13, 'qty_hyphen_file': -1, 'file_length': 13, 'qty_dot_params': 0, 'qty_hyphen_params': 0, 'qty_underline_params': 0, 'qty_slash_params': 0, 'qty_questionmark_params': 0, 'qty_percent_params': 0, 'email_in_url': 0}


# **Testing**

# HyperOPT XGBClassifier

In [None]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
extracted_features = extract_all_features(url)

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a scaler object
scaled_data = scaler.transform(data)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(scaled_data)

# Use the trained XGBBoost for prediction
prediction = xgb_hyp.predict(pca_transformed_data)
print(prediction)

[1]




# HyperOPT Random Forest Classifier

In [None]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
extracted_features = extract_all_features(url)

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a scaler object
scaled_data = scaler.transform(data)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(scaled_data)

# Use the trained XGBBoost for prediction
prediction = rf_hyp.predict(pca_transformed_data)
print(prediction)

array([[ 3.89625317, -2.77028969, -1.01534515, -1.5399523 , -1.37525167,
         3.87685379, -1.00835236,  4.63073484,  5.3159082 , -7.51699881,
        -4.32969553,  5.15898124, -0.03700143,  0.6175854 , -2.4718587 ,
        -4.37237251, -1.51808813, -3.84874513, -3.22719764,  2.09611946]])

# TPOTClassifier Xgboost

In [32]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
extracted_features = extract_all_features(url)
# Load the fitted pipeline
loaded_pipeline = joblib.load('/content/tpot_xgbclassifier_pipeline.joblib')

# Now you can use loaded_pipeline for predictions

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a scaler object
scaled_data = scaler.transform(data)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(scaled_data)

# Use the trained XGBBoost for prediction
#prediction = loaded_pipeline.predict(pca_transformed_data)
prediction = tpot.predict(pca_transformed_data)

print(prediction)

[1]




# TPOTClassifier Random Forest

In [None]:
url = "http://app.validchk.com/visitqr.aspx?vid=1073653"
extracted_features = extract_all_features(url)

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a scaler object
scaled_data = scaler.transform(data)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(scaled_data)

# Use the trained XGBBoost for prediction
prediction = xgb_hyp.predict(pca_transformed_data)
print(prediction)