### **Installing Libraries and Importing Modules:**

In [3]:
import shap
import interpret

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2

import warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectPercentile

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn import svm

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

### **Function Declarations:**

In [4]:
def load_data(file_name):
  """Read the CSV file at `file_name` into a pandas DataFrame.

  The file is decoded as UTF-8 and parsed with pandas' Python engine.
  """
  return pd.read_csv(file_name, engine='python', encoding='utf-8')

def plot_label_distribution(y, title):
  """Draw a bar chart with the number of samples per class label.

  Parameters:
    y: pandas Series of class labels; 0 is drawn as 'benign' and 1 as
       'phishing'. Any other label is shown under its own value in grey.
    title: text used as the figure title.

  The figure is displayed with plt.show(); nothing is returned.
  """
  label_names = {0: 'benign', 1: 'phishing'}
  label_colors = {0: '#4477AA', 1: '#EE6677'}

  # value_counts() orders by frequency, so sort by label to keep every bar,
  # tick label and color attached to the class it actually counts.
  counts = y.value_counts().sort_index()
  tick_labels = [label_names.get(label, str(label)) for label in counts.index]
  bar_colors = [label_colors.get(label, '#BBBBBB') for label in counts.index]

  plt.figure(figsize=(8, 9))
  # Bars are placed at 0, 1, ... so the tick labels line up even if a class is absent.
  plt.bar(range(len(counts)), counts.values, tick_label=tick_labels, color=bar_colors)
  plt.xlabel('Label')
  plt.ylabel('Count')
  plt.title(title)
  plt.show()

def plot_corr_matrix(X):
  """Show an annotated heatmap of the pairwise correlations between the columns of X.

  Only the cells below the diagonal are drawn; the diagonal and the upper
  triangle are masked out.
  """
  correlations = X.corr()
  # True marks a cell to hide: the diagonal and everything above it.
  hidden_cells = np.triu(np.ones_like(correlations, dtype=bool))

  plt.figure(figsize=(50, 50))
  sns.heatmap(
    correlations,
    mask=hidden_cells,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    annot_kws={"size": 8},
  )
  plt.show()

def print_correlated_pairs(X):
  """Print every pair of distinct columns of X whose correlation exceeds 0.5 in magnitude.

  Parameters:
    X: pandas DataFrame whose columns are correlated with DataFrame.corr().

  The pairs are printed as a Series indexed by (column, column) and sorted
  by correlation value. Each pair appears in both orders, because the full
  correlation matrix is unstacked. Nothing is returned.
  """
  corr_pairs = X.corr().unstack()

  # Exclude a column paired with itself by comparing the names, not the value:
  # testing `!= 1` would also discard perfectly correlated distinct columns.
  first_name = corr_pairs.index.get_level_values(0)
  second_name = corr_pairs.index.get_level_values(1)
  is_distinct_pair = first_name != second_name

  selected_pairs = corr_pairs[(corr_pairs.abs() > 0.5) & is_distinct_pair]
  sorted_pairs = selected_pairs.sort_values(kind="quicksort")

  # Lift the row limit only for this print so the caller's display settings are untouched.
  with pd.option_context('display.max_rows', None):
    print(sorted_pairs)

def find_correlated_features(dataset, threshold):
  """Return the set of column names that correlate strongly with an earlier column.

  A column is included when the absolute value of its correlation with any
  column positioned before it in `dataset` is greater than `threshold`.
  """
  corr_matrix = dataset.corr()
  names = corr_matrix.columns
  # Each column is compared only with the columns before it (strict lower
  # triangle), so the first member of a correlated group is never reported.
  return {
    names[row]
    for row in range(len(names))
    if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row))
  }

def drop_correlated_features(X_train, X_test, corr_features):
  """Return copies of X_train and X_test without the columns named in corr_features.

  The input frames are left unmodified; the result is the pair
  (reduced train frame, reduced test frame).
  """
  reduced_train, reduced_test = (
    frame.drop(columns=corr_features) for frame in (X_train, X_test)
  )
  return reduced_train, reduced_test

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
  """Fit `model`, score its predictions on the test set, and report the results.

  The model is fitted on (X_train, y_train) and used to predict X_test.
  Accuracy, precision, recall and F1 are computed against y_test, printed
  under `model_name`, and returned as [accuracy, precision, recall, f1].
  """
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # Evaluate the four scorers in the same order in which they are returned.
  accuracy, precision, recall, f1 = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
  )

  print(f"{model_name}:")
  print(f"Accuracy: {accuracy}")
  print(f"Precision: {precision}")
  print(f"Recall: {recall}")
  print(f"F1 Score: {f1}\n")

  return [accuracy, precision, recall, f1]

def plot_metrics(models_metrics, metric_names, model_names, graph_title):
  """Draw a grouped bar chart comparing several metrics across several models.

  Parameters:
    models_metrics: one entry per model; each entry is a sequence of scores
                    in the 0-1 range, ordered like `metric_names`.
    metric_names: names of the metrics; one bar per metric in every group.
    model_names: names of the models; one group of bars per model.
    graph_title: title of the figure.

  Scores are plotted as percentages (0-100) with the value printed above
  each bar. The figure is displayed with plt.show(); nothing is returned.
  """
  n_models = len(model_names)
  n_metrics = len(metric_names)
  bar_width = 0.15
  # Small gap between neighbouring bars of the same group
  bar_step = bar_width + 0.02
  r = np.arange(n_models)

  # Colors for the metrics; reused cyclically when there are more metrics than colors
  colors = ['#4477AA', '#228833', '#CCBB44', '#EE6677']

  # Wide figure so that the groups do not crowd each other
  fig, ax = plt.subplots(figsize=(18, 6))

  for i in range(n_metrics):
    # Convert metric i of every model from the 0-1 range to 0-100
    metrics_percentage = [metrics[i] * 100 for metrics in models_metrics]
    rects = ax.bar(r + i * bar_step, metrics_percentage, width=bar_width,
                   label=metric_names[i], color=colors[i % len(colors)])

    # Print the value just above each bar
    for rect in rects:
      height = rect.get_height()
      ax.text(rect.get_x() + rect.get_width() / 2., 1.02 * height,
              '%.2f' % float(height),
              ha='center', va='bottom')

  ax.set_xlabel('Models')
  ax.set_ylabel('Scores (%)')  # bars are drawn on a 0-100 scale
  ax.set_title(graph_title)
  # Center each tick under its group of bars
  ax.set_xticks(r + (n_metrics / 2 - 0.5) * bar_step)
  ax.set_xticklabels(model_names)
  # Place the legend below the axes
  ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5)

  plt.show()

### **Load Datasets:**

**Loading Preprocessed & Presplit Data**

In [9]:
# Dataset-1 training split, read from the 'Oversampled' directory.
df1_train_over = load_data('../Dataset/Training/Dataset1/Oversampled/70_30_train.csv')

In [10]:
# Summary of the loaded frame: columns, dtypes and non-null counts.
df1_train_over.info()

In [12]:
# Dataset-1 training split, read from the 'Undersampled' directory.
df1_train_under = load_data('../Dataset/Training/Dataset1/Undersampled/70_30_train.csv')

In [13]:
# Summary of the loaded frame: columns, dtypes and non-null counts.
df1_train_under.info()

In [14]:
# Dataset-1 training split, read from the 'Over_Undersampled' directory.
df1_train_overunder = load_data('../Dataset/Training/Dataset1/Over_Undersampled/70_30_train.csv')

In [15]:
# Summary of the loaded frame: columns, dtypes and non-null counts.
df1_train_overunder.info()

In [16]:
# Dataset-1 testing split; a single test file is used with all three Dataset-1 training variants.
df1_test = load_data('../Dataset/Testing/Dataset1/70_30_test.csv')

In [17]:
# Summary of the loaded frame: columns, dtypes and non-null counts.
df1_test.info()

In [18]:
# Dataset-2 training split.
df2_train = load_data('../Dataset/Training/Dataset2/70_30_train.csv')

In [19]:
# Summary of the loaded frame: columns, dtypes and non-null counts.
df2_train.info()

In [21]:
# Dataset-2 testing split.
df2_test = load_data('../Dataset/Testing/Dataset2/70_30_test.csv')

In [22]:
# Summary of the loaded frame: columns, dtypes and non-null counts.
df2_test.info()

### **Feature and Label Definition:**

**Define Training Dataset-1 (SMOTE Oversampling)**

In [None]:
# Features: every column of the oversampled training frame except the label
X_train_1_over = df1_train_over.drop(columns=['phishing'])

# Label: the 'phishing' column
y_train_1_over = df1_train_over['phishing']

**Define Balanced Training Dataset-1 (SMOTE Oversampling)**

In [None]:
# Split Dataset 1's oversampled training frame by class (0 = benign, 1 = phishing)
df1_train_over_benign = df1_train_over.loc[df1_train_over['phishing'] == 0]
df1_train_over_phishing = df1_train_over.loc[df1_train_over['phishing'] == 1]

# Class sizes of Dataset 2's training frame, whose label column is 'status'
num_benign = int((df2_train['status'] == 0).sum())
num_phishing = int((df2_train['status'] == 1).sum())

# Draw from each Dataset 1 class as many rows as Dataset 2 has in that class
# (fixed seed, without replacement)
df1_train_over_benign_sampled = df1_train_over_benign.sample(n=num_benign, random_state=42)
df1_train_over_phishing_sampled = df1_train_over_phishing.sample(n=num_phishing, random_state=42)

# Stack the two samples: benign rows first, then phishing rows
df1_train_over_sampled = pd.concat(
  [df1_train_over_benign_sampled, df1_train_over_phishing_sampled]
)

# Show the class counts of the sampled frame
print(df1_train_over_sampled['phishing'].value_counts())

# Features and label of the sampled training set
X_train_1_over_sampled = df1_train_over_sampled.drop(columns=['phishing'])
y_train_1_over_sampled = df1_train_over_sampled['phishing']

**Define Training Dataset-1 (RandomUnderSampler Undersampling)**

In [None]:
# Features: every column of the undersampled training frame except the label
X_train_1_under = df1_train_under.drop(columns=['phishing'])

# Label: the 'phishing' column
y_train_1_under = df1_train_under['phishing']

**Define Training Dataset-1 (SMOTEENN Over-Undersampling)**

In [None]:
# Features: every column of the over-undersampled training frame except the label
X_train_1_overunder = df1_train_overunder.drop(columns=['phishing'])

# Label: the 'phishing' column
y_train_1_overunder = df1_train_overunder['phishing']

**Define Testing Dataset-1**

In [None]:
# # Define the target variable for the testing set
# y_test_1 = df1_test['phishing']

# # Drop the target variable for the testing set
# X_test_1 = df1_test.drop(labels=['phishing'], axis=1)

**Define Balanced Testing Dataset-1**

In [None]:
# Split the test frame by class label (0 = benign, 1 = phishing)
df1_test_benign = df1_test[df1_test.phishing==0]
df1_test_phishing = df1_test[df1_test.phishing==1]

# Size of the smaller class, whichever one that is
minority_class_count = min(len(df1_test_benign), len(df1_test_phishing))

# Down-sample the benign rows to the size of the smaller class
df1_test_majority_sampled = df1_test_benign.sample(n=minority_class_count, random_state=42)

# The phishing rows are kept as they are unless they outnumber the benign rows;
# without this guard the sample above would fail whenever benign is the smaller class
if len(df1_test_phishing) > minority_class_count:
  df1_test_phishing = df1_test_phishing.sample(n=minority_class_count, random_state=42)

# Combine the two equally sized classes: benign rows first, then phishing rows
df1_test_balanced = pd.concat([df1_test_majority_sampled, df1_test_phishing])

# Display new class counts
print(df1_test_balanced.phishing.value_counts())

# Features and label of the balanced test set
y_test_1 = df1_test_balanced['phishing']
X_test_1 = df1_test_balanced.drop(labels=['phishing'], axis=1)

**Define Training Dataset-2**

In [None]:
# Features: every column of Dataset 2's training frame except the label
X_train_2 = df2_train.drop(columns=['status'])

# Label: the 'status' column
y_train_2 = df2_train['status']

**Define Testing Dataset-2**

In [None]:
# Features: every column of Dataset 2's testing frame except the label
X_test_2 = df2_test.drop(columns=['status'])

# Label: the 'status' column
y_test_2 = df2_test['status']

### **Label Visualization:**

**Graph Benign/Phishing Distribution**

In [None]:
# Class distribution of Dataset-1: the oversampled training labels and the balanced test labels.
# plot_label_distribution shows the figure and returns nothing, so label_plot1/label_plot2 are None.
label_plot1 = plot_label_distribution(y_train_1_over, 'Dataset-1 (Oversampling): Distribution of Labels in the Training Set')
label_plot2 = plot_label_distribution(y_test_1, 'Dataset-1 (Oversampling): Distribution of Labels in the Testing Set')

In [None]:
# Class distribution of Dataset-2's training and testing labels.
# plot_label_distribution shows the figure and returns nothing, so label_plot3/label_plot4 are None.
label_plot3 = plot_label_distribution(y_train_2, 'Dataset-2: Distribution of Labels in the Training Set')
label_plot4 = plot_label_distribution(y_test_2, 'Dataset-2: Distribution of Labels in the Testing Set')

### **ML Model Testing:**

**Training and Evaluating**

In [None]:
# models = [
#   (LogisticRegression(random_state=42), "Logistic Regression"),
#   (DecisionTreeClassifier(random_state=42), "Decision Tree"),
#   (RandomForestClassifier(random_state=42), "Random Forest"),
#   (GaussianNB(), "Naive Bayes"),
#   (GradientBoostingClassifier(random_state=42), "Gradient Boosting Machine"),
#   (XGBClassifier(use_label_encoder=False, random_state=42), "XGBoost"),
#   (ExplainableBoostingClassifier(random_state=42), "Explainable Boosting Machine"),
#   (svm.SVC(), "SVM")
# ]

# metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# model_name_1 = []
# metrics_1 = []
# model_name_2 = []
# metrics_2 = []
# model_name_3 = []
# metrics_3 = []
# model_name_4 = []
# metrics_4 = []

# print(f"Dataset 1 (Oversampling):\n")
# for model, name in models:
#   metric = train_and_evaluate_model(model, X_train_1_over, y_train_1_over, X_test_1, y_test_1, name)
#   model_name_1.append(name)
#   metrics_1.append(metric)

# print(f"Dataset 1 (Undersampling):\n")
# for model, name in models:
#   metric = train_and_evaluate_model(model, X_train_1_under, y_train_1_under, X_test_1, y_test_1, name)
#   model_name_2.append(name)
#   metrics_2.append(metric)

# print(f"Dataset 1 (Over-Undersampling):\n")
# for model, name in models:
#   metric = train_and_evaluate_model(model, X_train_1_overunder, y_train_1_overunder, X_test_1, y_test_1, name)
#   model_name_3.append(name)
#   metrics_3.append(metric)

# print(f"Dataset 2:\n")
# for model, name in models:
#   metric = train_and_evaluate_model(model, X_train_2, y_train_2, X_test_2, y_test_2, name)
#   model_name_4.append(name)
#   metrics_4.append(metric)

**Comparison of Results**

In [None]:
# plot_metrics(metrics_1, metric_names, model_name_1, 'Dataset-1 (Oversampling): Comparison of Machine Learning Models')

In [None]:
# plot_metrics(metrics_2, metric_names, model_name_2, 'Dataset-1 (Undersampling): Comparison of Machine Learning Models')

In [None]:
# plot_metrics(metrics_3, metric_names, model_name_3, 'Dataset-1 (Over-Undersampling): Comparison of Machine Learning Models')

In [None]:
# plot_metrics(metrics_4, metric_names, model_name_4, 'Dataset-2: Comparison of Machine Learning Models')

As seen in the graphs, the combination of **Oversampling** and **XGBoost** achieves the highest Accuracy, Precision, Recall, and F1 Score. Thus, these will be the sampling method and model used going forward.

### **Define Full, Unique, and Common Features & Compare:**

In [None]:
# Iterating over a DataFrame yields its column labels, so this prints a
# numbered list of Dataset-1's feature names.
for i, feat in enumerate(X_train_1_over, 1):
  print(f"{i}. {feat}")

In [None]:
# Iterating over a DataFrame yields its column labels, so this prints a
# numbered list of Dataset-2's feature names.
for i, feat in enumerate(X_train_2, 1):
  print(f"{i}. {feat}")

In [None]:
# Column names of each dataset, in their original order
features_dataset_1 = list(X_test_1.columns)
features_dataset_2 = list(X_test_2.columns)

# Dataset 1 feature name -> name of the equivalent feature in Dataset 2
feature_pairs = {
  "qty_dot_url": "total_of.",
  "qty_hyphen_url": "total_of-",
  "qty_underline_url": "total_of_",
  "qty_slash_url": "total_of/",
  "qty_questionmark_url": "total_of?",
  "qty_equal_url": "total_of=",
  "qty_at_url": "total_of@",
  "qty_and_url": "total_of&",
  "qty_tilde_url": "total_of~",
  "qty_comma_url": "total_of,",
  "qty_asterisk_url": "total_of*",
  "qty_percent_url": "total_of%",
  "qty_dollar_url": "total_of$",
  "url_google_index": "google_index",
  "tld_present_params": "tld_in_path",
  "qty_redirects": "nb_redirection",
  "length_url": "url_length",
  "url_shortened": "shortening_service",
  "domain_length": "hostname_length",
  "domain_in_ip": "ip"
}

# (Dataset 1 name, Dataset 2 name) tuple for every mapped pair
similar_features = list(feature_pairs.items())

# Columns of each dataset that do not take part in the mapping
unique_features_1 = [feat for feat in features_dataset_1 if feat not in feature_pairs]
unique_features_2 = [feat for feat in features_dataset_2 if feat not in feature_pairs.values()]

# Mapped pairs present in both datasets, listed under Dataset 1's naming scheme
shared_features = [
    feature_1
    for feature_1, feature_2 in feature_pairs.items()
    if feature_1 in features_dataset_1 and feature_2 in features_dataset_2
]

# Full lists: the shared features first, then each dataset's own unique features
full_features_1 = shared_features + unique_features_1
full_features_2 = shared_features + unique_features_2  # unique names keep Dataset 2's naming scheme

# Numbered listing of the similar features
print("Similar features:")
for i, (feature_1, feature_2) in enumerate(similar_features, 1):
    print(f"{i}. {feature_1} (Dataset 1) - {feature_2} (Dataset 2)")

# Numbered listing of the unique features of each dataset
print("\nUnique features in Dataset 1:")
for i, feat in enumerate(unique_features_1, 1):
    print(f"{i}. {feat}")

print("\nUnique features in Dataset 2:")
for i, feat in enumerate(unique_features_2, 1):
    print(f"{i}. {feat}")

# Numbered listing of the full features of each dataset
print("\nFull features in Dataset 1:")
for i, feature in enumerate(full_features_1, 1):
    print(f"{i}. {feature}")

print("\nFull features in Dataset 2:")
for i, feature in enumerate(full_features_2, 1):
    print(f"{i}. {feature}")

# Raw lists of similar, unique and full features
print("\nSimilar features:", similar_features)
print("\nUnique features in Dataset 1:", unique_features_1)
print("Unique features in Dataset 2:", unique_features_2)
print("\nFull features in Dataset 1:", full_features_1)
print("Full features in Dataset 2:", full_features_2)

In [None]:
# Venn diagram of the two feature sets. Each set holds that dataset's unique feature
# names plus the (Dataset 1 name, Dataset 2 name) tuples from similar_features; the
# tuples are identical in both sets, so they form the overlap.
venn2([set(unique_features_1 + similar_features), set(unique_features_2 + similar_features)], set_labels = ('Dataset 1', 'Dataset 2'))
plt.show()

### **Setup Full, Unique, and Common Features:**

In [None]:
# Invert the mapping: Dataset 2 feature name -> Dataset 1 feature name
reverse_feature_pairs = {name_2: name_1 for name_1, name_2 in feature_pairs.items()}

# New frames holding Dataset 2's data with the shared features renamed to
# Dataset 1's naming scheme; X_train_2 and X_test_2 themselves are not modified
X_train_2_alt = X_train_2.rename(columns=reverse_feature_pairs)
X_test_2_alt = X_test_2.rename(columns=reverse_feature_pairs)

**Full Features**

In [None]:
# Dataset-1 train/test frames restricted to (and ordered by) full_features_1.
X_train_1_over_full = X_train_1_over[full_features_1]
X_test_1_full = X_test_1[full_features_1]

In [None]:
# Dataset-2 train/test frames restricted to full_features_2; the renamed copies are
# used because full_features_2 lists the shared features under Dataset 1's names.
X_train_2_full = X_train_2_alt[full_features_2]
X_test_2_full = X_test_2_alt[full_features_2]

**Unique and Common Features**

In [None]:
# Dataset-1 frames restricted to the features that have no counterpart in the mapping.
X_train_1_over_unique = X_train_1_over[unique_features_1]
X_test_1_unique = X_test_1[unique_features_1]

In [None]:
# Dataset-1 frames restricted to the mapped features; feature[0] is the Dataset 1 name of each pair.
X_train_1_over_common = X_train_1_over[[feature[0] for feature in similar_features]]
X_test_1_common = X_test_1[[feature[0] for feature in similar_features]]

In [None]:
# Dataset-2 frames restricted to the features that have no counterpart in the mapping.
X_train_2_unique = X_train_2[unique_features_2]
X_test_2_unique = X_test_2[unique_features_2]

In [None]:
# Dataset-2 frames restricted to the mapped features; feature[1] is the Dataset 2 name of each pair.
X_train_2_common = X_train_2[[feature[1] for feature in similar_features]]
X_test_2_common = X_test_2[[feature[1] for feature in similar_features]]

### **Merge Datasets:**

In [None]:
# Names of the mapped features under each dataset's own naming scheme:
# element 0 of every pair is the Dataset 1 name, element 1 the Dataset 2 name.
common_features_df1 = [feature[0] for feature in similar_features]
common_features_df2 = [feature[1] for feature in similar_features]

In [None]:
# Select only the common features from each dataframe.
# For Dataset 1 the training rows come from X_train_1_over_sampled (the sample whose
# class counts match Dataset 2), not from the full oversampled training set.
X_train_1_over_setup = X_train_1_over_sampled[common_features_df1]
X_test_1_setup = X_test_1[common_features_df1]

X_train_2_setup = X_train_2[common_features_df2]
X_test_2_setup = X_test_2[common_features_df2]

In [None]:
# Create a dictionary mapping df2 column names to df1 column names
column_mapping = {feature[1]: feature[0] for feature in similar_features}

# Rename Dataset 2's columns so both datasets share Dataset 1's column names
X_train_2_setup = X_train_2_setup.rename(columns=column_mapping)
X_test_2_setup = X_test_2_setup.rename(columns=column_mapping)

In [None]:
# Stack Dataset 1 on top of Dataset 2, for features and labels alike, so rows and
# labels stay aligned by position.
# NOTE(review): the original index labels are kept, so the merged frames may contain
# repeated index values — confirm nothing downstream aligns on the index.
X_train_12_over_merge = pd.concat([X_train_1_over_setup, X_train_2_setup])
y_train_12_over_merge = pd.concat([y_train_1_over_sampled, y_train_2])

X_test_12_merge = pd.concat([X_test_1_setup, X_test_2_setup])
y_test_12_merge = pd.concat([y_test_1, y_test_2])

In [None]:
# Class distribution of the merged training and testing labels.
# plot_label_distribution shows the figure and returns nothing, so label_plot5/label_plot6 are None.
label_plot5 = plot_label_distribution(y_train_12_over_merge, 'Dataset-1+2: Distribution of Labels in the Training Set')
label_plot6 = plot_label_distribution(y_test_12_merge, 'Dataset-1+2: Distribution of Labels in the Testing Set')

### **Feature Importance Setup:**

**Setup Variables**

In [None]:
# Bind the names used by the SHAP sections. X_train_1_over_full is rebound to itself;
# X_test_1_over_full, X_train_2_over_full and X_test_2_over_full are new names for
# existing frames (no data is copied).
X_train_1_over_full, X_test_1_over_full = X_train_1_over_full, X_test_1_full
X_train_2_over_full, X_test_2_over_full = X_train_2_full, X_test_2_full

In [None]:
# The training names are rebound to themselves; X_test_1_over_unique and
# X_test_1_over_common are new names for the existing Dataset-1 test frames.
X_train_1_over_unique, X_test_1_over_unique = X_train_1_over_unique, X_test_1_unique
X_train_1_over_common, X_test_1_over_common = X_train_1_over_common, X_test_1_common

In [None]:
# Self-assignment: every name is rebound to its current value, so this cell changes nothing.
X_train_2_unique, X_test_2_unique = X_train_2_unique, X_test_2_unique
X_train_2_common, X_test_2_common = X_train_2_common, X_test_2_common

In [None]:
# X_train_12_over_merge is rebound to itself; X_test_12_over_merge is a new name for X_test_12_merge.
X_train_12_over_merge, X_test_12_over_merge = X_train_12_over_merge, X_test_12_merge

**Extract Similar Features and Rename Columns**

In [None]:
# Independent copies of the common-feature frames, keeping only the mapped columns
# that each frame actually contains (Dataset 1 names for Dataset 1 frames,
# Dataset 2 names for Dataset 2 frames)
X_train_1_over_similar = X_train_1_over_common[
  [name_1 for name_1, _ in similar_features if name_1 in X_train_1_over_common.columns]
].copy()
X_test_1_over_similar = X_test_1_over_common[
  [name_1 for name_1, _ in similar_features if name_1 in X_test_1_over_common.columns]
].copy()

X_train_2_similar = X_train_2_common[
  [name_2 for _, name_2 in similar_features if name_2 in X_train_2_common.columns]
].copy()
X_test_2_similar = X_test_2_common[
  [name_2 for _, name_2 in similar_features if name_2 in X_test_2_common.columns]
].copy()

# Dataset 2 column name -> Dataset 1 column name, for the pairs present in both training frames
rename_dict = {
  name_2: name_1
  for name_1, name_2 in similar_features
  if name_2 in X_train_2_similar.columns and name_1 in X_train_1_over_similar.columns
}

# Give the Dataset 2 frames the same column names as the Dataset 1 frames
X_train_2_similar = X_train_2_similar.rename(columns=rename_dict)
X_test_2_similar = X_test_2_similar.rename(columns=rename_dict)

**Get Common Feature Columns from Datasets**

In [None]:
# Columns shared by the Dataset-1, Dataset-2 and merged training frames, kept in
# Dataset-1's column order. A set intersection would give an order that can change
# between kernel restarts, which would change the column order fed to the models.
similar_columns = [
  col for col in X_train_1_over_similar.columns
  if col in X_train_2_similar.columns and col in X_train_12_over_merge.columns
]

In [None]:
# Numbered listing of the columns shared by all three training frames.
for i, col in enumerate(similar_columns, 1):
  print(f"{i}. {col}")

**Select Common Columns for Equivalency**

In [None]:
# Restrict every train/test frame to similar_columns, in that order, so that a model
# fitted on any of the training frames can be applied to any of the test frames.
X_train_1_over_similar = X_train_1_over_similar[similar_columns]
X_test_1_over_similar = X_test_1_over_similar[similar_columns]

X_train_2_similar = X_train_2_similar[similar_columns]
X_test_2_similar = X_test_2_similar[similar_columns]

X_train_12_over_merge = X_train_12_over_merge[similar_columns]
X_test_12_over_merge = X_test_12_over_merge[similar_columns]

### **Feature Importance Using SHAP**

**1. Train on Dataset-1 (Oversampled) with Full Features, Test on Dataset-1 (Oversampled)**

In [None]:
# Dataset 1 (Oversampling): XGBoost classifier with a fixed seed, fitted on the
# full feature list of the oversampled training set.
# NOTE(review): use_label_encoder is reported as deprecated by recent XGBoost
# releases — confirm against the installed version.
model_1 = XGBClassifier(use_label_encoder=False, random_state=42)
model_1.fit(X_train_1_over_full, y_train_1_over)

In [None]:
# Predict the balanced Dataset-1 test set and print the per-class report.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_1 = model_1.predict(X_test_1_over_full)
print(classification_report(y_test_1, predicted_1, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_1, passing the training features as its second argument
explainer_1 = shap.Explainer(model_1, X_train_1_over_full)

# Calculate SHAP values for every row of the Dataset-1 test features
shap_values_1 = explainer_1(X_test_1_over_full)

In [None]:
# Train on Dataset-1 (Oversampled), Test on Dataset-1
# Beeswarm plot of the per-row SHAP values, limited to 30 displayed features.
shap.plots.beeswarm(shap_values_1, max_display=30)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_1.mean(0), max_display=30)

**2. Train on Dataset-2 with Full Features, Test on Dataset-2**

In [None]:
# Dataset 2: XGBoost classifier with a fixed seed, fitted on the full feature list
# of Dataset 2's training set (shared features carry Dataset 1's names).
model_2 = XGBClassifier(use_label_encoder=False, random_state=42)
model_2.fit(X_train_2_full, y_train_2)

In [None]:
# Predict the Dataset-2 test set and print the per-class report.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_2 = model_2.predict(X_test_2_full)
print(classification_report(y_test_2, predicted_2, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_2, passing the training features as its second argument
explainer_2 = shap.Explainer(model_2, X_train_2_full)

# Calculate SHAP values for every row of the Dataset-2 test features
shap_values_2 = explainer_2(X_test_2_full)

In [None]:
# Train on Dataset-2, Test on Dataset-2
# Beeswarm plot of the per-row SHAP values, limited to 30 displayed features.
shap.plots.beeswarm(shap_values_2, max_display=30)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_2.mean(0), max_display=30)

**3. Train on Dataset-1 (Oversampled) with Common Features, Test on Dataset-1 (Oversampled)**

In [None]:
# Dataset 1 (Oversampling): XGBoost classifier with a fixed seed, fitted on the
# shared (common) features only.
model_3 = XGBClassifier(use_label_encoder=False, random_state=42)
model_3.fit(X_train_1_over_similar, y_train_1_over)

In [None]:
# Predict the balanced Dataset-1 test set (shared features) and print the per-class report.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_3 = model_3.predict(X_test_1_over_similar)
print(classification_report(y_test_1, predicted_3, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_3, passing the training features as its second argument
explainer_3 = shap.Explainer(model_3, X_train_1_over_similar)

# Calculate SHAP values for every row of the Dataset-1 test features
shap_values_3 = explainer_3(X_test_1_over_similar)

In [None]:
# Train on Dataset-1 (Oversampled), Test on Dataset-1 (Oversampled)
# Beeswarm plot of the per-row SHAP values, limited to 20 displayed features.
shap.plots.beeswarm(shap_values_3, max_display=20)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_3.mean(0), max_display=20)

**4. Train on Dataset-2 with Common Features, Test on Dataset-2**

In [None]:
# Dataset 2: XGBoost classifier with a fixed seed, fitted on the shared (common)
# features only, under Dataset 1's column names.
model_4 = XGBClassifier(use_label_encoder=False, random_state=42)
model_4.fit(X_train_2_similar, y_train_2)

In [None]:
# Predict the Dataset-2 test set (shared features) and print the per-class report.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_4 = model_4.predict(X_test_2_similar)
print(classification_report(y_test_2, predicted_4, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_4, passing the training features as its second argument
explainer_4 = shap.Explainer(model_4, X_train_2_similar)

# Calculate SHAP values for every row of the Dataset-2 test features
shap_values_4 = explainer_4(X_test_2_similar)

In [None]:
# Train on Dataset-2, Test on Dataset-2
# Beeswarm plot of the per-row SHAP values, limited to 20 displayed features.
shap.plots.beeswarm(shap_values_4, max_display=20)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_4.mean(0), max_display=20)

**5. Train on Dataset-1 (Oversampled), Test on Dataset-2**

In [None]:
# Dataset 1 (Oversampling): fitted on the shared features with the same settings and
# training data as model_3; here the model is evaluated on Dataset 2 instead.
model_5 = XGBClassifier(use_label_encoder=False, random_state=42)
model_5.fit(X_train_1_over_similar, y_train_1_over)

In [None]:
# Cross-dataset evaluation: the Dataset-1 model predicts the Dataset-2 test set.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_5 = model_5.predict(X_test_2_similar)
print(classification_report(y_test_2, predicted_5, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_5, passing the Dataset-1 training features as its second argument
explainer_5 = shap.Explainer(model_5, X_train_1_over_similar)

# Calculate SHAP values for every row of the Dataset-2 test features
shap_values_5 = explainer_5(X_test_2_similar)

In [None]:
# Train on Dataset-1 (Oversampled), Test on Dataset-2
# Beeswarm plot of the per-row SHAP values, limited to 20 displayed features.
shap.plots.beeswarm(shap_values_5, max_display=20)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_5.mean(0), max_display=20)

**6. Train on Dataset-2, Test on Dataset-1**

In [None]:
# Dataset 2: fitted on the shared features with the same settings and training data
# as model_4; here the model is evaluated on Dataset 1 instead.
model_6 = XGBClassifier(use_label_encoder=False, random_state=42)
model_6.fit(X_train_2_similar, y_train_2)

In [None]:
# Cross-dataset evaluation: the Dataset-2 model predicts the balanced Dataset-1 test set.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_6 = model_6.predict(X_test_1_over_similar)
print(classification_report(y_test_1, predicted_6, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_6, passing the Dataset-2 training features as its second argument
explainer_6 = shap.Explainer(model_6, X_train_2_similar)

# Calculate SHAP values for every row of the Dataset-1 test features
shap_values_6 = explainer_6(X_test_1_over_similar)

In [None]:
# Train on Dataset-2, Test on Dataset-1
# Beeswarm plot of the per-row SHAP values, limited to 20 displayed features.
shap.plots.beeswarm(shap_values_6, max_display=20)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_6.mean(0), max_display=20)

**7. Train on Dataset-1+2 (Oversampled), Test on Dataset-1**

In [None]:
# Dataset 1+2: XGBoost classifier with a fixed seed, fitted on the merged training
# set (Dataset-1 sample stacked on Dataset 2, shared features only).
model_7 = XGBClassifier(use_label_encoder=False, random_state=42)
model_7.fit(X_train_12_over_merge, y_train_12_over_merge)

In [None]:
# The merged-data model predicts the balanced Dataset-1 test set.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_7 = model_7.predict(X_test_1_over_similar)
print(classification_report(y_test_1, predicted_7, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_7, passing the merged training features as its second argument
explainer_7 = shap.Explainer(model_7, X_train_12_over_merge)

# Calculate SHAP values for every row of the Dataset-1 test features
shap_values_7 = explainer_7(X_test_1_over_similar)

In [None]:
# Train on Dataset-1+2 (Oversampled), Test on Dataset-1
# Beeswarm plot of the per-row SHAP values, limited to 20 displayed features.
shap.plots.beeswarm(shap_values_7, max_display=20)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_7.mean(0), max_display=20)

**8. Train on Dataset-1+2 (Oversampled), Test on Dataset-2**

In [None]:
# Dataset 1+2: fitted on the merged training set with the same settings and training
# data as model_7; here the model is evaluated on Dataset 2.
model_8 = XGBClassifier(use_label_encoder=False, random_state=42)
model_8.fit(X_train_12_over_merge, y_train_12_over_merge)

In [None]:
# The merged-data model predicts the Dataset-2 test set.
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_8 = model_8.predict(X_test_2_similar)
print(classification_report(y_test_2, predicted_8, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_8, passing the merged training features as its second argument
explainer_8 = shap.Explainer(model_8, X_train_12_over_merge)

# Calculate SHAP values for every row of the Dataset-2 test features
shap_values_8 = explainer_8(X_test_2_similar)

In [None]:
# Train on Dataset-1+2 (Oversampled), Test on Dataset-2
# Beeswarm plot of the per-row SHAP values, limited to 20 displayed features.
shap.plots.beeswarm(shap_values_8, max_display=20)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_8.mean(0), max_display=20)

**9. Train on Dataset-1+2 (Oversampled), Test on Dataset-1+2**

In [None]:
# Dataset 1+2: fitted on the merged training set with the same settings and training
# data as model_7; here the model is evaluated on the merged test set.
model_9 = XGBClassifier(use_label_encoder=False, random_state=42)
model_9.fit(X_train_12_over_merge, y_train_12_over_merge)

In [None]:
# The merged-data model predicts the merged test set (Dataset-1 rows followed by Dataset-2 rows).
# The target_names assume label 0 = legitimate and label 1 = phishing.
predicted_9 = model_9.predict(X_test_12_over_merge)
print(classification_report(y_test_12_merge, predicted_9, target_names=['legitimate','phishing']))

In [None]:
# Initialize the SHAP explainer for model_9, passing the merged training features as its second argument
explainer_9 = shap.Explainer(model_9, X_train_12_over_merge)

# Calculate SHAP values for every row of the merged test features
shap_values_9 = explainer_9(X_test_12_over_merge)

In [None]:
# Train on Dataset-1+2 (Oversampled), Test on Dataset-1+2
# Beeswarm plot of the per-row SHAP values, limited to 20 displayed features.
shap.plots.beeswarm(shap_values_9, max_display=20)

In [None]:
# Bar plot of each feature's SHAP value averaged over the explained rows. This is the
# signed mean, so positive and negative contributions of a feature offset each other.
shap.plots.bar(shap_values_9.mean(0), max_display=20)