<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Model_Training/MODEL_TRAINING_BY_USING_AUTO_ML_CODE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pycaret




In [None]:
import pandas as pd
from pycaret.classification import *
import numpy as np
from sklearn.metrics import confusion_matrix, roc_auc_score

# Step 2: Read the dataset into a DataFrame
data = pd.read_csv('/content/All.csv')  # <-- Adjust path if needed

# Step 3: Names of the feature columns kept for modelling
selected_features = [
    'NumberofDotsinURL', 'avgdomaintokenlen', 'avgpathtokenlen', 'tld',
    'urlLen', 'domainlength', 'pathLength', 'subDirLen',
    'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio',
    'pathDomainRatio', 'argPathRatio', 'CharacterContinuityRate', 'LongestVariableValue',
    'URL_DigitCount', 'Extension_DigitCount', 'URL_Letter_Count', 'host_letter_count',
    'Directory_LetterCount', 'Extension_LetterCount', 'LongestPathTokenLength',
    'Domain_LongestWordLength', 'Arguments_LongestWordLength', 'spcharUrl',
    'delimeter_path', 'NumberRate_URL', 'NumberRate_FileName', 'SymbolCount_URL',
    'SymbolCount_Domain', 'SymbolCount_Directoryname', 'SymbolCount_FileName',
    'SymbolCount_Extension', 'Entropy_Domain', 'Entropy_DirectoryName',
    'domain_token_count',
]

# Keep only the chosen features plus the label column
# ('URL_Type_obf_Type' is assumed to be the target).
columns_to_keep = [*selected_features, 'URL_Type_obf_Type']
data = data[columns_to_keep]

# Step 4: Check and Clean the Data
# Work on an explicit copy: `data` was produced by column selection, and assigning
# into such a frame can trigger pandas' SettingWithCopyWarning.
data = data.copy()

# Treat +/-inf as missing so that a single NaN check covers both cases.
# `replace` returns a new frame, so nothing is mutated in place.
numeric_data = data.select_dtypes(include=[np.number]).replace([np.inf, -np.inf], np.nan)

if numeric_data.isna().any().any():
    print("Data contains NaN or infinite values. Cleaning data...")

    # Fill every gap with the median of its own column (NaN is skipped when
    # the median is computed).
    # NOTE(review): the medians are taken over the full dataset, i.e. before
    # any train/test split performed later - confirm this is acceptable.
    data[numeric_data.columns] = numeric_data.fillna(numeric_data.median())

# Step 5: Initialise the PyCaret experiment
automl_settings = {
    'target': 'URL_Type_obf_Type',  # must match the label column name in `data`
    'session_id': 123,
    'normalize': True,
    'use_gpu': False,
    'ignore_features': [],          # list feature names here to exclude them
}
clf = setup(data=data, **automl_settings)

# Step 6: Train and rank the candidate models, keeping the five with the
# highest Accuracy
best_models = compare_models(sort='Accuracy', n_select=5)

# Step 7: Capture the results table as a DataFrame
comparison_df = pull()

# Step 8: Compute Additional Metrics Manually
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0 where the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator, denominator, out=np.zeros_like(numerator), where=denominator != 0
    )


# Function to calculate TPR, TNR, FPR, FNR
def calculate_metrics(y_true, y_pred):
    """Compute classification metrics from hard label predictions.

    Works for binary and multiclass targets. Per-class TP/FP/FN/TN counts are
    derived one-vs-rest from the confusion matrix:

    * Two labels: the metrics of the second label in sorted order (``1`` for
      0/1 targets) are returned, i.e. that label is the positive class.
    * Any other number of labels: each per-class metric is averaged with
      weights proportional to the number of true samples of the class, so
      Recall/TPR equals Accuracy.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, using the same label values as ``y_true``.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1, roc_auc, tpr, tnr, fpr, fnr)``.
        Ratios with a zero denominator are reported as 0. ``roc_auc`` is
        computed from hard labels (not probabilities), so it is only a coarse
        estimate; in the averaged case it is ``nan`` when no class has both
        positive and negative true samples.

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. for empty input or, in the
        two-label case, when ``y_true`` contains a single class.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Fix the label order explicitly so row/column i of the matrix is labels[i].
    labels = np.unique(np.concatenate([y_true, y_pred]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # One-vs-rest counts for every class at once.
    total = cm.sum()
    support = cm.sum(axis=1)          # true samples per class
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = support - tp
    tn = total - tp - fp - fn

    # Calculating metrics (one value per class)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)
    tnr = _safe_ratio(tn, tn + fp)    # Specificity
    fpr = _safe_ratio(fp, fp + tn)    # False Positive Rate
    fnr = _safe_ratio(fn, fn + tp)    # False Negative Rate
    accuracy = float(tp.sum() / total)

    def _one_vs_rest_auc(label):
        # Integer indicators: scikit-learn cannot score boolean or string values.
        return roc_auc_score(
            (y_true == label).astype(int), (y_pred == label).astype(int)
        )

    per_class = (precision, recall, f1, tnr, fpr, fnr)

    if len(labels) == 2:
        # Binary: report the positive (second) class.
        positive = 1
        precision, recall, f1, tnr, fpr, fnr = (
            float(values[positive]) for values in per_class
        )
        roc_auc = float(_one_vs_rest_auc(labels[positive]))
    else:
        # Multiclass: support-weighted average of the per-class values.
        weights = support / total
        precision, recall, f1, tnr, fpr, fnr = (
            float(np.dot(values, weights)) for values in per_class
        )
        # AUC is only defined for classes with both positives and negatives.
        scorable = (support > 0) & (support < total)
        if scorable.any():
            class_auc = [_one_vs_rest_auc(label) for label in labels[scorable]]
            roc_auc = float(np.average(class_auc, weights=support[scorable]))
        else:
            roc_auc = float('nan')

    tpr = recall  # Same as Sensitivity

    return accuracy, precision, recall, f1, roc_auc, tpr, tnr, fpr, fnr

# Step 9: Store the Model Metrics
metrics_list = []
for model in best_models:
    print(f"\n🔵 Evaluating Model: {model}")

    # Without a `data` argument, predict_model scores the hold-out split and
    # returns it with the target column plus 'prediction_label' and
    # 'prediction_score' appended.
    model_predictions = predict_model(model)

    # Take the actual and predicted labels from the SAME frame so they are
    # row-aligned. (Comparing against the full dataset can never match the
    # hold-out size, and the first column of the frame is a feature, not the
    # prediction.)
    # NOTE(review): assumes both columns use the same label encoding - confirm
    # for the installed PyCaret version.
    y_true = model_predictions['URL_Type_obf_Type'].values
    y_pred = model_predictions['prediction_label'].values

    model_name = type(model).__name__  # short, readable identifier for the table

    try:
        metrics = calculate_metrics(y_true, y_pred)
    except (ValueError, IndexError) as error:
        # Report the failure for this model instead of aborting the whole loop.
        print(f"Could not compute metrics for {model_name}: {error}. Skipping this model.")
        continue

    metrics_list.append([model_name, *metrics])

# Step 10: Create a DataFrame for the Metrics
metrics_df = pd.DataFrame(metrics_list, columns=[
    'Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC', 'TPR', 'TNR', 'FPR', 'FNR'
])

# Step 11: Print the Comparison Table
print("\n🔵 Model Comparison Table 🔵\n")
print(metrics_df)

# Step 12: (Optional) Save Table to CSV File
# metrics_df.to_csv('model_comparison_results.csv', index=False)


Data contains NaN or infinite values. Cleaning data...


Unnamed: 0,Description,Value
0,Session id,123
1,Target,URL_Type_obf_Type
2,Target type,Multiclass
3,Target mapping,"Defacement: 0, benign: 1, malware: 2, phishing: 3, spam: 4"
4,Original data shape,"(36707, 38)"
5,Transformed data shape,"(36707, 38)"
6,Transformed train set shape,"(25694, 38)"
7,Transformed test set shape,"(11013, 38)"
8,Numeric features,37
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9788,0.999,0.9788,0.9789,0.9788,0.9735,0.9735,3.337
lightgbm,Light Gradient Boosting Machine,0.9786,0.999,0.9786,0.9787,0.9786,0.9732,0.9732,13.14
et,Extra Trees Classifier,0.9734,0.9986,0.9734,0.9737,0.9735,0.9667,0.9668,2.776
rf,Random Forest Classifier,0.9727,0.9986,0.9727,0.973,0.9728,0.9658,0.9659,5.233
dt,Decision Tree Classifier,0.9466,0.9666,0.9466,0.9465,0.9464,0.9332,0.9333,0.492
knn,K Neighbors Classifier,0.9419,0.9888,0.9419,0.9419,0.9415,0.9273,0.9275,1.049
gbc,Gradient Boosting Classifier,0.9352,0.0,0.9352,0.9357,0.9353,0.9189,0.919,45.329
lr,Logistic Regression,0.7568,0.0,0.7568,0.7546,0.7538,0.6952,0.6961,2.445
svm,SVM - Linear Kernel,0.7432,0.0,0.7432,0.746,0.7366,0.6778,0.6812,1.106
ridge,Ridge Classifier,0.7009,0.0,0.7009,0.6999,0.6924,0.6247,0.6279,0.166


Processing:   0%|          | 0/69 [00:00<?, ?it/s]


🔵 Evaluating Model: XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=-1,
              num_parallel_tree=None, objective='multi:softprob', ...)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9803,0.999,0.9803,0.9803,0.9803,0.9753,0.9753


Predicted Dataframe Columns: Index(['NumberofDotsinURL', 'avgdomaintokenlen', 'avgpathtokenlen', 'tld',
       'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'CharacterContinuityRate', 'LongestVariableValue',
       'URL_DigitCount', 'Extension_DigitCount', 'URL_Letter_Count',
       'host_letter_count', 'Directory_LetterCount', 'Extension_LetterCount',
       'LongestPathTokenLength', 'Domain_LongestWordLength',
       'Arguments_LongestWordLength', 'spcharUrl', 'delimeter_path',
       'NumberRate_URL', 'NumberRate_FileName', 'SymbolCount_URL',
       'SymbolCount_Domain', 'SymbolCount_Directoryname',
       'SymbolCount_FileName', 'SymbolCount_Extension', 'Entropy_Domain',
       'Entropy_DirectoryName', 'domain_token_count', 'URL_Type_obf_Type',
       'prediction_label', 'prediction_score'],
      dtype='object')
Length mismatch for XGBClassifier(base_score=None

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9798,0.999,0.9798,0.9798,0.9798,0.9747,0.9747


Predicted Dataframe Columns: Index(['NumberofDotsinURL', 'avgdomaintokenlen', 'avgpathtokenlen', 'tld',
       'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'CharacterContinuityRate', 'LongestVariableValue',
       'URL_DigitCount', 'Extension_DigitCount', 'URL_Letter_Count',
       'host_letter_count', 'Directory_LetterCount', 'Extension_LetterCount',
       'LongestPathTokenLength', 'Domain_LongestWordLength',
       'Arguments_LongestWordLength', 'spcharUrl', 'delimeter_path',
       'NumberRate_URL', 'NumberRate_FileName', 'SymbolCount_URL',
       'SymbolCount_Domain', 'SymbolCount_Directoryname',
       'SymbolCount_FileName', 'SymbolCount_Extension', 'Entropy_Domain',
       'Entropy_DirectoryName', 'domain_token_count', 'URL_Type_obf_Type',
       'prediction_label', 'prediction_score'],
      dtype='object')
Length mismatch for LGBMClassifier(boosting_type=

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9743,0.9986,0.9743,0.9746,0.9744,0.9678,0.9679


Predicted Dataframe Columns: Index(['NumberofDotsinURL', 'avgdomaintokenlen', 'avgpathtokenlen', 'tld',
       'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'CharacterContinuityRate', 'LongestVariableValue',
       'URL_DigitCount', 'Extension_DigitCount', 'URL_Letter_Count',
       'host_letter_count', 'Directory_LetterCount', 'Extension_LetterCount',
       'LongestPathTokenLength', 'Domain_LongestWordLength',
       'Arguments_LongestWordLength', 'spcharUrl', 'delimeter_path',
       'NumberRate_URL', 'NumberRate_FileName', 'SymbolCount_URL',
       'SymbolCount_Domain', 'SymbolCount_Directoryname',
       'SymbolCount_FileName', 'SymbolCount_Extension', 'Entropy_Domain',
       'Entropy_DirectoryName', 'domain_token_count', 'URL_Type_obf_Type',
       'prediction_label', 'prediction_score'],
      dtype='object')
Length mismatch for ExtraTreesClassifier(bootstra

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9729,0.9985,0.9729,0.9731,0.9729,0.966,0.966


Predicted Dataframe Columns: Index(['NumberofDotsinURL', 'avgdomaintokenlen', 'avgpathtokenlen', 'tld',
       'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'CharacterContinuityRate', 'LongestVariableValue',
       'URL_DigitCount', 'Extension_DigitCount', 'URL_Letter_Count',
       'host_letter_count', 'Directory_LetterCount', 'Extension_LetterCount',
       'LongestPathTokenLength', 'Domain_LongestWordLength',
       'Arguments_LongestWordLength', 'spcharUrl', 'delimeter_path',
       'NumberRate_URL', 'NumberRate_FileName', 'SymbolCount_URL',
       'SymbolCount_Domain', 'SymbolCount_Directoryname',
       'SymbolCount_FileName', 'SymbolCount_Extension', 'Entropy_Domain',
       'Entropy_DirectoryName', 'domain_token_count', 'URL_Type_obf_Type',
       'prediction_label', 'prediction_score'],
      dtype='object')
Length mismatch for RandomForestClassifier(bootst

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.9462,0.9663,0.9462,0.9459,0.946,0.9326,0.9326


Predicted Dataframe Columns: Index(['NumberofDotsinURL', 'avgdomaintokenlen', 'avgpathtokenlen', 'tld',
       'urlLen', 'domainlength', 'pathLength', 'subDirLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'CharacterContinuityRate', 'LongestVariableValue',
       'URL_DigitCount', 'Extension_DigitCount', 'URL_Letter_Count',
       'host_letter_count', 'Directory_LetterCount', 'Extension_LetterCount',
       'LongestPathTokenLength', 'Domain_LongestWordLength',
       'Arguments_LongestWordLength', 'spcharUrl', 'delimeter_path',
       'NumberRate_URL', 'NumberRate_FileName', 'SymbolCount_URL',
       'SymbolCount_Domain', 'SymbolCount_Directoryname',
       'SymbolCount_FileName', 'SymbolCount_Extension', 'Entropy_Domain',
       'Entropy_DirectoryName', 'domain_token_count', 'URL_Type_obf_Type',
       'prediction_label', 'prediction_score'],
      dtype='object')
Length mismatch for DecisionTreeClassifier(ccp_al