<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Model/Model_Training_FULLY_CLEANED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Model_Training/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pycaret


In [None]:
import pandas as pd
from pycaret.classification import *
import numpy as np
from sklearn.metrics import confusion_matrix, roc_auc_score

# Step 2: Load Your Dataset
data = pd.read_csv('/content/All.csv')  # <-- Adjust path if needed

# Step 3: Select Relevant Features
selected_features = [
    'NumberofDotsinURL', 'avgdomaintokenlen', 'avgpathtokenlen', 'tld', 'urlLen', 'domainlength',
    'pathLength', 'subDirLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio',
    'pathDomainRatio', 'argPathRatio', 'CharacterContinuityRate', 'LongestVariableValue',
    'URL_DigitCount', 'Extension_DigitCount', 'URL_Letter_Count', 'host_letter_count', 'Directory_LetterCount',
    'Extension_LetterCount', 'LongestPathTokenLength', 'Domain_LongestWordLength', 'Arguments_LongestWordLength',
    'spcharUrl', 'delimeter_path', 'NumberRate_URL', 'NumberRate_FileName', 'SymbolCount_URL',
    'SymbolCount_Domain', 'SymbolCount_Directoryname', 'SymbolCount_FileName', 'SymbolCount_Extension',
    'Entropy_Domain', 'Entropy_DirectoryName', 'domain_token_count'
]

# Filter the data to include only the selected features and the target.
# .copy() detaches the subset from the frame returned by read_csv, so the column
# assignment in the cleaning step below writes to an independent frame instead of
# triggering pandas' SettingWithCopyWarning.
data = data[selected_features + ['URL_Type_obf_Type']].copy()  # Assuming 'URL_Type_obf_Type' is your target

# Step 4: Check and Clean the Data
# Only feature columns are cleaned: if the target happened to be numeric,
# median-filling it would fabricate class labels.
numeric_columns = data[selected_features].select_dtypes(include=[np.number]).columns

# Treat +/-inf as missing so that a single median fill handles both cases.
numeric_data = data[numeric_columns].replace([np.inf, -np.inf], np.nan)

if numeric_data.isna().any().any():
    print("Data contains NaN or infinite values. Cleaning data...")

    # Replace every missing value with the median of its own column
    data[numeric_columns] = numeric_data.fillna(numeric_data.median())

# Step 5: Configure the PyCaret experiment
# All options are gathered in one mapping so they can be inspected or tweaked
# in a single place before the experiment is created.
setup_options = {
    'data': data,
    'target': 'URL_Type_obf_Type',  # must match the target column name in `data`
    'session_id': 123,
    'normalize': True,
    'use_gpu': False,
    'ignore_features': [],  # list irrelevant feature names here if needed
}
clf = setup(**setup_options)

# Step 6: Let PyCaret train and rank the candidate models,
# keeping the five best ones ordered by Accuracy
best_models = compare_models(n_select=5, sort='Accuracy')

# Step 7: Grab the results table produced by the comparison above
comparison_df = pull()

# Step 8: Compute Additional Metrics Manually
# Function to calculate TPR, TNR, FPR, FNR
def calculate_metrics(y_true, y_pred):
    """Compute binary-classification metrics from true and predicted labels.

    The confusion matrix is built by scikit-learn with the labels in sorted
    order, so the class that sorts last is treated as the positive class.

    Args:
        y_true: 1-D array-like of ground-truth class labels.
        y_pred: 1-D array-like of predicted class labels (hard labels, not
            probabilities or scores).

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc, tpr, tnr, fpr, fnr)``.
        Ratios whose denominator is zero are reported as 0. ``roc_auc`` is NaN
        when ``y_true`` contains only one class, because AUC is undefined then.

    Raises:
        ValueError: If the labels found in ``y_true`` and ``y_pred`` do not
            form exactly two classes. The TP/TN/FP/FN breakdown below is only
            meaningful for a 2x2 confusion matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        # A 1x1 matrix would fail on indexing and a larger one would silently
        # use only its top-left corner, so reject both up front.
        raise ValueError(
            "calculate_metrics supports binary classification only; "
            f"got a confusion matrix of shape {cm.shape}"
        )

    # Row-major order of a binary confusion matrix: [[tn, fp], [fn, tp]]
    tn, fp, fn, tp = cm.ravel()

    # Calculating metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    tpr = recall  # Same as Sensitivity
    tnr = tn / (tn + fp) if (tn + fp) != 0 else 0  # Specificity
    fpr = fp / (fp + tn) if (fp + tn) != 0 else 0    # False Positive Rate
    fnr = fn / (fn + tp) if (fn + tp) != 0 else 0    # False Negative Rate

    # Row sums of the matrix are the per-class counts in y_true; AUC needs both
    # classes to be present there.
    if (tp + fn) == 0 or (tn + fp) == 0:
        roc_auc = float('nan')
    else:
        # NOTE(review): hard labels are passed as the score argument, so this
        # AUC describes a single operating point rather than a ranking, and
        # the labels presumably need to be numeric -- confirm against callers.
        roc_auc = roc_auc_score(y_true, y_pred)

    return accuracy, precision, recall, f1, roc_auc, tpr, tnr, fpr, fnr

# Step 9: Store the Model Metrics
target_column = 'URL_Type_obf_Type'
metrics_list = []
for model in best_models:
    print(f"\n🔵 Evaluating Model: {model}")
    # Called without a `data` argument, predict_model scores PyCaret's hold-out
    # split, so the returned frame has fewer rows than the full `data` frame.
    model_predictions = predict_model(model)

    # Show the returned columns to make the prediction column easy to verify
    print("Predicted Dataframe Columns:", model_predictions.columns)

    # The first column of the returned frame is a feature, not the prediction.
    # PyCaret 3.x names the predicted-label column 'prediction_label';
    # PyCaret 2.x used 'Label'.
    predicted_column_name = next(
        (col for col in ('prediction_label', 'Label') if col in model_predictions.columns),
        None,
    )
    if predicted_column_name is None or target_column not in model_predictions.columns:
        print(f"Prediction or target column not found for {model}. Skipping this model.")
        continue

    # Take labels and predictions from the same frame so they are row-aligned
    y_true = model_predictions[target_column].values  # Actual labels
    y_pred = model_predictions[predicted_column_name].values  # Predicted labels

    try:
        # Get the metrics for this model
        metrics = calculate_metrics(y_true, y_pred)
    except (ValueError, IndexError) as exc:
        # Raised when the labels are not a usable binary problem; keep going
        # so one failing model does not abort the whole comparison.
        print(f"Could not compute metrics for {model}: {exc}. Skipping this model.")
        continue

    metrics_list.append([model, *metrics])

# Step 10: Turn the collected per-model rows into a table
# Column order mirrors the row layout: the model followed by its metrics.
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC', 'TPR', 'TNR', 'FPR', 'FNR']
metrics_df = pd.DataFrame(data=metrics_list, columns=metric_columns)

# Step 11: Display the table under a banner
print("\n🔵 Model Comparison Table 🔵\n")
print(metrics_df)

# Step 12: (Optional) Save Table to CSV File
# metrics_df.to_csv('model_comparison_results.csv', index=False)
