In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found under it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/url-phishing-extracted-features/22IT085_Pre-processed_Dataset.csv
/kaggle/input/url-phishing-extracted-features/221IT085_URLfeaturedataset.csv


In [2]:
# Load the pre-processed dataset CSV; `nrows` limits the read to the first
# 100,000 rows.
df = pd.read_csv(
    "/kaggle/input/url-phishing-extracted-features/22IT085_Pre-processed_Dataset.csv",
    nrows=100_000,
)

In [3]:
# Bare expression: the notebook renders the column index of the loaded frame.
df.columns

Index(['full_url_length', 'hostname_length', 'ip_address_in_url', 'dot_count',
       'hyphen_count', 'underscore_count', 'slash_count',
       'question_mark_count', 'equal_count', 'at_count',
       ...
       'dns_record_check', 'media_links_ratio', 'connection_errors_ratio',
       'mx_servers_count', 'spf_record', 'domain_in_title', 'web_traffic',
       'google_index', 'page_rank', 'Label'],
      dtype='object', length=116)

In [4]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import random
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif, RFE, f_classif, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, matthews_corrcoef, confusion_matrix
from imblearn.over_sampling import SMOTE

# ---------------------------
# Pre-filtering. Expects `df` to already hold the data, including the
# 'Label' column.
# ---------------------------
# Keep a column when it takes more than one distinct value; 'Label' is always
# kept, even if it happens to be constant.
non_constant_cols = [
    name for name in df.columns
    if name == 'Label' or df[name].nunique() > 1
]
df_filtered = df[non_constant_cols]

# Separate the target from the predictors.
y = df_filtered["Label"]
X = df_filtered.drop(columns="Label")

# Correlation pruning: only the strict upper triangle of the absolute
# correlation matrix is inspected, so each pair is looked at once. A column is
# dropped when its correlation with any earlier column exceeds 0.95.
corr_matrix = X.corr().abs()
strict_upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(strict_upper)
to_drop = [name for name in upper_tri.columns if (upper_tri[name] > 0.95).any()]
X_uncorrelated = X.drop(columns=to_drop)
print("Removed highly correlated features:", to_drop)

# ---------------------------
# PCA Dimensionality Reduction: keep the top N_PCA_COMPONENTS principal components.
# ---------------------------
# NOTE(review): PCA is fitted on the full dataset, i.e. before the train/test
# split performed further down in this cell, so held-out rows influence the
# projection. Consider fitting on the training rows only.
N_PCA_COMPONENTS = 10

# Component labels are derived from the constant so they cannot drift out of
# sync with n_components.
pca_labels = [f"PC{i}" for i in range(1, N_PCA_COMPONENTS + 1)]

# random_state pins the result whenever scikit-learn selects a randomized SVD
# solver, and matches the seed used by the other estimators in this notebook.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
principal_components = pca.fit_transform(X_uncorrelated)
print(f"\nTop {N_PCA_COMPONENTS} Principal Components (Explained Variance Ratio):")
for label, ratio in zip(pca_labels, pca.explained_variance_ratio_):
    print(f"{label}: {ratio:.4f}")

# Projected samples. Reusing X_uncorrelated's index keeps the rows aligned
# with `y` for any later label-based operation (e.g. boolean masking).
pca_df = pd.DataFrame(
    principal_components,
    index=X_uncorrelated.index,
    columns=pca_labels,
)

# Loadings: one row per component, one column per original feature.
pca_components = pd.DataFrame(
    pca.components_,
    columns=X_uncorrelated.columns,
    index=pca_labels,
)
print("\nPCA Component Loadings:")
print(pca_components)

# ---------------------------
# Feature Selection Techniques
# ---------------------------
# NOTE(review): every selector below is fitted on the full dataset, before the
# train/test split performed later in this cell, so held-out rows influence
# which features are chosen. Consider fitting on the training rows only.

# Number of features each technique keeps. Capped at the number of available
# columns so the selectors are never asked for more features than exist.
N_TOP_FEATURES = min(30, X_uncorrelated.shape[1])

# 1. Mutual Information
mi = mutual_info_classif(X_uncorrelated, y, random_state=42)
mi_series = pd.Series(mi, index=X_uncorrelated.columns)
mi_top = mi_series.sort_values(ascending=False).head(N_TOP_FEATURES)
print(f"\nTop {N_TOP_FEATURES} features by Mutual Information:")
print(mi_top)

# 2. Recursive Feature Elimination (RFE) with Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=lr, n_features_to_select=N_TOP_FEATURES)
rfe.fit(X_uncorrelated, y)
rfe_ranking = pd.Series(rfe.ranking_, index=X_uncorrelated.columns)
# Rank 1 marks the features RFE retained.
rfe_top_features = rfe_ranking[rfe_ranking == 1].index.tolist()
print("\nTop features by RFE:")
print(rfe_top_features)

# 3. SelectKBest with ANOVA F-test
skb = SelectKBest(score_func=f_classif, k=N_TOP_FEATURES)
skb.fit(X_uncorrelated, y)
skb_scores = pd.Series(skb.scores_, index=X_uncorrelated.columns)
skb_top = skb_scores.sort_values(ascending=False).head(N_TOP_FEATURES)
print(f"\nTop {N_TOP_FEATURES} features by ANOVA F-test:")
print(skb_top)

# 4. Extra Trees Classifier for Feature Importance
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
etc.fit(X_uncorrelated, y)
etc_importances = pd.Series(etc.feature_importances_, index=X_uncorrelated.columns)
etc_top = etc_importances.sort_values(ascending=False).head(N_TOP_FEATURES)
print(f"\nTop {N_TOP_FEATURES} features by ExtraTreesClassifier importance:")
print(etc_top)

# 5. Chi-Square Test (requires non-negative values; we scale features to [0,1])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_uncorrelated)
# Reuse X_uncorrelated's index so the scaled frame stays row-aligned with `y`
# for any later label-based operation.
X_chi2 = pd.DataFrame(
    X_scaled,
    index=X_uncorrelated.index,
    columns=X_uncorrelated.columns,
)
chi2_selector = SelectKBest(score_func=chi2, k=N_TOP_FEATURES)
chi2_selector.fit(X_chi2, y)
chi2_scores = pd.Series(chi2_selector.scores_, index=X_chi2.columns)
chi2_top = chi2_scores.sort_values(ascending=False).head(N_TOP_FEATURES)
chi2_top_features = chi2_top.index.tolist()
print(f"\nTop {N_TOP_FEATURES} features by Chi-Square Test:")
print(chi2_top)

# ---------------------------
# Define feature sets including PCA
# ---------------------------
# Maps a selection-method name to the list of column names it selected.
feature_sets = {
    'MutualInformation': mi_top.index.tolist(),
    'RFE': rfe_top_features,
    'ANOVAFtest': skb_top.index.tolist(),
    'ExtraTrees': etc_top.index.tolist(),
    'ChiSquare': chi2_top_features,
    'PCA': list(pca_df.columns)  # all principal-component columns
}

# For every feature set, one SVC and one One-Class SVM is trained per kernel.
# SVC runs additionally report ROC AUC.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
results_svc = {}
results_oneclass = {}

# Set to True to draw a ROC curve for every SVC model. Off by default, which
# also skips the roc_curve computation that only the plot needs.
PLOT_ROC = False

# Loop over each feature set (including PCA)
for method_name, features in feature_sets.items():
    print(f"\n--- Training models using features from {method_name} ---")

    # For ChiSquare, use the scaled version; for PCA, use the pca_df; for others, use uncorrelated features.
    if method_name == 'ChiSquare':
        X_method = X_chi2[features]
    elif method_name == 'PCA':
        X_method = pca_df  # already a DataFrame with PCA features
    else:
        X_method = X_uncorrelated[features]

    # Split data into training and testing sets (80/20 split). The fixed seed
    # yields the same row partition for every feature set.
    X_train, X_test, y_train, y_test = train_test_split(X_method, y, test_size=0.2, random_state=42)

    # For standard SVC models, balance the training data using SMOTE.
    # Only the training split is resampled; the test split is left untouched.
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    for kernel in kernels:
        print(f"\nSVC using {method_name} features with {kernel} kernel:")
        # probability=True enables predict_proba, needed for ROC AUC below.
        svc_model = SVC(kernel=kernel, probability=True, random_state=42)
        svc_model.fit(X_train_bal, y_train_bal)

        # Make predictions on the test set.
        y_pred = svc_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)

        # Accuracy as a percentage rounded to two decimals (no capping is applied).
        acc_percent = round(acc * 100, 2)

        # Probability of the second entry of svc_model.classes_
        # (presumably label 1 -- verify against the dataset's label encoding).
        y_proba = svc_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_proba)

        # Print metrics.
        print(f"Accuracy: {acc_percent}%")
        print(f"MCC: {round(mcc, 4)}")
        print(f"ROC AUC: {round(roc_auc, 4)}")

        # Compute and print the Confusion Matrix.
        cm = confusion_matrix(y_test, y_pred)
        print("Confusion Matrix:")
        print(cm)

        if PLOT_ROC:
            # Plot ROC curve and show it inline.
            fpr, tpr, _thresholds = roc_curve(y_test, y_proba, pos_label=1)
            fig, ax = plt.subplots()
            ax.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.4f})")
            ax.plot([0, 1], [0, 1], 'k--')
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel("False Positive Rate")
            ax.set_ylabel("True Positive Rate")
            ax.set_title(f"ROC Curve: SVC ({method_name}, {kernel})")
            ax.legend(loc="lower right")
            plt.show()
            # Many figures are created in this loop; release each one.
            plt.close(fig)

        # Save the SVC model.
        filename = f"svm_{method_name}_{kernel}.pkl"
        with open(filename, "wb") as file:
            pickle.dump(svc_model, file)
        print(f"Model saved as {filename}")

        results_svc[f"{method_name}_{kernel}"] = {
            "accuracy": acc_percent,
            "mcc": round(mcc, 4),
            "roc_auc": round(roc_auc, 4)
        }

    # ----- One-Class SVM Models -----
    # Train One-Class SVM only on positive samples (assume label 1 is normal).
    # The mask is applied positionally so it does not depend on X_train and
    # y_train sharing the same index labels.
    positive_mask = (y_train == 1).to_numpy()
    X_train_positive = X_train.loc[positive_mask]
    if X_train_positive.empty:
        print(f"Skipping One-Class SVM for {method_name} as no positive class samples available in training data.")
        continue

    # Adjust y_test to the One-Class SVM convention: label 1 -> 1, all others -> -1.
    y_test_adjusted = pd.Series(
        np.where(y_test == 1, 1, -1),
        index=y_test.index,
        name=y_test.name,
    )

    for kernel in kernels:
        print(f"\nOne-Class SVM using {method_name} features with {kernel} kernel:")
        one_class_model = OneClassSVM(kernel=kernel)
        one_class_model.fit(X_train_positive)

        # Predict using One-Class SVM (1 for inliers, -1 for outliers).
        y_pred_oc = one_class_model.predict(X_test)
        acc_oc = accuracy_score(y_test_adjusted, y_pred_oc)
        mcc_oc = matthews_corrcoef(y_test_adjusted, y_pred_oc)

        # Accuracy as a percentage rounded to two decimals (no capping is applied).
        acc_oc_percent = round(acc_oc * 100, 2)

        print(f"Accuracy: {acc_oc_percent}%")
        print(f"MCC: {round(mcc_oc, 4)}")

        # Compute and print the Confusion Matrix for One-Class SVM.
        cm_oc = confusion_matrix(y_test_adjusted, y_pred_oc)
        print("Confusion Matrix (One-Class SVM):")
        print(cm_oc)

        # Save the One-Class SVM model.
        filename_oc = f"oneclasssvm_{method_name}_{kernel}.pkl"
        with open(filename_oc, "wb") as file:
            pickle.dump(one_class_model, file)
        print(f"One-Class SVM model saved as {filename_oc}")

        results_oneclass[f"{method_name}_{kernel}"] = {
            "accuracy": acc_oc_percent,
            "mcc": round(mcc_oc, 4)
        }

# Final report for the SVC runs: one line per "<feature set>_<kernel>" entry.
print("\nSummary of SVC model metrics:")
for model_key, scores in results_svc.items():
    accuracy_pct, mcc_value, auc_value = scores['accuracy'], scores['mcc'], scores['roc_auc']
    print(f"{model_key}: Accuracy = {accuracy_pct}%, MCC = {mcc_value}, ROC AUC = {auc_value}")

# Same report for the One-Class SVM runs (no ROC AUC is stored for these).
print("\nSummary of One-Class SVM model metrics:")
for model_key, scores in results_oneclass.items():
    accuracy_pct, mcc_value = scores['accuracy'], scores['mcc']
    print(f"{model_key}: Accuracy = {accuracy_pct}%, MCC = {mcc_value}")


  return op(a, b)


Removed highly correlated features: ['char_repeat_hostname', 'shortest_word_hostname', 'longest_word_hostname', 'average_word_length_hostname', 'vowel_count_in_domain', 'domain_lookup_response_time', 'nameservers_count', 'dns_record_check']

Top 10 Principal Components (Explained Variance Ratio):
PC1: 0.1950
PC2: 0.0931
PC3: 0.0649
PC4: 0.0556
PC5: 0.0467
PC6: 0.0415
PC7: 0.0399
PC8: 0.0343
PC9: 0.0290
PC10: 0.0277

PCA Component Loadings:
      full_url_length  hostname_length  ip_address_in_url  dot_count  \
PC1          0.304053         0.279080           0.000850   0.220172   
PC2          0.024393        -0.455294          -0.002093  -0.188162   
PC3          0.014246        -0.128594           0.002766  -0.287591   
PC4          0.190216        -0.043703          -0.004767  -0.168709   
PC5         -0.023768         0.056556           0.002508  -0.027474   
PC6          0.013976        -0.019564          -0.000362  -0.000707   
PC7          0.071920        -0.122774          -0.0