<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Wrapper_Based_Approach/EXHAUSTIVE_FEATURE_SELECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Read the dataset from disk (replace the path with your actual file location).
data = pd.read_csv('/content/All.csv')

# First imputation pass: fill NaNs in numeric columns with the column median.
column_medians = data.median(numeric_only=True)
data_cleaned = data.fillna(column_medians)

# Encode the target column ('URL_Type_obf_Type') as integer class codes.
le = LabelEncoder()
encoded_target = le.fit_transform(data_cleaned['URL_Type_obf_Type'])
data_cleaned['URL_Type_obf_Type'] = encoded_target

# Label vector and feature matrix (everything except the target column).
y = data_cleaned['URL_Type_obf_Type']
X = data_cleaned.drop(columns=['URL_Type_obf_Type'])

# Second imputation pass: turn +/-inf into NaN, then fill those gaps with the
# per-column median computed after the infinities were removed.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median(numeric_only=True))

# Fail fast if anything slipped through. The median fill only covers numeric
# columns (numeric_only=True), so NaNs elsewhere would still be present here.
if X.isna().to_numpy().any():
    raise ValueError("There are still NaN values in the dataset.")
if np.isinf(X.to_numpy()).any():
    raise ValueError("There are infinite values in the dataset.")

# Split data into training and testing sets BEFORE scaling.
# Fitting the scaler on the full dataset would let the test rows influence the
# mean/std used for preprocessing (data leakage) and make the reported test
# metrics optimistic. The split uses the same random_state and row count as
# before, so the train/test partition itself is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling the features to avoid issues with very large values.
# The scaler learns its statistics from the training rows only and is then
# applied, unchanged, to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept so that
# any later code that still reads X_scaled continues to work.
X_scaled = scaler.transform(X)

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000, solver='liblinear')

# Univariate feature selection: keep the features with the highest ANOVA
# F-scores (f_classif) against the target.
# SelectKBest raises if k exceeds the number of available columns, so the
# requested count is capped at the width of the training matrix.
N_FEATURES_TO_SELECT = 40
k_best = min(N_FEATURES_TO_SELECT, X_train.shape[1])

# NOTE(review): the recorded cell output contains a warning fragment
# (`f = msb / msw`); presumably some columns are constant in the training
# split and get an undefined F-score -- verify and consider dropping them.
selector = SelectKBest(score_func=f_classif, k=k_best)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Boolean mask over the original columns: True where a feature was kept
selected_feature_mask = selector.get_support()

# Map the mask back onto the DataFrame's column names
selected_features = X.columns[selected_feature_mask]

# Print the names of the selected features
print("Selected features:")
print(selected_features.tolist())

# Train the classifier on the reduced (selected) feature set.
model.fit(X_train_selected, y_train)

# Score the held-out rows: hard class labels plus per-class probabilities.
predictions = model.predict(X_test_selected)
y_proba = model.predict_proba(X_test_selected)

# Label-based metrics; precision and F1 use support-weighted averaging
# across classes because the target is treated as multiclass.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Probability-based metric: multiclass ROC AUC in One-vs-Rest mode.
roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

# Report every metric with four decimal places, one per line.
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"F1 Score: {f1:.4f}",
    f"ROC AUC: {roc_auc:.4f}",
]
print("\n".join(report_lines))



  f = msb / msw


Selected features:
['domain_token_count', 'path_token_count', 'avgdomaintokenlen', 'longdomaintokenlen', 'tld', 'charcompace', 'ldl_path', 'dld_url', 'dld_path', 'dld_getArg', 'domainlength', 'ArgLen', 'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'argPathRatio', 'NumberofDotsinURL', 'CharacterContinuityRate', 'URL_DigitCount', 'Extension_DigitCount', 'Query_DigitCount', 'host_letter_count', 'Arguments_LongestWordLength', 'URLQueries_variable', 'spcharUrl', 'delimeter_path', 'delimeter_Count', 'NumberRate_Domain', 'NumberRate_Extension', 'NumberRate_AfterPath', 'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname', 'SymbolCount_FileName', 'SymbolCount_Extension', 'SymbolCount_Afterpath', 'Entropy_Domain', 'Entropy_Extension', 'Entropy_Afterpath']
Accuracy: 0.7892
Precision: 0.7906
F1 Score: 0.7869
ROC AUC: 0.9504
