<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Filter_Based_Approach/MISSING_VALUE_RATIO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ---------------------------------------------------------------------------
# Missing-value-ratio feature ranking + RandomForest baseline.
#
# Steps: load the CSV, rank columns by their fraction of missing values, keep
# the 40 highest-ranked feature columns, impute / encode them, fit a
# RandomForest and report multiclass metrics on a 20% hold-out split.
# ---------------------------------------------------------------------------

# Configuration: values that were previously repeated inline.
DATA_PATH = '/content/All.csv'  # Update with your file path
TARGET_COL = 'URL_Type_obf_Type'
SEED = 42

# Load the dataset
data = pd.read_csv(DATA_PATH)

# Calculate the missing value ratio (fraction of NaN rows) for each column
missing_ratio = data.isnull().mean()

# Rank columns from highest to lowest missing ratio, then remove the target
# from the ranking so it can never be picked as a "feature". Dropping it AFTER
# sorting keeps the ordering of the remaining columns exactly as before.
# A missing TARGET_COL raises KeyError here, as the later column lookup did.
# NOTE(review): many columns tie at 0.00% and sort_values' default sort is not
# stable, so which tied columns make the cut is not tied to column order.
# NOTE(review): a missing-value-ratio filter is usually applied to discard
# high-missing features; this keeps the highest ones - confirm that is intended.
ranked_features = missing_ratio.sort_values(ascending=False).drop(labels=TARGET_COL)

# Select the top 40 features with the highest missing value ratios
top_40_missing_features = ranked_features.head(40).index

# Display the top 40 features with their missing value ratios
print("Top 40 Features with Highest Missing Value Ratios:")
for feature in top_40_missing_features:
    print(f"{feature}: {missing_ratio[feature]:.2%}")

# Filter the dataset to include only the selected features and the target variable
selected_data = data[top_40_missing_features.tolist() + [TARGET_COL]]

# Separate features and target variable
X = selected_data.drop(columns=[TARGET_COL])
y = selected_data[TARGET_COL]

# Identify categorical and numerical columns.
# 'number' matches every numeric width (int32, float32, ...), not only
# int64/float64; columns matched by neither selector are dropped by the
# ColumnTransformer below (its default is remainder='drop').
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include='number').columns

# Preprocessing for numerical data: impute missing values with median
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data: impute missing values with most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # unseen test categories encode as all-zeros
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Encode the target variable as integer class ids (kept in label_encoder.classes_)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Create a preprocessing and modeling pipeline.
# with_mean=False: the one-hot step can produce a sparse matrix, and centering
# is not supported on sparse input; scaling by the standard deviation is.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('scaler', StandardScaler(with_mean=False)),
                        ('classifier', RandomForestClassifier(n_estimators=100, random_state=SEED))])

# Split data into training and test sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED)

# Fit the model (imputers, encoder and scaler are fitted on the training split only)
model.fit(X_train, y_train)

# Predict on test data; class probabilities are computed once and reused below
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)
# Probability of class index 1 only; not used by the metrics below, kept so the
# name stays defined for any other cell that may reference it.
y_prob = y_proba[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multiclass
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# One-vs-rest AUC needs the full (n_samples, n_classes) probability matrix
roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')

# Print evaluation metrics
print("\nModel Evaluation Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"ROC AUC:   {roc_auc:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Top 40 Features with Highest Missing Value Ratios:
NumberRate_Extension: 27.60%
Entropy_DirectoryName: 23.07%
avgpathtokenlen: 0.76%
Entropy_Filename: 0.64%
Entropy_Extension: 0.11%
NumberRate_FileName: 0.03%
NumberRate_DirectoryName: 0.03%
Entropy_Afterpath: 0.02%
NumberRate_AfterPath: 0.01%
Querylength: 0.00%
path_token_count: 0.00%
domain_token_count: 0.00%
avgdomaintokenlen: 0.00%
longdomaintokenlen: 0.00%
tld: 0.00%
charcompvowels: 0.00%
dld_domain: 0.00%
dld_url: 0.00%
ldl_getArg: 0.00%
ldl_filename: 0.00%
ldl_path: 0.00%
ldl_domain: 0.00%
ldl_url: 0.00%
charcompace: 0.00%
dld_path: 0.00%
dld_filename: 0.00%
dld_getArg: 0.00%
urlLen: 0.00%
domainlength: 0.00%
pathLength: 0.00%
subDirLen: 0.00%
fileNameLen: 0.00%
executable: 0.00%
isPortEighty: 0.00%
NumberofDotsinURL: 0.00%
ISIpAddressInDomainName: 0.00%
CharacterContinuityRate: 0.00%
LongestVariableValue: 0.00%
URL_DigitCount: 0.00%
host_DigitCount: 0.00%

Model Evaluation Metrics:
Accuracy:  0.9695
Precision: 0.9698
Recall:    