<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Filter_Based_Approach/INFORMATION_GAIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# ---------------------------------------------------------------------------
# Information-gain (mutual information) feature selection followed by a
# Random Forest evaluation.
#
# Every data-dependent preprocessing step (imputation, scaling, feature
# scoring) is fitted on the TRAINING rows only and then applied to all rows.
# Fitting them on the full dataset before splitting would let test-set
# statistics leak into the model and inflate the reported metrics.
# ---------------------------------------------------------------------------

# Tunable settings
TARGET_COLUMN = 'URL_Type_obf_Type'  # Replace with your target column
RANDOM_STATE = 42
TEST_SIZE = 0.3

# Load your data
data = pd.read_csv('/content/All.csv')  # Change to your file path

# Replace infinite values with NaN (re-assignment instead of inplace so that
# re-running this cell is idempotent)
data = data.replace([np.inf, -np.inf], np.nan)

# Encode the target variable
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(data[TARGET_COLUMN])

# Numeric feature frame, computed once. The target is dropped explicitly so a
# numerically-coded label column can never be scored as a feature.
numeric_data = data.select_dtypes(include=[np.number]).drop(
    columns=[TARGET_COLUMN], errors='ignore'
)

# Split row positions first. The partition depends only on the row count,
# test_size and random_state, so it matches what splitting the feature frame
# directly with the same settings would produce.
train_idx, test_idx = train_test_split(
    np.arange(len(data)), test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# SimpleImputer(strategy='mean') discards columns that are entirely NaN at fit
# time, which would desynchronise the array from the column names below.
# Drop such columns up front so names and data stay aligned.
has_train_values = numeric_data.iloc[train_idx].notna().any(axis=0)
numeric_data = numeric_data.loc[:, has_train_values]
feature_names = numeric_data.columns

# Impute missing values (means learned from training rows only)
imputer = SimpleImputer(strategy='mean')
imputer.fit(numeric_data.iloc[train_idx])
data_imputed = imputer.transform(numeric_data)

# Standardize the features (statistics learned from training rows only)
scaler = StandardScaler()
scaler.fit(data_imputed[train_idx])
data_scaled = scaler.transform(data_imputed)

# Calculate Information Gain (Mutual Information) on the training rows.
# The estimator is randomised, so a fixed seed keeps the ranking reproducible.
information_gain = mutual_info_classif(
    data_scaled[train_idx], target[train_idx], random_state=RANDOM_STATE
)

# Combine Information Gain scores with feature names
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Information_Gain': information_gain
})

# Sort features by Information Gain in descending order
important_features = feature_importance.sort_values(by='Information_Gain', ascending=False)

# Select the top 40 features based on Information Gain
num_features_to_select = 40
selected_features = important_features.head(num_features_to_select)

# Display the selected features along with their Information Gain scores
print("Selected Features with Information Gain Scores:")
for row in selected_features.itertuples(index=False):
    print(f"{row.Feature}: {row.Information_Gain:.6f}")

# Filter the dataset to keep only the selected features
data_selected = pd.DataFrame(data_scaled, columns=feature_names)[selected_features['Feature'].tolist()]

# Materialise the training and testing sets from the split made above
X_train = data_selected.iloc[train_idx]
X_test = data_selected.iloc[test_idx]
y_train = target[train_idx]
y_test = target[test_idx]

# Initialize and train the model
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
if y_probs.shape[1] == 2:
    # Binary target: roc_auc_score expects the positive-class scores as a 1-D array
    roc_auc = roc_auc_score(y_test, y_probs[:, 1])
else:
    roc_auc = roc_auc_score(y_test, y_probs, multi_class='ovr', average='macro')

# Display the evaluation metrics
print("\nModel Evaluation Metrics:")
print(f"Accuracy:    {accuracy:.4f}")
print(f"Precision:   {precision:.4f}")
print(f"Recall:      {recall:.4f}")
print(f"F1 Score:    {f1:.4f}")
print(f"ROC AUC:     {roc_auc:.4f}\n")

# Generate and display classification report. Passing the full label list
# keeps target_names aligned even if a class is absent from the test split;
# names are stringified because the report formats them as text.
report = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=[str(name) for name in label_encoder.classes_],
    zero_division=0,
)
print("Classification Report:\n")
print(report)


Selected Features with Information Gain Scores:
Entropy_Domain: 1.080421
pathurlRatio: 0.765498
ArgUrlRatio: 0.754376
argPathRatio: 0.752858
argDomanRatio: 0.721424
pathDomainRatio: 0.719932
domainUrlRatio: 0.716639
NumberRate_URL: 0.697755
Entropy_DirectoryName: 0.674671
CharacterContinuityRate: 0.654183
NumberRate_FileName: 0.646348
Entropy_Filename: 0.613215
NumberRate_Extension: 0.557117
avgpathtokenlen: 0.524524
Entropy_Extension: 0.514930
NumberRate_AfterPath: 0.500580
Entropy_URL: 0.497531
Entropy_Afterpath: 0.483519
avgdomaintokenlen: 0.451762
LongestPathTokenLength: 0.400869
LongestVariableValue: 0.382361
subDirLen: 0.363601
pathLength: 0.362949
urlLen: 0.362584
NumberofDotsinURL: 0.360327
ArgLen: 0.350518
domainlength: 0.344909
Querylength: 0.332099
Query_LetterCount: 0.330046
Extension_LetterCount: 0.324130
Arguments_LongestWordLength: 0.305986
host_letter_count: 0.305414
domain_token_count: 0.299580
SymbolCount_FileName: 0.297363
tld: 0.293990
SymbolCount_Domain: 0.291935
E