<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Filter_Based_Approach/FISHER_SCORE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Input location and name of the label column.
DATA_PATH = '/content/All.csv'  # Change to your file path
TARGET_COLUMN = 'URL_Type_obf_Type'

# Read the whole CSV into a DataFrame.
data = pd.read_csv(DATA_PATH)

# Fit the encoder on the label column, then overwrite that column with the
# resulting integer codes. The fitted encoder stays available as
# `label_encoder`.
label_encoder = LabelEncoder().fit(data[TARGET_COLUMN])
data[TARGET_COLUMN] = label_encoder.transform(data[TARGET_COLUMN])

# Clean the numeric feature columns in three steps:
#   1. turn +/-inf into NaN first, so infinite entries cannot influence the
#      medians used for imputation,
#   2. fill every NaN with the median of that column's remaining values,
#   3. cap magnitudes at +/-threshold (both tails, not only the upper one)
#      to avoid overflow issues downstream.
# The encoded target column and any non-numeric column are left untouched.
threshold = 1e10  # adjust if needed
feature_cols = (
    data.select_dtypes(include=[np.number])
    .columns.drop('URL_Type_obf_Type', errors='ignore')
)
cleaned_features = data[feature_cols].replace([np.inf, -np.inf], np.nan)
cleaned_features = cleaned_features.fillna(cleaned_features.median())
data[feature_cols] = cleaned_features.clip(lower=-threshold, upper=threshold)

# Separate features and target
X = data.drop(columns=['URL_Type_obf_Type'])
y = data['URL_Type_obf_Type']

# Split BEFORE ranking features. Scoring features on the full dataset lets
# the held-out rows influence which features are chosen (data leakage) and
# makes the test metrics optimistic. Same test_size / random_state as before,
# so the train/test rows themselves are unchanged.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply Fisher Score (ANOVA F-value) for feature ranking, using only the
# training partition.
fisher_scores, _ = f_classif(X_train_all, y_train)

# Create a DataFrame for features and their Fisher Scores (highest first;
# sort_values places NaN scores at the end).
fisher_results = pd.DataFrame({
    'Feature': X.columns,
    'Fisher_Score': fisher_scores
}).sort_values(by='Fisher_Score', ascending=False)

# Select only the top 40 features. A feature with no variance yields a NaN
# score (the "f = msb / msw" RuntimeWarning); drop those so they can never be
# selected, even if fewer than 40 features have a valid score.
top_40_features = fisher_results.dropna(subset=['Fisher_Score']).head(40)
selected_features = top_40_features['Feature'].values

# Restrict both partitions to the selected features (columns in rank order).
X_train = X_train_all[selected_features]
X_test = X_test_all[selected_features]

# Train a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the held-out set: per-class probabilities and hard class labels.
y_pred_prob = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

# Accuracy takes no averaging option; precision, recall and F1 are all
# computed with the same weighted averaging across classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# One-vs-rest ROC AUC, computed from the class probabilities.
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')

# Text summary of per-class precision / recall / F1.
classification_rep = classification_report(y_test, y_pred)

# List the chosen features, numbered from 1, with their Fisher Scores.
print("Top 40 Features with Fisher Scores:")
ranked_pairs = zip(top_40_features['Feature'], top_40_features['Fisher_Score'])
for rank, (feature_name, score) in enumerate(ranked_pairs, start=1):
    print(f"{rank}. {feature_name}: {score:.6f}")

# Headline metrics, each shown to four decimal places.
print("\nModel Evaluation Metrics:")
metric_summary = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
)
for metric_label, metric_value in metric_summary:
    print(f"{metric_label}: {metric_value:.4f}")

# Per-class breakdown.
print("\nClassification Report:\n", classification_rep)


  f = msb / msw


Top 40 Features with Fisher Scores:
1. SymbolCount_Domain: 3528.923323
2. domain_token_count: 3505.067755
3. tld: 3505.067755
4. Entropy_Afterpath: 3473.507167
5. NumberRate_AfterPath: 3424.144460
6. ArgUrlRatio: 3114.109486
7. domainUrlRatio: 2788.370617
8. URLQueries_variable: 2653.092646
9. SymbolCount_FileName: 2647.994462
10. argPathRatio: 2551.406432
11. delimeter_path: 2525.575957
12. delimeter_Count: 2524.101710
13. pathurlRatio: 2490.365833
14. SymbolCount_Extension: 2336.347730
15. SymbolCount_URL: 2301.843351
16. NumberofDotsinURL: 2211.422943
17. Arguments_LongestWordLength: 2152.619875
18. SymbolCount_Afterpath: 2059.704300
19. CharacterContinuityRate: 1973.611670
20. domainlength: 1951.540466
21. host_letter_count: 1891.533403
22. Extension_DigitCount: 1874.519721
23. spcharUrl: 1549.892671
24. SymbolCount_Directoryname: 1486.539534
25. Entropy_Extension: 1445.690235
26. avgdomaintokenlen: 1370.316894
27. Query_DigitCount: 1363.660739
28. URL_DigitCount: 1360.412324
29. E