<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Filter_Based_Approach/CHI_SQUARE_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Load your data
data = pd.read_csv('/content/All.csv')  # Change to your file path

# Encode the target variable (URL_Type_obf_Type) as integer class ids.
# label_encoder is kept so the original class names can be recovered later.
label_encoder = LabelEncoder()
data['URL_Type_obf_Type'] = label_encoder.fit_transform(data['URL_Type_obf_Type'])

# Turn +/-inf into NaN *before* computing medians. Computing the median while
# infinities are still present skews it (inf sorts as the largest value) and
# previously caused original NaNs and inf cells in the same column to be
# filled with two different values.
data = data.replace([np.inf, -np.inf], np.nan)

# Impute every missing numeric value with its column median (single pass).
data = data.fillna(data.median(numeric_only=True))

# Cap very large values to avoid overflow issues (adjust threshold if needed)
threshold = 1e10
data = data.clip(upper=threshold)

# Separate features and target
X = data.drop(columns=['URL_Type_obf_Type'])
y = data['URL_Type_obf_Type']

# Hold out the test rows BEFORE fitting anything. The discretizer, the
# Chi-Square selector and the scaler all learn statistics from the data they
# are fitted on; fitting them on the full dataset leaks test-set information
# into training and makes the reported metrics optimistic.
# stratify=y keeps the class proportions identical in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Discretize continuous features for Chi-Square test.
# chi2 needs non-negative inputs; ordinal bin ids (0..n_bins-1) satisfy that.
# Bin edges are learned from the training rows only.
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
X_train_discretized = discretizer.fit_transform(X_train_raw)

# Apply Chi-Square test to select the top 40 features, ranked on training
# rows only. k is bounded by the number of available features because
# SelectKBest rejects (or, in newer scikit-learn, warns about) k > n_features.
chi2_selector = SelectKBest(chi2, k=min(40, X.shape[1]))
chi2_selector.fit(X_train_discretized, y_train)

# Original variable names retained as full-dataset transforms, produced by the
# train-fitted transformers, so any later code that reads them still works.
X_discretized = discretizer.transform(X)
X_kbest = chi2_selector.transform(X_discretized)

# Get feature scores and selected feature names
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]
chi2_scores = chi2_selector.scores_[selected_feature_indices]

# Create a DataFrame for selected features and their Chi-Square scores
chi2_results = pd.DataFrame({
    'Feature': selected_features,
    'Chi_Square_Score': chi2_scores
}).sort_values(by='Chi_Square_Score', ascending=False)

# Display the selected features along with their Chi-Square scores
print("Selected Features with Chi-Square Scores:")
for i, row in enumerate(chi2_results.itertuples(), 1):
    print(f"{i}. {row.Feature}: {row.Chi_Square_Score:.6f}")

# Filter the dataset to keep only the selected features
# (the model is trained on the original values, not on the bin ids)
X_selected = X[selected_features]

# Standardize the features: mean/std are estimated from the training rows and
# then re-used unchanged for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw[selected_features])
X_test = scaler.transform(X_test_raw[selected_features])
# Full scaled matrix under its original name, using the train-fitted scaler.
X_scaled = scaler.transform(X_selected)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set: hard labels plus per-class probabilities
# (the probabilities are needed for ROC AUC).
y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)

# Score the held-out predictions. Precision, recall and F1 are macro-averaged
# (unweighted mean over classes); zero_division=0 scores a class with no
# predicted/true samples as 0 instead of emitting a warning.
macro_opts = dict(average='macro', zero_division=0)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred, **macro_opts)
recall = recall_score(y_true=y_test, y_pred=y_pred, **macro_opts)
f1 = f1_score(y_true=y_test, y_pred=y_pred, **macro_opts)
# ROC AUC is computed one-vs-rest from the class probabilities, then
# macro-averaged across classes.
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_probs,
    multi_class='ovr',
    average='macro',
)

# Emit the metric summary in a single write, one metric per line.
print("\n".join([
    "\nModel Evaluation Metrics:",
    f"Accuracy:    {accuracy:.4f}",
    f"Precision:   {precision:.4f}",
    f"Recall:      {recall:.4f}",
    f"F1 Score:    {f1:.4f}",
    f"ROC AUC:     {roc_auc:.4f}\n",
]))

# Per-class breakdown, labelled with the original (pre-encoding) class names.
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
    zero_division=0,
)
print("Classification Report:\n", report, sep="\n")




Selected Features with Chi-Square Scores:
1. Entropy_Afterpath: 46636.348497
2. ArgUrlRatio: 32560.235238
3. NumberRate_AfterPath: 30911.488374
4. argPathRatio: 17623.597596
5. Extension_DigitCount: 14657.170524
6. NumberRate_Domain: 12832.426467
7. URLQueries_variable: 12516.599730
8. SymbolCount_Extension: 12484.431530
9. delimeter_Count: 12140.969209
10. SymbolCount_FileName: 12048.976722
11. Query_DigitCount: 11321.997078
12. LongestVariableValue: 10674.628574
13. Query_LetterCount: 10371.595582
14. Querylength: 10337.045398
15. ArgLen: 10134.795668
16. Extension_LetterCount: 10056.335110
17. URL_DigitCount: 9857.789986
18. argDomanRatio: 9761.104868
19. ldl_getArg: 9646.693640
20. URL_Letter_Count: 9562.160085
21. LongestPathTokenLength: 9491.649772
22. ldl_path: 9470.442753
23. domainUrlRatio: 9423.584553
24. ldl_url: 9405.311557
25. urlLen: 9065.048105
26. dld_getArg: 8936.258383
27. subDirLen: 8902.172790
28. pathLength: 8902.172790
29. SymbolCount_URL: 8495.513886
30. SymbolCo