<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Filter_Based_Approaches/FILTER_BASED_TECHNIQUES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**INFORMATION GAIN**

In [None]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Load your data
data = pd.read_csv('/content/All.csv')  # Change to your file path

# Replace infinite values with NaN
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data.select_dtypes(include=[np.number]))

# Encode the target variable
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(data['URL_Type_obf_Type'])  # Replace with your target column

# Standardize the features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_imputed)

# Calculate Information Gain (Mutual Information)
information_gain = mutual_info_classif(data_scaled, target)

# Combine Information Gain scores with feature names
feature_importance = pd.DataFrame({
    'Feature': data.select_dtypes(include=[np.number]).columns,
    'Information_Gain': information_gain
})

# Sort features by Information Gain in descending order
important_features = feature_importance.sort_values(by='Information_Gain', ascending=False)

# Select the top 40 features based on Information Gain
num_features_to_select = 40
selected_features = important_features.head(num_features_to_select)

# Display the selected features along with their Information Gain scores
print("Selected Features with Information Gain Scores:")
for i, row in selected_features.iterrows():
    print(f"{row['Feature']}: {row['Information_Gain']:.6f}")

# Filter the dataset to keep only the selected features
data_selected = pd.DataFrame(data_scaled, columns=data.select_dtypes(include=[np.number]).columns)[selected_features['Feature'].values]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_selected, target, test_size=0.3, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
roc_auc = roc_auc_score(y_test, y_probs, multi_class='ovr', average='macro')

# Display the evaluation metrics
print("\nModel Evaluation Metrics:")
print(f"Accuracy:    {accuracy:.4f}")
print(f"Precision:   {precision:.4f}")
print(f"Recall:      {recall:.4f}")
print(f"F1 Score:    {f1:.4f}")
print(f"ROC AUC:     {roc_auc:.4f}\n")

# Generate and display classification report
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0)
print("Classification Report:\n")
print(report)


Selected Features with Information Gain Scores:
Entropy_Domain: 1.081467
pathurlRatio: 0.762876
argPathRatio: 0.755385
ArgUrlRatio: 0.751371
argDomanRatio: 0.725091
domainUrlRatio: 0.714127
pathDomainRatio: 0.713744
NumberRate_URL: 0.696469
Entropy_DirectoryName: 0.677404
CharacterContinuityRate: 0.655622
NumberRate_FileName: 0.648979
Entropy_Filename: 0.609906
NumberRate_Extension: 0.553940
avgpathtokenlen: 0.527027
Entropy_Extension: 0.511696
NumberRate_AfterPath: 0.502455
Entropy_URL: 0.497878
Entropy_Afterpath: 0.483615
avgdomaintokenlen: 0.448818
LongestPathTokenLength: 0.400368
LongestVariableValue: 0.376572
subDirLen: 0.367960
pathLength: 0.366659
urlLen: 0.364891
NumberofDotsinURL: 0.360477
ArgLen: 0.351163
domainlength: 0.348375
Querylength: 0.331870
Query_LetterCount: 0.327048
Extension_LetterCount: 0.323970
Arguments_LongestWordLength: 0.306057
host_letter_count: 0.305042
domain_token_count: 0.296562
SymbolCount_FileName: 0.295928
SymbolCount_Domain: 0.294521
tld: 0.294497
E

**CHI SQUARE**

In [None]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Load your data
data = pd.read_csv('/content/All.csv')  # Change to your file path

# Encode the target variable (URL_Type_obf_Type)
label_encoder = LabelEncoder()
data['URL_Type_obf_Type'] = label_encoder.fit_transform(data['URL_Type_obf_Type'])

# Handling missing values by filling them with median
data.fillna(data.median(numeric_only=True), inplace=True)

# Replace infinite values with NaN and then fill NaN with the median
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(data.median(numeric_only=True), inplace=True)

# Cap very large values to avoid overflow issues (adjust threshold if needed)
threshold = 1e10
data = data.clip(upper=threshold)

# Separate features and target
X = data.drop(columns=['URL_Type_obf_Type'])
y = data['URL_Type_obf_Type']

# Discretize continuous features for Chi-Square test
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
X_discretized = discretizer.fit_transform(X)

# Apply Chi-Square test to select top 40 features
chi2_selector = SelectKBest(chi2, k=40)
X_kbest = chi2_selector.fit_transform(X_discretized, y)

# Get feature scores and selected feature names
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]
chi2_scores = chi2_selector.scores_[selected_feature_indices]

# Create a DataFrame for selected features and their Chi-Square scores
chi2_results = pd.DataFrame({
    'Feature': selected_features,
    'Chi_Square_Score': chi2_scores
}).sort_values(by='Chi_Square_Score', ascending=False)

# Display the selected features along with their Chi-Square scores
print("Selected Features with Chi-Square Scores:")
for i, row in enumerate(chi2_results.itertuples(), 1):
    print(f"{i}. {row.Feature}: {row.Chi_Square_Score:.6f}")

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
roc_auc = roc_auc_score(y_test, y_probs, multi_class='ovr', average='macro')

# Display the evaluation metrics
print("\nModel Evaluation Metrics:")
print(f"Accuracy:    {accuracy:.4f}")
print(f"Precision:   {precision:.4f}")
print(f"Recall:      {recall:.4f}")
print(f"F1 Score:    {f1:.4f}")
print(f"ROC AUC:     {roc_auc:.4f}\n")

# Generate and display classification report
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0)
print("Classification Report:\n")
print(report)




Selected Features with Chi-Square Scores:
1. Entropy_Afterpath: 46636.348497
2. ArgUrlRatio: 32560.235238
3. NumberRate_AfterPath: 30911.488374
4. argPathRatio: 17623.597596
5. Extension_DigitCount: 14657.170524
6. NumberRate_Domain: 12832.426467
7. URLQueries_variable: 12516.599730
8. SymbolCount_Extension: 12484.431530
9. delimeter_Count: 12140.969209
10. SymbolCount_FileName: 12048.976722
11. Query_DigitCount: 11321.997078
12. LongestVariableValue: 10674.628574
13. Query_LetterCount: 10371.595582
14. Querylength: 10337.045398
15. ArgLen: 10134.795668
16. Extension_LetterCount: 10056.335110
17. URL_DigitCount: 9857.789986
18. argDomanRatio: 9761.104868
19. ldl_getArg: 9646.693640
20. URL_Letter_Count: 9562.160085
21. LongestPathTokenLength: 9491.649772
22. ldl_path: 9470.442753
23. domainUrlRatio: 9423.584553
24. ldl_url: 9405.311557
25. urlLen: 9065.048105
26. dld_getArg: 8936.258383
27. subDirLen: 8902.172790
28. pathLength: 8902.172790
29. SymbolCount_URL: 8495.513886
30. SymbolCo

**FISHER SCORE**

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Load your data
data = pd.read_csv('/content/All.csv')  # Change to your file path

# Encode the target variable (URL_Type_obf_Type)
label_encoder = LabelEncoder()
data['URL_Type_obf_Type'] = label_encoder.fit_transform(data['URL_Type_obf_Type'])

# Handling missing values by filling them with median
data.fillna(data.median(numeric_only=True), inplace=True)

# Replace infinite values with NaN and then fill NaN with the median
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(data.median(numeric_only=True), inplace=True)

# Cap very large values to avoid overflow issues (adjust threshold if needed)
threshold = 1e10
data = data.clip(upper=threshold)

# Separate features and target
X = data.drop(columns=['URL_Type_obf_Type'])
y = data['URL_Type_obf_Type']

# Apply Fisher Score (ANOVA F-value) for feature ranking
fisher_scores, _ = f_classif(X, y)

# Create a DataFrame for features and their Fisher Scores
fisher_results = pd.DataFrame({
    'Feature': X.columns,
    'Fisher_Score': fisher_scores
}).sort_values(by='Fisher_Score', ascending=False)

# Select only the top 40 features
top_40_features = fisher_results.head(40)
selected_features = top_40_features['Feature'].values

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')

# Generate classification report
classification_rep = classification_report(y_test, y_pred)

# Display the selected features along with their Fisher Scores
print("Top 40 Features with Fisher Scores:")
for i, row in enumerate(top_40_features.itertuples(), 1):
    print(f"{i}. {row.Feature}: {row.Fisher_Score:.6f}")

# Print evaluation metrics
print("\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:\n", classification_rep)


  f = msb / msw


Top 40 Features with Fisher Scores:
1. SymbolCount_Domain: 3528.923323
2. domain_token_count: 3505.067755
3. tld: 3505.067755
4. Entropy_Afterpath: 3473.507167
5. NumberRate_AfterPath: 3424.144460
6. ArgUrlRatio: 3114.109486
7. domainUrlRatio: 2788.370617
8. URLQueries_variable: 2653.092646
9. SymbolCount_FileName: 2647.994462
10. argPathRatio: 2551.406432
11. delimeter_path: 2525.575957
12. delimeter_Count: 2524.101710
13. pathurlRatio: 2490.365833
14. SymbolCount_Extension: 2336.347730
15. SymbolCount_URL: 2301.843351
16. NumberofDotsinURL: 2211.422943
17. Arguments_LongestWordLength: 2152.619875
18. SymbolCount_Afterpath: 2059.704300
19. CharacterContinuityRate: 1973.611670
20. domainlength: 1951.540466
21. host_letter_count: 1891.533403
22. Extension_DigitCount: 1874.519721
23. spcharUrl: 1549.892671
24. SymbolCount_Directoryname: 1486.539534
25. Entropy_Extension: 1445.690235
26. avgdomaintokenlen: 1370.316894
27. Query_DigitCount: 1363.660739
28. URL_DigitCount: 1360.412324
29. E

**MISSING VALUE R**ATIO

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('/content/All.csv')  # Update with your file path

# Calculate the missing value ratio for each feature
missing_ratio = data.isnull().mean()

# Select the top 40 features with the highest missing value ratios
top_40_missing_features = missing_ratio.sort_values(ascending=False).head(40).index

# Display the top 40 features with their missing value ratios
print("Top 40 Features with Highest Missing Value Ratios:")
for feature in top_40_missing_features:
    print(f"{feature}: {missing_ratio[feature]:.2%}")

# Filter the dataset to include only the selected features and the target variable
selected_data = data[top_40_missing_features.tolist() + ['URL_Type_obf_Type']]

# Separate features and target variable
X = selected_data.drop(columns=['URL_Type_obf_Type'])
y = selected_data['URL_Type_obf_Type']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing for numerical data: impute missing values with median
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data: impute missing values with most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Create a preprocessing and modeling pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('scaler', StandardScaler(with_mean=False)),  # StandardScaler doesn't support sparse matrices
                        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multiclass
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr', average='weighted')

# Print evaluation metrics
print("\nModel Evaluation Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"ROC AUC:   {roc_auc:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Top 40 Features with Highest Missing Value Ratios:
NumberRate_Extension: 27.60%
Entropy_DirectoryName: 23.07%
avgpathtokenlen: 0.76%
Entropy_Filename: 0.64%
Entropy_Extension: 0.11%
NumberRate_FileName: 0.03%
NumberRate_DirectoryName: 0.03%
Entropy_Afterpath: 0.02%
NumberRate_AfterPath: 0.01%
Querylength: 0.00%
path_token_count: 0.00%
domain_token_count: 0.00%
avgdomaintokenlen: 0.00%
longdomaintokenlen: 0.00%
tld: 0.00%
charcompvowels: 0.00%
dld_domain: 0.00%
dld_url: 0.00%
ldl_getArg: 0.00%
ldl_filename: 0.00%
ldl_path: 0.00%
ldl_domain: 0.00%
ldl_url: 0.00%
charcompace: 0.00%
dld_path: 0.00%
dld_filename: 0.00%
dld_getArg: 0.00%
urlLen: 0.00%
domainlength: 0.00%
pathLength: 0.00%
subDirLen: 0.00%
fileNameLen: 0.00%
executable: 0.00%
isPortEighty: 0.00%
NumberofDotsinURL: 0.00%
ISIpAddressInDomainName: 0.00%
CharacterContinuityRate: 0.00%
LongestVariableValue: 0.00%
URL_DigitCount: 0.00%
host_DigitCount: 0.00%

Model Evaluation Metrics:
Accuracy:  0.9695
Precision: 0.9698
Recall:    