<a href="https://colab.research.google.com/github/PRAKASHMS7/Phishing-Detection-By-Using-ML-Models/blob/main/Embedded_Based_Approaches/EMBEDDED_BASED_TECHNIQUES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**REGULARIZATION TECHNIQUE (L1 / LASSO)**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

# Purpose: rank features with an L1-penalised logistic regression, keep the
# 40 highest-ranked ones, then train and evaluate a logistic-regression
# classifier on those features only.
#
# Every statistic that is *fitted* (imputation means, scaler parameters, the
# L1 selector) is computed on the training split only, so the held-out rows
# never influence feature selection or preprocessing.

# Load your dataset
data = pd.read_csv('/content/All.csv')

# Separate features and target variable
X = data.drop('URL_Type_obf_Type', axis=1)  # Features
y = data['URL_Type_obf_Type']  # Target

# Convert categorical target to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Treat infinite values as missing. A new frame is produced instead of
# mutating in place, so re-running the cell is idempotent.
X = X.replace([np.inf, -np.inf], np.nan)

# Split first: everything fitted below must only see the training rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Impute missing values with the *training* column means
train_means = X_train_raw.mean()
X_train_raw = X_train_raw.fillna(train_means)
X_test_raw = X_test_raw.fillna(train_means)

# Standardize features (fit on train, apply to both splits). The arrays are
# wrapped back into DataFrames so columns can be selected by name below.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Apply L1 Regularization (Lasso) for feature selection
logreg = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
logreg.fit(X_train_scaled, y_train)

# Per-feature importance: absolute coefficients summed over the rows of coef_
importance = np.abs(logreg.coef_).sum(axis=0)

# Take the 40 highest-importance column positions, then sort the positions so
# the selected features keep the order in which they appear in X
# (argsort alone would return them in ascending-importance order).
top_40_indices = np.sort(np.argsort(importance)[-40:])
selected_features = X.columns[top_40_indices]  # Feature names, dataset order

# Keep only the selected features. The *scaled* values are used so the final
# classifier actually benefits from the standardization step.
X_train = X_train_scaled[selected_features].copy()
X_test = X_test_scaled[selected_features].copy()

# Rename columns with serial numbers (1, 2, 3, ...) on both splits so the
# column names seen at fit time and predict time match.
numbered_columns = [f"{i+1}. {col}" for i, col in enumerate(selected_features)]
X_train.columns = numbered_columns
X_test.columns = numbered_columns

# Train Logistic Regression classifier on the selected features
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

# Compute evaluation metrics (class-weighted averages; one-vs-rest AUC)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')

# Generate classification report
class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

# Print results
print("Selected Features in Serial Order:")
for i, feature in enumerate(selected_features, 1):
    print(f"{i}. {feature}")

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc_score:.4f}")

print("\nClassification Report:")
print(class_report)


Selected Features in Serial Order:
1. Directory_LetterCount
2. NumberRate_Domain
3. delimeter_path
4. tld
5. dld_domain
6. domain_token_count
7. Entropy_Afterpath
8. host_letter_count
9. SymbolCount_URL
10. Extension_DigitCount
11. host_DigitCount
12. avgdomaintokenlen
13. SymbolCount_Afterpath
14. charcompvowels
15. domainlength
16. URLQueries_variable
17. argPathRatio
18. ldl_url
19. path_token_count
20. delimeter_Count
21. Extension_LetterCount
22. longdomaintokenlen
23. URL_DigitCount
24. ArgUrlRatio
25. SymbolCount_Domain
26. ldl_getArg
27. pathDomainRatio
28. argDomanRatio
29. Query_DigitCount
30. subDirLen
31. pathLength
32. domainUrlRatio
33. LongestVariableValue
34. pathurlRatio
35. Query_LetterCount
36. LongestPathTokenLength
37. urlLen
38. ArgLen
39. Querylength
40. URL_Letter_Count

Accuracy: 0.7834
Precision: 0.7841
Recall: 0.7834
F1 Score: 0.7795
AUC Score: 0.9503

Classification Report:
              precision    recall  f1-score   support

  Defacement       0.80      0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**RANDOM FOREST**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

# Purpose: rank features by Random Forest importance, keep the 40
# highest-ranked ones (listed in dataset column order), then retrain and
# evaluate a Random Forest on those features only.
#
# Imputation means and scaler statistics are computed on the training split
# only, so the held-out rows do not influence preprocessing.

# Load your dataset
data = pd.read_csv('/content/All.csv')

# Separate features and target variable
X = data.drop('URL_Type_obf_Type', axis=1)  # Features
y = data['URL_Type_obf_Type']  # Target

# Convert target variable to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Treat infinite values as missing. A new frame is produced instead of
# mutating in place, so re-running the cell is idempotent.
X = X.replace([np.inf, -np.inf], np.nan)

# Split the data into train and test sets before fitting any statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Impute with training means, then standardize (fit on train only)
train_means = X_train_raw.mean()
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw.fillna(train_means))
X_test = scaler.transform(X_test_raw.fillna(train_means))

# Train a Random Forest model on all features to obtain importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances, labelled by column name
importances = pd.Series(rf.feature_importances_, index=X.columns)

# Extract the 40 most important features
important_features = importances.sort_values(ascending=False).head(40)

# Keep features in the order they appear in the dataset. The boolean mask is
# computed once and reused for the names and for slicing both arrays.
selected_mask = X.columns.isin(important_features.index)
selected_features = X.columns[selected_mask]

# Number the selected features
numbered_features = [f"{i}. {feature}" for i, feature in enumerate(selected_features, 1)]

# Select only the important features (columns of the scaled arrays line up
# with X.columns, so the same mask applies)
X_selected_train = X_train[:, selected_mask]
X_selected_test = X_test[:, selected_mask]

# Train a new Random Forest model with selected features
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selected.fit(X_selected_train, y_train)

# Make predictions
y_pred = rf_selected.predict(X_selected_test)
y_prob = rf_selected.predict_proba(X_selected_test)

# Calculate performance metrics (class-weighted averages; one-vs-rest AUC)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')

# Print the selected features (dataset order, not importance order)
print("Top 40 Important Features (with Serial Numbers):")
for feature in numbered_features:
    print(feature)

# Print performance metrics
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc_score:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Top 40 Important Features (with Serial Numbers):
1. domain_token_count
2. avgdomaintokenlen
3. longdomaintokenlen
4. avgpathtokenlen
5. tld
6. ldl_path
7. urlLen
8. domainlength
9. pathLength
10. subDirLen
11. pathurlRatio
12. ArgUrlRatio
13. argDomanRatio
14. domainUrlRatio
15. pathDomainRatio
16. argPathRatio
17. NumberofDotsinURL
18. CharacterContinuityRate
19. LongestVariableValue
20. URL_DigitCount
21. Extension_DigitCount
22. URL_Letter_Count
23. host_letter_count
24. Directory_LetterCount
25. Filename_LetterCount
26. Extension_LetterCount
27. LongestPathTokenLength
28. Domain_LongestWordLength
29. Arguments_LongestWordLength
30. spcharUrl
31. delimeter_path
32. NumberRate_URL
33. NumberRate_FileName
34. SymbolCount_URL
35. SymbolCount_Domain
36. SymbolCount_Directoryname
37. SymbolCount_FileName
38. SymbolCount_Extension
39. Entropy_Domain
40. Entropy_DirectoryName

Accuracy: 0.9773
Precision: 0.9775
Recall: 0.9773
F1 Score: 0.9773
AUC Score: 0.9990

Classification Report:
     

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.feature_selection import mutual_info_classif, VarianceThreshold, SelectKBest, SelectFromModel
from time import time

# Purpose: three-stage feature selection (low-variance filter -> mutual
# information top-60 -> L1 logistic regression, capped at 40) followed by a
# logistic-regression classifier evaluated on a held-out split.
#
# The data is split BEFORE any fitted step. Clipping bounds, the variance
# filter, the mutual-information ranking, the scaler, and the L1 selector are
# all fitted on the training rows only and merely applied to the test rows,
# so test labels/values cannot leak into feature selection.

# Load dataset
print("Loading data...")
data = pd.read_csv('/content/All.csv')

# Define X (features) and y (target)
X = data.drop(columns=['URL_Type_obf_Type'])
y = data['URL_Type_obf_Type']

# Treat infinite values as missing (stateless, safe to do before splitting)
X = X.replace([np.inf, -np.inf], np.nan)

# Split the data
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Impute missing values with the training column means
print("Handling missing values...")
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# 2. Clip extreme values to the training 1st/99th percentiles.
#    DataFrame.clip with Series bounds and axis=1 aligns the bounds to the
#    columns, replacing the per-column apply.
print("Clipping extreme values...")
lower = X_train.quantile(0.01)
upper = X_train.quantile(0.99)
X_train = X_train.clip(lower=lower, upper=upper, axis=1)
X_test = X_test.clip(lower=lower, upper=upper, axis=1)

# Track surviving column names through each selector so the final selection
# can be reported by name rather than by position.
feature_names = X.columns.to_numpy()

# 3. Remove low variance features
print("Removing low variance features...")
var_thresh = VarianceThreshold(threshold=0.01)
X_train = var_thresh.fit_transform(X_train)
X_test = var_thresh.transform(X_test)
feature_names = feature_names[var_thresh.get_support()]


def _seeded_mutual_info(features, target):
    """Score features with mutual information using a fixed seed for repeatable rankings."""
    return mutual_info_classif(features, target, random_state=42)


# 4. Select top 60 features using Mutual Information
print("Selecting top features with Mutual Information...")
start_time = time()
mi_selector = SelectKBest(_seeded_mutual_info, k=min(60, X_train.shape[1]))  # Ensure k <= number of features
X_train = mi_selector.fit_transform(X_train, y_train)
X_test = mi_selector.transform(X_test)
feature_names = feature_names[mi_selector.get_support()]
print(f"Feature selection took {time() - start_time:.2f} seconds")

# Standardize features
print("Scaling features...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize Logistic Regression Model with optimized parameters
print("Initializing model...")
model = LogisticRegression(random_state=42, max_iter=300, solver='lbfgs', n_jobs=-1)

# Feature selection with L1 regularization
# NOTE(review): max_features is an upper bound; with the default threshold the
# selector may keep fewer than 40 features. Pass threshold=-np.inf if exactly
# 40 are required -- confirm intent.
print("Performing feature selection with L1 regularization...")
sfm = SelectFromModel(
    LogisticRegression(penalty='l1', solver='liblinear', random_state=42, max_iter=100),
    max_features=40  # Select at most 40 features
)
sfm.fit(X_train, y_train)
X_train_selected = sfm.transform(X_train)
X_test_selected = sfm.transform(X_test)

# Get selected feature indices (relative to the features kept by the
# mutual-information step) and map them back to column names
selected_features = sfm.get_support(indices=True)
selected_feature_names = feature_names[selected_features]
print(f"\nSelected {len(selected_features)} features")
for i, feature in enumerate(selected_feature_names, 1):
    print(f"{i}. {feature}")

# Train final model
print("Training final model...")
model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
y_probs = model.predict_proba(X_test_selected)

# Evaluate the model (macro averages; one-vs-rest AUC)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
roc_auc = roc_auc_score(y_test, y_probs, multi_class='ovr', average='macro')

# Display performance results
print("\nPerformance Metrics:")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
print(f"ROC AUC: {roc_auc * 100:.2f}%\n")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Loading data...
Handling missing values...
Clipping extreme values...
Removing low variance features...
Selecting top features with Mutual Information...
Feature selection took 22.37 seconds
Splitting data...
Scaling features...
Initializing model...
Performing feature selection with L1 regularization...

Selected 40 features
Training final model...

Performance Metrics:
Accuracy: 80.48%
Precision: 80.43%
Recall: 80.13%
F1 Score: 80.05%
ROC AUC: 95.89%


Classification Report:
              precision    recall  f1-score   support

  Defacement       0.84      0.83      0.83      1628
      benign       0.79      0.90      0.84      1526
     malware       0.77      0.62      0.69      1332
    phishing       0.75      0.79      0.77      1497
        spam       0.87      0.87      0.87      1359

    accuracy                           0.80      7342
   macro avg       0.80      0.80      0.80      7342
weighted avg       0.80      0.80      0.80      7342

