In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [14]:
# Load datasets
legitimate_urls = pd.read_csv("C:/Users/Tejus Kapoor/Desktop/aa/dataset/legitimate_urls.txt", header=None, names=["url"])

phishing_urls = pd.read_csv('C:/Users/Tejus Kapoor/Desktop/aa/dataset/1000-phishing.txt', delimiter='\t', header=None, names=["url"])

In [15]:
# Label the datasets
legitimate_urls['label'] = 0  # Legitimate
phishing_urls['label'] = 1  # Phishing

In [16]:
# Combine the datasets
data = pd.concat([legitimate_urls, phishing_urls], ignore_index=True)

In [17]:
# Shuffle the dataset
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [18]:
# Feature extraction using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['url'])
y = data['label']


In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Define the models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": MultinomialNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(verbose=0),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300),
    "AdaBoost": AdaBoostClassifier()
}

In [21]:
# Dictionary to store accuracy scores
accuracy_scores = {}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[model_name] = accuracy
    print(f"{model_name} Accuracy: {accuracy:.2f}\n")

Training Logistic Regression...
Logistic Regression Accuracy: 0.92

Training Random Forest...
Random Forest Accuracy: 0.93

Training SVM...
SVM Accuracy: 0.92

Training Decision Tree...
Decision Tree Accuracy: 0.93

Training K-Nearest Neighbors...
K-Nearest Neighbors Accuracy: 0.77

Training Naive Bayes...
Naive Bayes Accuracy: 0.89

Training XGBoost...
XGBoost Accuracy: 0.89

Training CatBoost...


Parameters: { "use_label_encoder" } are not used.



CatBoost Accuracy: 0.91

Training MLP...
MLP Accuracy: 0.93

Training AdaBoost...




AdaBoost Accuracy: 0.91



In [22]:
# Display accuracy scores for all models
print("Comparison of Model Accuracy Scores:")
for model_name, accuracy in accuracy_scores.items():
    print(f"{model_name}: {accuracy:.2f}")

Comparison of Model Accuracy Scores:
Logistic Regression: 0.92
Random Forest: 0.93
SVM: 0.92
Decision Tree: 0.93
K-Nearest Neighbors: 0.77
Naive Bayes: 0.89
XGBoost: 0.89
CatBoost: 0.91
MLP: 0.93
AdaBoost: 0.91
