In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset
# NOTE: the path below is a placeholder; point it at the local copy of the CSV.
file_path = 'Add_your_CSV_file_path/Phishing_Legitimate_full.csv'
data = pd.read_csv(file_path)

# Display basic information
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()



In [None]:
# Number of missing entries in each column
print("\nMissing Values:", data.isnull().sum(), sep="\n")

# Summary statistics as reported by DataFrame.describe()
print("\nDataset Description:", data.describe(), sep="\n")

In [None]:
# Count how many rows carry each class label
print("\nClass Distribution:", data['CLASS_LABEL'].value_counts(), sep="\n")

# Bar chart of the same counts
sns.countplot(data=data, x='CLASS_LABEL')
plt.title("Class Distribution (0 = Legitimate, 1 = Phishing)")
plt.show()

In [None]:
# Target is the class label; predictors are every column except the row id and the label
y = data['CLASS_LABEL']
X = data.drop(['id', 'CLASS_LABEL'], axis=1)

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training partition only, then apply the same
# transformation to the test partition
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nData successfully preprocessed and split into train-test sets.")


In [None]:
# All three classifiers are trained and evaluated on the standardized features,
# so any of them can be paired with `scaler` at inference time.

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)
log_pred = log_model.predict(X_test_scaled)

# Random Forest
# Fitted on the scaled matrix (not the raw X_train): predict_url() passes the
# model scaler-transformed inputs, so a forest trained on raw values would be
# queried on a different feature scale than it learned from. Standardization
# is a per-feature monotonic rescaling, so the learned splits are not expected
# to change materially.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)

# Support Vector Machine (SVM)
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)

In [None]:
# Evaluation Function
def evaluate_model(name, y_true, y_pred):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Args:
        name: Display name of the model, used in the printed header and the
            plot title.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model, aligned with ``y_true``.
    """
    print(f"\n{name} Evaluation:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    # Print the report on its own line: passing it as a second print() argument
    # inserts a space before its first line and misaligns the header row.
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', cmap='Blues')
    # confusion_matrix puts true labels on rows and predicted labels on columns.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title(f"{name} Confusion Matrix")
    plt.show()

In [None]:

# Evaluate the Logistic Regression predictions on the test split
evaluate_model(name="Logistic Regression", y_true=y_test, y_pred=log_pred)

In [None]:
# Evaluate the Random Forest predictions on the test split
evaluate_model(name="Random Forest", y_true=y_test, y_pred=rf_pred)

In [None]:
# Evaluate the SVM predictions on the test split
evaluate_model(name="SVM", y_true=y_test, y_pred=svm_pred)

In [None]:
# Feature names (columns of X) and the importances reported by the fitted forest
features = X.columns
importances = rf_model.feature_importances_

# Draw one bar per feature on a 12x8 inch figure
plt.figure(figsize=(12, 8))
sns.barplot(y=features, x=importances)
plt.title("Feature Importance (Random Forest)")
plt.show()


In [None]:
import pickle

# Persist the training column names (and their order) so they can be reloaded later
training_features = X.columns  # column index of the feature DataFrame X
with open('training_features.pkl', mode='wb') as features_file:
    pickle.dump(training_features, features_file)


In [None]:
import pickle
import pandas as pd

# Restore the feature names that were pickled after training
with open('training_features.pkl', mode='rb') as features_file:
    training_features = pickle.load(features_file)

# Real-time URL prediction function
def predict_url(url, model, scaler):
    """Classify a single URL as phishing or legitimate and print the verdict.

    The URL is turned into a one-row feature frame, aligned to the column
    order stored in the module-level ``training_features``, standardized with
    ``scaler`` and passed to ``model``.

    Args:
        url: The URL string to classify.
        model: Fitted classifier exposing ``predict``. Because the features
            are standardized before prediction, the model must have been
            trained on features transformed by the same ``scaler``.
        scaler: Fitted transformer exposing ``transform``.

    Returns:
        ``"Phishing"`` if the model predicts class 1, otherwise
        ``"Legitimate"``.

    Note:
        Relies on ``extract_features(url)``, which is not defined in the cells
        visible here; presumably it returns a mapping of feature name to
        value. Confirm it is defined before this function is called.
    """
    print("\nExtracting features for URL:", url)
    features = extract_features(url)

    # Build a one-row frame aligned to the training schema in a single step:
    # features the extractor did not produce are filled with 0, unknown extras
    # are dropped, and columns are put in the training order.
    feature_df = pd.DataFrame([features]).reindex(columns=training_features, fill_value=0)

    # Standardize features using the scaler
    feature_scaled = scaler.transform(feature_df)

    # Make prediction
    prediction = model.predict(feature_scaled)
    label = "Phishing" if prediction[0] == 1 else "Legitimate"
    print("Prediction:", label)
    return label


# Function to accept user input URLs and test
def user_input_test(model, scaler):
    """Prompt for comma-separated URLs and classify each one with predict_url.

    Args:
        model: Fitted classifier forwarded to ``predict_url``.
        scaler: Fitted scaler forwarded to ``predict_url``.
    """
    # Ask the user to input URLs (comma separated)
    raw_entries = input("Enter URLs separated by commas: ").split(',')

    # Trim surrounding whitespace and drop blank entries (empty input, trailing
    # or doubled commas) so no prediction is attempted for an empty string.
    user_urls = [entry.strip() for entry in raw_entries if entry.strip()]

    for url in user_urls:
        predict_url(url, model, scaler)

# Interactive check: classify URLs typed by the user with the Random Forest model.
# NOTE(review): predict_url standardizes features with `scaler` before calling the
# model, so verify that the model passed here was trained on standardized features.
user_input_test(model=rf_model, scaler=scaler)