In [33]:
import pandas as pd
import re
import tldextract
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [44]:
import pandas as pd
from sklearn.utils import shuffle

# Read the merged phishing dataset from disk (point this at your own CSV file).
df = pd.read_csv(
    'C:\\Users\\KIIT\\Desktop\\cyberProject\\merged_phishing_dataset.csv'
)

# Normalise the headers: trim surrounding whitespace and lower-case each name,
# so the 'label' column can be addressed by a predictable key below.
df.columns = [column.strip().lower() for column in df.columns]

# Fix the label column
def clean_label(label):
    """Map a raw label value to a binary class id.

    The value is stringified, stripped of surrounding whitespace and
    lower-cased before matching, so ``' Phishing '`` and ``'PHISHING'`` are
    treated alike.

    Parameters
    ----------
    label : Any
        Raw cell value from the label column (string, int, float, None, ...).

    Returns
    -------
    int or None
        ``1`` for ``'phishing'`` or a numeric value equal to 1,
        ``0`` for ``'legitimate'`` or a numeric value equal to 0,
        ``None`` for anything else (including NaN / missing values).
    """
    token = str(label).strip().lower()

    if token == 'phishing':
        return 1
    if token == 'legitimate':
        return 0

    # Numeric labels may arrive as floats (pandas stores an integer column as
    # float as soon as it contains a missing value), in which case str() gives
    # '1.0' / '0.0'. Compare numerically instead of by exact string.
    try:
        numeric = float(token)
    except ValueError:
        return None  # Not a recognised name and not a number.

    if numeric == 1:
        return 1
    if numeric == 0:
        return 0
    return None  # NaN, inf, or any other number is an invalid label.

# Normalise every raw label to 1 (phishing) / 0 (legitimate); values that
# clean_label cannot interpret come back as None.
df['label'] = df['label'].apply(clean_label)

# Drop rows whose label could not be interpreted, then store the labels as
# integers. Once a None has been produced the column may no longer be held as
# an integer dtype, and the CSV would otherwise contain 1.0 / 0.0.
df = df.dropna(subset=['label']).astype({'label': int})

# Shuffle the dataset (fixed seed so the row order is reproducible)
df = shuffle(df, random_state=42).reset_index(drop=True)

# Optional: Check distribution
print(df['label'].value_counts())

# Save cleaned and shuffled data. The path is held in one variable so the
# confirmation message below always names the file that was actually written.
output_path = 'C://Users//KIIT//Desktop//cyberProject//Phishing URL dataset//URL_dataset.csv'
df.to_csv(output_path, index=False)

print(f"✅ Dataset cleaned, shuffled, and saved as '{output_path}'")


0    345738
1    168526
Name: label, dtype: int64
✅ Dataset cleaned, shuffled, and saved as 'cleaned_shuffled_dataset.csv'


In [45]:
# Re-load the cleaned dataset. This file is produced with DataFrame.to_csv,
# which writes UTF-8 by default, so it must be decoded as UTF-8 as well:
# reading it as latin1 turns every multi-byte character into several bogus
# characters, which distorts length-based URL features.
data = pd.read_csv("C://Users//KIIT//Desktop//cyberProject//Phishing URL dataset//URL_dataset.csv", encoding='utf-8')

# Rename columns for consistency. The rename is positional (first column is
# taken to be the URL, second the label), so fail with a clear message when
# the file does not have exactly two columns.
if data.shape[1] != 2:
    raise ValueError(
        f"Expected exactly 2 columns (url, label), found {data.shape[1]}: {list(data.columns)}"
    )
data.columns = ['url', 'label']


In [46]:
# Convert the label column to integer class ids
# (0 = legitimate, 1 = phishing).
label_encoder = LabelEncoder()
label_encoder.fit(data['label'])
data['label'] = label_encoder.transform(data['label'])

In [47]:
# Pull the raw URL strings (model inputs) and the class ids (targets)
# out of the frame.
urls, labels = data['url'], data['label']


In [49]:
# TLD suffixes that set the `is_suspicious_tld` feature. Built once instead of
# on every call; a frozenset gives constant-time membership tests.
_SUSPICIOUS_TLDS = frozenset({"tk", "ml", "ga", "cf", "gq"})


def extract_features(url):
    """Compute the lexical features of a single URL.

    Parameters
    ----------
    url : str
        The URL to describe. ``None`` / NaN (how pandas represents a missing
        cell) is treated as an empty string, and any other non-string value is
        converted with ``str()``, instead of raising ``TypeError``.

    Returns
    -------
    dict
        Feature name -> integer value, with keys in this fixed order:
        ``url_length``, ``num_dots``, ``has_ip``, ``has_at``, ``has_hyphen``,
        ``has_https``, ``num_subdirs``, ``domain_length``,
        ``is_suspicious_tld``. The order matters because it becomes the
        column order of the feature matrix built from these dicts.
    """
    # Missing URLs arrive from pandas as None or float NaN (NaN != NaN);
    # len() and the substring tests below would raise on those.
    if url is None or (isinstance(url, float) and url != url):
        url = ""
    elif not isinstance(url, str):
        url = str(url)

    features = {}
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    # Any four dot-separated digit groups anywhere in the URL count as an IP.
    features['has_ip'] = 1 if re.search(r"\d+\.\d+\.\d+\.\d+", url) else 0
    features['has_at'] = 1 if "@" in url else 0
    features['has_hyphen'] = 1 if "-" in url else 0
    # NOTE(review): this matches "https" anywhere in the URL (host, path or
    # query), not only the scheme. Left unchanged because a model trained on
    # these features depends on this definition; confirm it is intended.
    features['has_https'] = 1 if "https" in url.lower() else 0
    # Counts every '/', including the two that follow the scheme.
    features['num_subdirs'] = url.count('/')

    domain_info = tldextract.extract(url)
    features['domain_length'] = len(domain_info.domain)
    features['is_suspicious_tld'] = 1 if domain_info.suffix in _SUSPICIOUS_TLDS else 0

    return features


In [50]:
# Compute one feature dict per URL, then stack the dicts into the
# feature matrix; the targets are the encoded labels.
features = list(map(extract_features, urls))
X = pd.DataFrame(features)
y = labels


In [51]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [52]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 200 boosting rounds, depth-6 trees,
# learning rate 0.1. Fit on the training portion only.
model = XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
)
model.fit(X_train, y_train)


In [53]:
# Predict on the held-out rows and report the standard classification metrics.
y_pred = model.predict(X_test)

print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)
print(
    "Accuracy Score:",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)


Confusion Matrix:
 [[68246   929]
 [ 3451 30227]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     69175
           1       0.97      0.90      0.93     33678

    accuracy                           0.96    102853
   macro avg       0.96      0.94      0.95    102853
weighted avg       0.96      0.96      0.96    102853

Accuracy Score: 0.9574149514355439


In [56]:
# Persist the trained classifier to disk with joblib, then confirm on stdout.
joblib.dump(value=model, filename='phishing_model3.pkl')
print("\n✅ Model saved as phishing_model3.pkl")


✅ Model saved as phishing_model3.pkl
