In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Read the phishing dataset from disk and preview its first rows.
DATA_PATH = 'phishing.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [3]:
# Preview only the columns treated as important, plus the label column.
important_columns = [
    'UsingIP', 'LongURL', 'Redirecting//', 'HTTPS', 'SubDomains',
    'AnchorURL', 'WebsiteTraffic', 'PageRank', 'GoogleIndex', 'class',
]
new_data = df[important_columns]
new_data.head()

Unnamed: 0,UsingIP,LongURL,Redirecting//,HTTPS,SubDomains,AnchorURL,WebsiteTraffic,PageRank,GoogleIndex,class
0,1,1,1,1,0,0,0,-1,1,-1
1,1,0,1,-1,-1,0,1,-1,1,-1
2,1,0,1,-1,-1,0,1,-1,1,-1
3,1,0,1,1,1,0,0,-1,1,1
4,-1,0,-1,1,1,0,1,-1,1,1


In [4]:
# Columns fed to the model, and the name of the label column.
selected_features = [
    'UsingIP', 'LongURL', 'Redirecting//', 'HTTPS', 'SubDomains',
    'AnchorURL', 'WebsiteTraffic', 'PageRank', 'GoogleIndex',
]
target = 'class'

# Recode every -1 as 0 in both the feature frame and the label series.
# NOTE(review): the previews above show SubDomains holding -1, 0 and 1, so this
# recoding merges two distinct raw values for such columns - confirm that the
# loss of the -1 / 0 distinction is intended.
recode_negative = {-1: 0}
X = df[selected_features].replace(recode_negative)
y = df[target].replace(recode_negative)

In [5]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Report how many rows ended up in the training split and in the test split.
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 7737, Testing samples: 3317


In [7]:
# Show the (rows, columns) shape of the training split, then of the test split.
for split_frame in (X_train, X_test):
    print(split_frame.shape)

(7737, 9)
(3317, 9)


In [8]:
# Fit a standalone random forest on the training split.
# This fitted instance is the one whose feature_importances_ are read further
# down in the notebook.
base_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
)
base_model.fit(X_train, y_train)

In [9]:
# Wrap the forest in a sigmoid calibrator (cv=3) and fit it on the training split.
calibrated_model = CalibratedClassifierCV(
    base_model,
    method='sigmoid',
    cv=3,
)
calibrated_model.fit(X_train, y_train)

In [10]:
# Score the held-out split with the calibrated model:
# probability estimates first, then the predicted labels.
y_proba = calibrated_model.predict_proba(X_test)
y_pred = calibrated_model.predict(X_test)

In [11]:
# Print the overall accuracy (as a percentage rounded to two decimals),
# followed by the per-class classification report for the test split.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
class_names = ['Phishing', 'Legitimate']

print("\nDetailed Performance Report:")
print("\nAccuracy:", accuracy_pct, "%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))


Detailed Performance Report:

Accuracy: 90.68 %

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.88      0.91      0.90      1455
  Legitimate       0.93      0.91      0.92      1862

    accuracy                           0.91      3317
   macro avg       0.90      0.91      0.91      3317
weighted avg       0.91      0.91      0.91      3317



In [12]:
# Tabulate the importance score of each selected feature, smallest first.
# The scores are read from base_model (the standalone forest), not from
# calibrated_model.
importance_table = pd.DataFrame({
    'feature': selected_features,
    'importance': base_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance')

print("\n Important Features:")
print(feature_importance)


 Important Features:
          feature  importance
7        PageRank    0.015224
2   Redirecting//    0.015615
1         LongURL    0.016742
0         UsingIP    0.018539
8     GoogleIndex    0.018611
4      SubDomains    0.083089
6  WebsiteTraffic    0.106670
5       AnchorURL    0.108880
3           HTTPS    0.616630


In [15]:
# Feature names, in the column order used to build the model input.
features_taken = ['UsingIP', 'LongURL', 'Redirecting//', 'HTTPS', 'SubDomains',
                  'AnchorURL', 'WebsiteTraffic', 'PageRank', 'GoogleIndex']


def _describe_key_factors(features_dict):
    """Return the three-line HTTPS / links / traffic summary for one site."""
    https_status = "✅ HTTPS" if features_dict['HTTPS'] == 1 else "❌ No HTTPS"
    anchor_status = "✅ Good links" if features_dict['AnchorURL'] == 1 else "❌ Suspicious links"
    traffic_status = "✅ Normal traffic" if features_dict['WebsiteTraffic'] == 1 else "❌ Low traffic"
    return f"  - {https_status}\n  - {anchor_status}\n  - {traffic_status}"


def analyze_website(features_dict, model=None):
    """Classify one website described by a feature dictionary.

    Parameters
    ----------
    features_dict : dict
        Maps every name in ``features_taken`` to that feature's value.
        Key order does not matter; columns are arranged in
        ``features_taken`` order before prediction.
    model : object, optional
        Any object exposing ``predict`` and ``predict_proba``. When omitted,
        the notebook-level ``calibrated_model`` is used.

    Returns
    -------
    tuple
        ``(result, confidence, explanation)``: the verdict string, the
        probability of the predicted class as a percentage, and a
        three-line summary of the HTTPS, AnchorURL and WebsiteTraffic values.

    Raises
    ------
    KeyError
        If any name in ``features_taken`` is absent from ``features_dict``.
    """
    # Without this check a missing key becomes a NaN column and is passed on
    # to the model silently.
    missing = [name for name in features_taken if name not in features_dict]
    if missing:
        raise KeyError(f"Missing feature(s): {', '.join(missing)}")

    if model is None:
        model = calibrated_model

    # Build a one-row frame whose columns follow features_taken.
    website_df = pd.DataFrame([features_dict], columns=features_taken)
    prediction = model.predict(website_df)[0]
    probability = model.predict_proba(website_df)[0]

    # Label 0 is reported as phishing and anything else as legitimate; the
    # confidence is the probability at the matching index.
    if prediction == 0:
        result = "PHISHING WEBSITE"
        confidence = probability[0] * 100
    else:
        result = "LEGITIMATE WEBSITE"
        confidence = probability[1] * 100

    return result, confidence, _describe_key_factors(features_dict)

print(" ----- TESTING REAL-WORLD SCENARIOS -----")

# Three hand-written feature dictionaries used to smoke-test analyze_website.
# Values are 0/1, matching the training frame in which every -1 was recoded to 0.
# NOTE(review): the per-class means printed later in this notebook are higher
# for class 1 (legitimate) on most selected features (HTTPS, SubDomains,
# AnchorURL, WebsiteTraffic, UsingIP, ...), which suggests 1 is the
# legitimate-leaning value. Inline labels below that describe a value of 1 as
# suspicious (e.g. "Uses IP", "Lots of redirects") may therefore be inverted -
# confirm against the dataset's documentation before relying on them.
legit_website ={
    'UsingIP': 0,           # Might use IP (surprising!)
    'LongURL': 0,          # Normal URL length
    'Redirecting//': 0,     # Some redirects
    'HTTPS': 1,             # Has HTTPS
    'SubDomains': 0,        # Some subdomains
    'AnchorURL': 1,         # Good links
    'WebsiteTraffic': 1,    # Good traffic
    'PageRank': 1,         # Medium reputation
    'GoogleIndex': 1        # Indexed by Google
}


phishing_site ={
    'UsingIP': 1,           # Uses IP (suspicious)
    'LongURL': 1,          # Not necessarily long URLs
    'Redirecting//': 1,     # Lots of redirects
    'HTTPS': 0,            # No HTTPS
    'SubDomains': 1,       # Few subdomains
    'AnchorURL': 0,        # Suspicious links
    'WebsiteTraffic': 0,   # Low traffic
    'PageRank': 0,         # Low reputation
    'GoogleIndex': 1        # Might be indexed (surprising!)
}


# Keys are listed in a different order here; analyze_website rebuilds the
# frame with an explicit column list, so the order of keys does not matter.
mixed_site = {
    'UsingIP': 1, 'LongURL': 0, 'Redirecting//': 0, 'SubDomains': 1,
    'HTTPS': 0, 'AnchorURL': 1, 'WebsiteTraffic': 1, 'PageRank': 1, 'GoogleIndex': 1
}

# Pair each scenario with a display name, then classify and print each one.
test_sites = [("Secure Site", legit_website), 
              ("Phishing Site", phishing_site),
              ("Mixed Signals Site", mixed_site)]

for name, features in test_sites:
    # analyze_website returns (verdict string, confidence in percent, summary text).
    result, confidence, explanation = analyze_website(features)
    print(f"\n {name}:")
    print(f"   {result}")
    print(f"   Confidence: {confidence:.1f}%")
    print(f"   Key factors:\n{explanation}")

 ----- TESTING REAL-WORLD SCENARIOS -----

 Secure Site:
   LEGITIMATE WEBSITE
   Confidence: 97.1%
   Key factors:
  - ✅ HTTPS
  - ✅ Good links
  - ✅ Normal traffic

 Phishing Site:
   PHISHING WEBSITE
   Confidence: 95.5%
   Key factors:
  - ❌ No HTTPS
  - ❌ Suspicious links
  - ❌ Low traffic

 Mixed Signals Site:
   LEGITIMATE WEBSITE
   Confidence: 77.1%
   Key factors:
  - ❌ No HTTPS
  - ✅ Good links
  - ✅ Normal traffic


In [16]:
import joblib

In [17]:
# Persist the fitted calibrated model and the list of feature names to disk.
artifacts = (
    (calibrated_model, 'my_phishing_detector.pkl'),
    (selected_features, 'feature_names.pkl'),
)
for payload, filename in artifacts:
    joblib.dump(payload, filename)
print("🎉 Model saved!")

🎉 Model saved!


In [20]:
# METHOD 2: Use dictionary format (no column issues)
# The site is written in the raw dataset encoding (-1 / 1).
test_site = {
    'UsingIP': -1, 'LongURL': 1, 'Redirecting//': -1, 'HTTPS': 1,
    'SubDomains': -1, 'AnchorURL': -1, 'WebsiteTraffic': 1, 
    'PageRank': -1, 'GoogleIndex': 1
}

# Convert to a one-row DataFrame with the same column order as the training
# frame, and apply the same -1 -> 0 recoding that was applied to X before the
# model was fitted. Feeding raw -1 values would present the model with a
# value it never saw during training.
test_df = pd.DataFrame([test_site], columns=selected_features).replace({-1: 0})

prediction = calibrated_model.predict(test_df)
probability = calibrated_model.predict_proba(test_df)

# The label series was recoded with -1 -> 0 before fitting, so the phishing
# class is 0 here. Comparing against -1 could never match and always printed
# LEGITIMATE.
predicted_label = prediction[0]
print(f"Prediction: {'PHISHING' if predicted_label == 0 else 'LEGITIMATE'}")
print(f"Confidence: {probability[0].max()*100:.1f}%")

Prediction: LEGITIMATE
Confidence: 94.3%


In [21]:
# Compare the per-class mean of each selected feature on the original frame
# (df still holds the raw values; the recoded copies live in X and y).
is_phishing = df['class'] == -1
is_legitimate = df['class'] == 1
feature_columns = X.columns

print("🔍 Checking ACTUAL feature coding in dataset:")
print("For class = -1 (PHISHING) - what features do they have?")
print(df.loc[is_phishing, feature_columns].mean())
print("\nFor class = 1 (LEGITIMATE) - what features do they have?")
print(df.loc[is_legitimate, feature_columns].mean())

🔍 Checking ACTUAL feature coding in dataset:
For class = -1 (PHISHING) - what features do they have?
UsingIP           0.213804
LongURL          -0.682867
Redirecting//     0.770880
HTTPS            -0.479681
SubDomains       -0.209312
AnchorURL        -0.632020
WebsiteTraffic   -0.033694
PageRank         -0.586277
GoogleIndex       0.621401
dtype: float64

For class = 1 (LEGITIMATE) - what features do they have?
UsingIP           0.393536
LongURL          -0.593958
Redirecting//     0.718369
HTTPS             0.832223
SubDomains        0.281468
AnchorURL         0.365438
WebsiteTraffic    0.542797
PageRank         -0.401981
GoogleIndex       0.801202
dtype: float64


In [22]:
# Print the first five rows of the training features as plain text.
print(X_train.iloc[:5])


       UsingIP  LongURL  Redirecting//  HTTPS  SubDomains  AnchorURL  \
10168        0        0              1      0           1          0   
10167        0        0              0      0           0          0   
2115         1        0              1      0           0          0   
1699         1        0              1      1           1          0   
1379         1        0              1      1           0          1   

       WebsiteTraffic  PageRank  GoogleIndex  
10168               0         0            1  
10167               0         0            1  
2115                0         0            1  
1699                0         0            1  
1379                1         0            1  


In [186]:
import streamlit as st