In [1]:
import pandas as pd

# Load the dataset from the CSV file
df = pd.read_csv('Phishing_Legitimate_full.csv')

# Display the first 5 rows to confirm it loaded correctly
df.head()

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,1,3,1,5,72,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,2,3,1,3,144,0,0,0,0,2,...,0,0,0,1,-1,1,1,1,1,1
2,3,3,1,2,58,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,4,3,1,6,79,1,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,5,3,0,4,46,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


In [3]:
# Print a summary of the DataFrame
print("--- Data Info ---")
df.info()

print("\n--- Missing Values ---")
# Check for any missing values in each column
print(df.isnull().sum())

print("\n--- Label Distribution ---")
# See how many examples of each class (phishing/legitimate) we have
print(df['CLASS_LABEL'].value_counts())

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  10000 non-null  int64  
 1   NumDots                             10000 non-null  int64  
 2   SubdomainLevel                      10000 non-null  int64  
 3   PathLevel                           10000 non-null  int64  
 4   UrlLength                           10000 non-null  int64  
 5   NumDash                             10000 non-null  int64  
 6   NumDashInHostname                   10000 non-null  int64  
 7   AtSymbol                            10000 non-null  int64  
 8   TildeSymbol                         10000 non-null  int64  
 9   NumUnderscore                       10000 non-null  int64  
 10  NumPercent                          10000 non-null  int64  
 11  NumQueryComponents      

In [5]:
print(df.columns)

Index(['id', 'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',
       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',
       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',
       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',
       'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle',
       'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT',
       'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
       'PctExtNullSelfRedirectHyperl

In [6]:
from sklearn.model_selection import train_test_split

# --- 1. Separate Features (X) and Target (y) ---

# X contains all columns EXCEPT 'CLASS_LABEL' and 'id'
X = df.drop(['CLASS_LABEL', 'id'], axis=1) 

# y contains only the 'CLASS_LABEL' column
y = df['CLASS_LABEL']


# --- 2. Split the data into training and testing sets ---
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# --- 3. Print the shapes to confirm the split ---
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (8000, 48)
Shape of X_test: (2000, 48)
Shape of y_train: (8000,)
Shape of y_test: (2000,)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- 1. Create the model ---
# We'll start with 100 "detectives" (n_estimators=100)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# --- 2. Train the model ---
# This is where the model "learns" from your training data
model.fit(X_train, y_train)

# --- 3. Make predictions on the TEST data ---
# Let's see how our trained model does on data it has never seen before
y_pred = model.predict(X_test)

# --- 4. Evaluate the model's performance ---
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 98.20%


In [8]:
import joblib

# Define the filename for your model
filename = 'phishing_detector_model.joblib'

# Save the trained model to the file
joblib.dump(model, filename)

print(f"Model saved successfully to {filename}")

Model saved successfully to phishing_detector_model.joblib


In [2]:
# These are the features we can reliably calculate in our Flask app
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# These are the features we can reliably calculate
app_features = [
    'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
    'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
    'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
    'NumNumericChars', 'NoHttps', 'IpAddress', 'HostnameLength',
    'PathLength', 'QueryLength', 'NumSensitiveWords'
]

X_simple = df[app_features]
y_simple = df['CLASS_LABEL']

# Split and train the new model
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_simple, y_simple, test_size=0.2, random_state=42)
simple_model = RandomForestClassifier(n_estimators=100, random_state=42)
simple_model.fit(X_train_s, y_train_s)

# --- THIS IS THE IMPORTANT CHANGE ---
# Overwrite the old model file with our new, smarter model
joblib.dump(simple_model, 'phishing_detector_model.joblib') 
print("Old model file has been overwritten with the new 20-feature model!")

Old model file has been overwritten with the new 20-feature model!


In [3]:
from urllib.parse import urlparse
import re
import numpy as np

# This is the same logic from app.py, just for our notebook
EXPECTED_FEATURES = [
    'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
    'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
    'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
    'NumNumericChars', 'NoHttps', 'IpAddress', 'HostnameLength',
    'PathLength', 'QueryLength', 'NumSensitiveWords'
]

def extract_features_notebook(url):
    features = {}
    if not urlparse(url).scheme:
        url = "http://" + url
    
    parsed_url = urlparse(url)
    hostname = parsed_url.netloc
    path = parsed_url.path

    features['NumDots'] = url.count('.')
    features['SubdomainLevel'] = len(hostname.split('.')) - 2
    features['PathLevel'] = len(path.split('/')) - 1
    features['UrlLength'] = len(url)
    features['NumDash'] = url.count('-')
    features['NumDashInHostname'] = hostname.count('-')
    features['AtSymbol'] = 1 if '@' in url else 0
    features['TildeSymbol'] = 1 if '~' in url else 0
    features['NumUnderscore'] = url.count('_')
    features['NumPercent'] = url.count('%')
    features['NumQueryComponents'] = len(parsed_url.query.split('&')) if parsed_url.query else 0
    features['NumAmpersand'] = url.count('&')
    features['NumHash'] = url.count('#')
    features['NumNumericChars'] = sum(c.isdigit() for c in url)
    features['NoHttps'] = 0 if url.startswith('https') else 1
    features['IpAddress'] = 1 if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", hostname) else 0
    features['HostnameLength'] = len(hostname)
    features['PathLength'] = len(path)
    features['QueryLength'] = len(parsed_url.query)
    
    sensitive_words = ['secure', 'login', 'signin', 'bank', 'account', 'update', 'password', 'verify']
    features['NumSensitiveWords'] = sum(word in url.lower() for word in sensitive_words)
    
    final_features = [features.get(f_name, 0) for f_name in EXPECTED_FEATURES]
    return final_features

In [4]:
# Our new data samples (Label 1 = Phishing)
new_data = [
    ('my-bank-secure-login-update.com', 1),
    ('paypal-account-verification-service.com', 1),
    ('chase-online-support-team.net', 1),
    ('verify-your-apple-id-now.com', 1),
    ('amazon-customer-support-center.org', 1)
]

# Process the new data
processed_data = []
for url, label in new_data:
    features = extract_features_notebook(url)
    processed_data.append(features + [label])

# Create a new DataFrame for the new data
# We need the original 20 feature columns + the label column
new_data_df = pd.DataFrame(processed_data, columns=app_features + ['CLASS_LABEL'])

print("New data processed and ready:")
print(new_data_df)

# --- Combine Old and New Data ---

# Get the original 20 features + label from the main 'df'
original_data = df[app_features + ['CLASS_LABEL']]

# Combine the original data with our new samples
combined_df = pd.concat([original_data, new_data_df], ignore_index=True)

print("\n--- Data Shapes ---")
print(f"Original data shape: {original_data.shape}")
print(f"New data shape: {new_data_df.shape}")
print(f"Combined data shape: {combined_df.shape}")

New data processed and ready:
   NumDots  SubdomainLevel  PathLevel  UrlLength  NumDash  NumDashInHostname  \
0        1               0          0         38        4                  4   
1        1               0          0         46        3                  3   
2        1               0          0         36        3                  3   
3        1               0          0         35        4                  4   
4        1               0          0         41        3                  3   

   AtSymbol  TildeSymbol  NumUnderscore  NumPercent  ...  NumAmpersand  \
0         0            0              0           0  ...             0   
1         0            0              0           0  ...             0   
2         0            0              0           0  ...             0   
3         0            0              0           0  ...             0   
4         0            0              0           0  ...             0   

   NumHash  NumNumericChars  NoHttps  IpAddr

In [5]:
# --- We now re-train on this COMBINED dataset ---
X_combined = combined_df[app_features]
y_combined = combined_df['CLASS_LABEL']

# Split and train
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train_c, y_train_c)

# Check accuracy (it should be very similar)
y_pred_c = final_model.predict(X_test_c)
accuracy = accuracy_score(y_test_c, y_pred_c)
print(f"New Combined Model Accuracy: {accuracy * 100:.2f}%")

# --- Overwrite the model file ---
joblib.dump(final_model, 'phishing_detector_model.joblib')
print("\nModel file has been overwritten with the new, smarter data!")

New Combined Model Accuracy: 90.45%

Model file has been overwritten with the new, smarter data!
