In [1]:
import requests
import pandas as pd

# OpenPhish Public Feed URL
openphish_url = "https://openphish.com/feed.txt"

try:
    # Fetch the phishing URLs
    response = requests.get(openphish_url)
    response.raise_for_status()  # Raise an error if request fails

    # Extract URLs from the response
    phishing_urls = response.text.split("\n")

    # Convert into a DataFrame
    openphish_df = pd.DataFrame({"URL": phishing_urls})
    openphish_df = openphish_df.dropna()  # Remove empty lines
    openphish_df["Label"] = "bad"  # Assign label

    # Save to CSV
    openphish_df.to_csv("openphish.csv", index=False)

    print(f"✅ Successfully fetched {len(openphish_df)} phishing URLs from OpenPhish!")

except requests.exceptions.RequestException as e:
    print(f"❌ Error fetching OpenPhish data: {e}")


✅ Successfully fetched 501 phishing URLs from OpenPhish!


In [2]:
import pandas as pd
import numpy as np

In [7]:
import pandas as pd

# Load Kaggle dataset
kaggle_df = pd.read_csv("../data/phishing_site_urls.csv")

# Load OpenPhish dataset
openphish_df = pd.read_csv("../data/openphish.csv")

# Load Alexa dataset
alexa_df = pd.read_csv("../data/top-1m.csv")


In [8]:
# Kaggle dataset (already labeled)
kaggle_df = kaggle_df[['URL', 'Label']]

# OpenPhish dataset (all are phishing)
openphish_df = openphish_df[['URL']]
openphish_df["Label"] = "bad"

# Alexa dataset (safe domains)
alexa_df.columns = ['Rank', 'URL']
alexa_df = alexa_df[['URL']]
alexa_df["Label"] = "good"

In [10]:
# Merge all datasets into one
final_df = pd.concat([kaggle_df, openphish_df, alexa_df], ignore_index=True)

# Save the final dataset
final_df.to_csv("../data/final_dataset.csv", index=False)

print(f"✅ Final dataset saved with {len(final_df)} records.")


✅ Final dataset saved with 1549846 records.
