In [1]:
import pandas as pd
import requests
import io
import zipfile

# 1. DOWNLOAD PHISHTANK DATA (Malicious)
print("Downloading PhishTank data...")
# PhishTank provides a verified 'online-valid' CSV
phish_url = "http://data.phishtank.com/data/online-valid.csv"
phish_df = pd.read_csv(phish_url)

# Labeling PhishTank as 1 (Phishing)
phish_df = phish_df[['url']]
phish_df['label'] = 1 

# 2. DOWNLOAD ALEXA TOP 1M DATA (Benign)
print("Downloading Alexa Top 1M data...")
# We use a public mirror for the Alexa Top 1M list
alexa_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
r = requests.get(alexa_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
alexa_df = pd.read_csv(z.open('top-1m.csv'), names=['rank', 'url'])

# Labeling Alexa as 0 (Benign)
# We take only the first 'N' rows to match the PhishTank count for a balanced dataset
alexa_df = alexa_df.head(len(phish_df))
alexa_df = alexa_df[['url']]
alexa_df['label'] = 0

# 3. COMBINE AND SAVE
print("Merging datasets...")
dataset = pd.concat([phish_df, alexa_df], ignore_index=True)

# Save to the local 'datasets' folder (ignored by git)
dataset.to_csv('../datasets/phishing_dataset.csv', index=False)
print("Finished! Labeled dataset saved to model/datasets/phishing_dataset.csv")

Downloading PhishTank data...
Downloading Alexa Top 1M data...
Merging datasets...
Finished! Labeled dataset saved to model/datasets/phishing_dataset.csv


In [2]:
import pandas as pd

# Load your new dataset
df = pd.read_csv('../datasets/phishing_dataset.csv')

# Check the count of each label
print("Label Counts:")
print(df['label'].value_counts())

Label Counts:
label
1    53247
0        1
Name: count, dtype: int64


In [3]:
import pandas as pd
import requests
import io
import zipfile

# 1. LOAD PHISHTANK (You already have this, but let's re-align)
print("Processing PhishTank...")
phish_url = "http://data.phishtank.com/data/online-valid.csv"
phish_df = pd.read_csv(phish_url)
phish_df = phish_df[['url']].copy()
phish_df['label'] = 1

# 2. LOAD ALEXA (The Benign Set)
print("Processing Alexa...")
# Using a reliable mirror for the Top 1M sites
alexa_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
r = requests.get(alexa_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
alexa_df = pd.read_csv(z.open('top-1m.csv'), names=['rank', 'url'])

# Balance the dataset: Take the same number of Alexa sites as PhishTank sites
alexa_df = alexa_df.head(len(phish_df)).copy()
alexa_df = alexa_df[['url']]
alexa_df['label'] = 0

# 3. CONCATENATE (The Merge)
# This appends the Alexa rows BELOW the PhishTank rows
df = pd.concat([phish_df, alexa_df], ignore_index=True)

# 4. SHUFFLE (To mix the 0s and 1s)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Final Check
print("New Label Counts:")
print(df['label'].value_counts())

Processing PhishTank...
Processing Alexa...
New Label Counts:
label
1    53247
0        1
Name: count, dtype: int64


In [None]:
import pandas as pd
import requests
import io
import zipfile

# 1. PHISHTANK (Malicious - Label 1)
print("Downloading PhishTank...")
phish_url = "http://data.phishtank.com/data/online-valid.csv"
phish_df = pd.read_csv(phish_url)
# Select only the URL column and add label
phish_df = phish_df[['url']].copy()
phish_df['label'] = 1

# 2. ALEXA MIRROR (Benign - Label 0)
print("Downloading Alexa Top 1M...")
# Using a 2026-stable mirror as the old s3 link is often restricted
alexa_url = "https://raw.githubusercontent.com/datasets/top-1m/master/top-1m.csv"
alexa_df = pd.read_csv(alexa_url, names=['rank', 'url'])

# CRITICAL: Match the column count of PhishTank for a balanced dataset
alexa_df = alexa_df.head(len(phish_df)).copy()
alexa_df = alexa_df[['url']]
alexa_df['label'] = 0

# 3. THE MERGE (Ensuring labels stay intact)
# We use ignore_index=True to create a clean new index for the 100k+ rows
df = pd.concat([phish_df, alexa_df], ignore_index=True)

# 4. FINAL VERIFICATION
print("\nFinal Label Verification:")
print(df['label'].value_counts())