In [1]:
import base64
import requests
import zipfile
import os
from datetime import datetime, timedelta

# Function to download and extract CSV from a dynamically generated URL
def download_and_extract_csv(start_date):
    # Convert the start date to the required string format
    str_start_date = start_date.strftime("%Y-%m-%d")
    print("Trying to download " + str_start_date + " data")
    
    # Encode the date to base64
    date_zip = str_start_date + ".zip"
    random_str = base64.b64encode(date_zip.encode("utf-8")).decode("utf-8")
    
    # Construct the URL
    url = "https://www.whoisds.com/whois-database/newly-registered-domains/" + random_str + "/nrd"
    
    # Make the request to download the ZIP file
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Define paths
        zip_folder = "Datasets/ZIPS"
        extract_folder = "Datasets"
        
        # Ensure the ZIP folder exists
        os.makedirs(zip_folder, exist_ok=True)
        
        # Save the ZIP file in the ZIPS folder
        zip_filename = os.path.join(zip_folder, date_zip)
        with open(zip_filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {zip_filename}")

        # Extract the CSV file from the ZIP into the Datasets folder
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(extract_folder)  # Extracts all files in the ZIP to the Datasets directory
        
        # Rename the extracted domain-names.txt file to include the date
        original_file = os.path.join(extract_folder, "domain-names.txt")
        new_file = os.path.join(extract_folder, f"domain-names-{str_start_date}.txt")
        if os.path.exists(original_file):
            os.rename(original_file, new_file)
            print(f"Renamed {original_file} to {new_file}")

        print(f"Extracted {zip_filename} to {extract_folder}")

    else:
        print(f"Failed to download data for {str_start_date}. Status code: {response.status_code}")

def get_latest_date_from_zips(zip_folder):
    # List all files in the ZIP folder
    files = os.listdir(zip_folder)
    # Filter out files that don't match the date pattern
    date_files = [f for f in files if f.endswith(".zip") and len(f) == 14]
    
    # Extract dates from filenames
    dates = [datetime.strptime(f[:10], "%Y-%m-%d") for f in date_files]
    
    # Return the latest date or None if no dates found
    return max(dates) if dates else None

# Main script
if __name__ == "__main__":
    zip_folder = "Datasets/ZIPS"
    latest_date = get_latest_date_from_zips(zip_folder)
    
    current_date = datetime.now().date()
    
    if latest_date:
        # Increment the latest date by one day to get the next date
        next_date = latest_date + timedelta(days=1)
    else:
        # If no date is found, set the default date to current date - 4
        next_date = current_date - timedelta(days=4)
    
    # Loop to download and extract all missing dates up to the current date
    while next_date < current_date:
        download_and_extract_csv(next_date)
        next_date += timedelta(days=1)

Trying to download 2024-05-30 data
Downloaded Datasets/ZIPS\2024-05-30.zip
Renamed Datasets\domain-names.txt to Datasets\domain-names-2024-05-30.txt
Extracted Datasets/ZIPS\2024-05-30.zip to Datasets
Trying to download 2024-05-31 data
Downloaded Datasets/ZIPS\2024-05-31.zip
Renamed Datasets\domain-names.txt to Datasets\domain-names-2024-05-31.txt
Extracted Datasets/ZIPS\2024-05-31.zip to Datasets
Trying to download 2024-06-01 data
Downloaded Datasets/ZIPS\2024-06-01.zip
Renamed Datasets\domain-names.txt to Datasets\domain-names-2024-06-01.txt
Extracted Datasets/ZIPS\2024-06-01.zip to Datasets
Trying to download 2024-06-02 data
Downloaded Datasets/ZIPS\2024-06-02.zip
Renamed Datasets\domain-names.txt to Datasets\domain-names-2024-06-02.txt
Extracted Datasets/ZIPS\2024-06-02.zip to Datasets


In [3]:
import base64
import requests
import zipfile
import os
from datetime import datetime, timedelta

# Function to download and extract CSV from a dynamically generated URL
def download_and_extract_csv(start_date):
    # Convert the start date to the required string format
    str_start_date = start_date.strftime("%Y-%m-%d")
    print("Trying to download " + str_start_date + " data")
    
    # Encode the date to base64
    date_zip = str_start_date + ".zip"
    random_str = base64.b64encode(date_zip.encode("utf-8")).decode("utf-8")
    
    # Construct the URL
    url = "https://www.whoisds.com/whois-database/newly-registered-domains/" + random_str + "/nrd"
    
    # Make the request to download the ZIP file
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Define paths
        zip_folder = "Datasets/ZIPS"
        extract_folder = "Datasets"
        
        # Ensure the ZIP folder exists
        os.makedirs(zip_folder, exist_ok=True)
        
        # Save the ZIP file in the ZIPS folder
        zip_filename = os.path.join(zip_folder, date_zip)
        with open(zip_filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {zip_filename}")

        # Extract the CSV file from the ZIP into the Datasets folder
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(extract_folder)  # Extracts all files in the ZIP to the Datasets directory
        
        # Rename the extracted domain-names.txt file to include the date
        original_file = os.path.join(extract_folder, "domain-names.txt")
        new_file = os.path.join(extract_folder, f"domain-names-{str_start_date}.txt")
        if os.path.exists(original_file):
            os.rename(original_file, new_file)
            print(f"Renamed {original_file} to {new_file}")

        print(f"Extracted {zip_filename} to {extract_folder}")

    else:
        print(f"Failed to download data for {str_start_date}. Status code: {response.status_code}")

# Main script
if __name__ == "__main__":
    start_date = datetime(2024, 4, 1).date()
    current_date = datetime.now().date()
    
    # Loop to download and extract all missing dates up to the current date
    next_date = start_date
    while next_date < current_date:
        download_and_extract_csv(next_date)
        next_date += timedelta(days=1)


Trying to download 2024-04-01 data
Downloaded Datasets/ZIPS\2024-04-01.zip


BadZipFile: File is not a zip file