In [None]:
import json
from time import sleep
from waybackpy import WaybackMachineCDXServerAPI
from tranco import Tranco
from tqdm import tqdm

In [None]:
# Default cap on how many websites fetch_french_websites() returns.
TOTAL_NUMBER_OF_WEBSITES = 1000

# User-Agent string handed to the Wayback Machine CDX client.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0"
# Bounds of the snapshot search window, passed as the CDX start/end timestamps.
# NOTE(review): presumably YYYYMM (April 2016 to April 2021) -- confirm.
START_DATE_RANGE = 201604
END_DATE_RANGE = 202104

DATE_TRANCO_LIST='2019-02-20' # Publication date of the Tranco list to fetch

In [None]:
def fetch_french_websites(top_n=TOTAL_NUMBER_OF_WEBSITES):
    """
    Return the first ``top_n`` ``.fr`` domains of the Tranco list.

    The Tranco list for ``DATE_TRANCO_LIST`` is obtained through the Tranco
    client (cached under the local ``.tranco`` directory) and reduced to the
    entries whose name ends with ``.fr``.

    Args:
        top_n: Maximum number of domains to return.

    Returns:
        A list of at most ``top_n`` domain names, in the order in which they
        appear in the Tranco list.
    """
    # Initializing tranco-list.eu list.
    tranco_client = Tranco(cache=True, cache_dir='.tranco')

    # Get tranco list for the specific date
    list_for_date = tranco_client.list(date=DATE_TRANCO_LIST)

    # Match on the suffix only: a plain substring test ('.fr' in web) also
    # accepts unrelated hosts such as 'www.fresh.com' or 'cdn.france24.com'.
    french_websites = [web for web in list_for_date.list if web.endswith('.fr')]

    return french_websites[:top_n]

In [None]:
def fetch_archived_urls(website, start_date, end_date, collapse_by, depth = 0,
                        max_depth=2, retry_delay=30):
    """
    Fetches archived URLs for a specific website within a date range using Wayback Machine API.

    The CDX query is attempted once per depth level from ``depth`` up to and
    including ``max_depth``. A failed attempt is reported and, if another
    attempt remains, followed by a pause of ``retry_delay`` seconds.

    Args:
        website: URL or domain to look up.
        start_date: Start timestamp forwarded to the CDX API.
        end_date: End timestamp forwarded to the CDX API.
        collapse_by: CDX collapse expression (e.g. ``"timestamp:6"``).
        depth: Number of attempts already used; callers normally leave it at 0.
        max_depth: Highest depth at which an attempt is still made.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        A list of ``(archive_url, date)`` tuples, where ``date`` is the first
        eight characters of the timestamp segment of the archive URL. An empty
        list is returned when every attempt failed or ``depth`` already
        exceeds ``max_depth``.
    """
    for attempt in range(depth, max_depth + 1):
        # A fresh client per attempt, so a retry never reuses a failed iterator.
        cdx_api = WaybackMachineCDXServerAPI(
            url=website,
            user_agent=USER_AGENT,
            start_timestamp=start_date,
            end_timestamp=end_date,
            collapses=[collapse_by]
        )

        # Results of a failed attempt are discarded; each attempt starts empty.
        archived_urls = []
        try:
            for snapshot in cdx_api.snapshots():
                # The fifth '/'-separated segment of the archive URL is read as
                # the snapshot timestamp; keep its first 8 characters.
                # NOTE(review): assumes https://web.archive.org/web/<ts>/<url>.
                date_of_snapshot = snapshot.archive_url.split('/')[4][:8]
                archived_urls.append((snapshot.archive_url, date_of_snapshot))
        except Exception as error:
            # Surface the failure instead of swallowing it silently.
            tqdm.write(f"[{website}] attempt {attempt + 1} failed: {error!r}")
            # Only wait when another attempt will actually follow.
            if attempt < max_depth:
                sleep(retry_delay)
        else:
            return archived_urls

    return []


In [None]:
def process_websites(websites):
    """
    Build a mapping from every website to its archived snapshot URLs.

    Each site is queried over the configured START_DATE_RANGE..END_DATE_RANGE
    window with the "timestamp:6" collapse expression, and the function
    pauses for 20 seconds after every site.
    """
    snapshots_by_site = {}

    # tqdm wraps the iterable to show progress over the sites
    for site in tqdm(websites):
        snapshots_by_site[site] = fetch_archived_urls(
            site,
            START_DATE_RANGE,
            END_DATE_RANGE,
            "timestamp:6",
        )
        # Pause between sites so the API is not hit too quickly
        sleep(20)

    return snapshots_by_site

In [None]:
def createJSON(filename, dictionary):
    """
    Serialize ``dictionary`` as JSON, indented by 4 spaces, into ``filename``.

    The target file is opened in write mode, so existing content is replaced.
    """
    with open(filename, mode='w') as json_file:
        json.dump(obj=dictionary, fp=json_file, indent=4)

In [None]:
def create_dictionary_of_urls(websites):
    """
    Collect the archived URLs of ``websites`` and save them as JSON.

    The mapping produced by process_websites() is written to
    "dictionary_of_urls_1000.json"; nothing is returned.
    """
    # Gather the snapshots, then hand the mapping straight to the JSON writer
    createJSON("dictionary_of_urls_1000.json", process_websites(websites))

In [None]:
# Run the full pipeline: select the French websites from the Tranco list,
# then fetch their archived snapshot URLs and store them in a JSON file.
if __name__ == "__main__":
    french_websites = fetch_french_websites()
    create_dictionary_of_urls(french_websites)