In [None]:
import datetime
import json
import glob, os, os.path

from waybackpy import WaybackMachineCDXServerAPI
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tranco import Tranco
from tqdm import tqdm

In [None]:
# Default number of websites kept by fetch_french_websites (its `top_n` default).
TOTAL_NUMBER_OF_WEBSITES = 1000

# User-agent string handed to the Wayback Machine CDX client.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0"
# Start/end of the snapshot window, passed as the CDX start/end timestamps.
# NOTE(review): values look like YYYYMM (April 2016 - April 2021) -- confirm.
START_DATE_RANGE = 201604
END_DATE_RANGE = 202104


DATE_TRANCO_LIST='2019-02-20' # Date for fetching Tranco list

In [None]:
def fetch_french_websites(top_n=TOTAL_NUMBER_OF_WEBSITES):
    """Return the first ``top_n`` ``.fr`` domains from the Tranco list.

    The Tranco list published for ``DATE_TRANCO_LIST`` is fetched (and cached
    in the local ``.tranco`` directory), filtered down to domains whose
    top-level domain is ``.fr``, and truncated to ``top_n`` entries in the
    order the list provides them.

    Args:
        top_n: Maximum number of domains to return.

    Returns:
        A list of at most ``top_n`` domain strings ending in ``.fr``.
    """
    # Initializing tranco-list.eu list.
    tranco_client = Tranco(cache=True, cache_dir='.tranco')

    # Get tranco list for the specific date
    list_for_date = tranco_client.list(date=DATE_TRANCO_LIST)

    # Match on the suffix rather than on a substring: `'.fr' in web` would also
    # accept non-French domains such as "shop.france.com" or "a.free.net".
    french_websites = [
        web for web in list_for_date.list if web.lower().endswith('.fr')
    ]

    return french_websites[:top_n]

In [None]:
def fetch_archived_urls(website, start_date, end_date, collapse_by, depth = 0):
    """Collect Wayback Machine snapshot URLs of ``website`` within a date window.

    The CDX server is queried lazily while iterating over the snapshots, so a
    network failure can surface in the middle of the iteration. A failed
    attempt is reported, the partial result is discarded, and the query is
    retried from scratch after a pause.

    Args:
        website: Domain or URL to look up.
        start_date: Start of the window, forwarded as the CDX start timestamp.
        end_date: End of the window, forwarded as the CDX end timestamp.
        collapse_by: CDX collapse expression (callers in this notebook pass
            ``"timestamp:6"``).
        depth: Number of attempts already used up. At most two attempts are
            made in total, so any value above 1 returns ``[]`` immediately.

    Returns:
        A list of ``(archive_url, date)`` tuples, where ``date`` is the first
        eight characters of the timestamp segment of the archive URL, or
        ``[]`` if every attempt failed.
    """
    max_attempts = 2
    retry_delay_seconds = 30

    for attempt in range(depth, max_attempts):
        cdx_api = WaybackMachineCDXServerAPI(
            url=website,
            user_agent=USER_AGENT,
            start_timestamp=start_date,
            end_timestamp=end_date,
            collapses=[collapse_by],
        )
        # Start from an empty list on every attempt so that a failure halfway
        # through the iteration never leaks partial results.
        archived_urls = []
        try:
            for snapshot in cdx_api.snapshots():
                # The fifth "/"-separated piece of the archive URL is taken as
                # the snapshot timestamp; its first eight characters are kept.
                date_of_snapshot = snapshot.archive_url.split('/')[4][:8]
                archived_urls.append((snapshot.archive_url, date_of_snapshot))
            return archived_urls
        except Exception as e:
            print(f"{website}: attempt {attempt + 1}/{max_attempts} failed: {e}")
            # Only pause when another attempt follows; sleeping after the
            # final failure would just delay the caller.
            if attempt + 1 < max_attempts:
                sleep(retry_delay_seconds)

    # Either the attempt budget was already spent (depth > 1) or all attempts failed.
    return []


In [None]:
def go_over_months(website):
    """Fetch the archived URLs of ``website`` and report how many were found.

    The lookup spans START_DATE_RANGE..END_DATE_RANGE and collapses the
    results on "timestamp:6". The website and the number of URLs are printed
    as a progress line.

    Returns:
        A ``(urls, count)`` tuple: the list produced by fetch_archived_urls
        and its length.
    """
    monthly_snapshots = fetch_archived_urls(
        website,
        START_DATE_RANGE,
        END_DATE_RANGE,
        "timestamp:6",
    )
    snapshot_count = len(monthly_snapshots)

    print(website, snapshot_count)
    return monthly_snapshots, snapshot_count

In [None]:
def process_websites(websites):
    """Build a mapping from each website to its archived snapshot URLs.

    Every website is looked up with fetch_archived_urls over
    START_DATE_RANGE..END_DATE_RANGE (collapsed on "timestamp:6"), with a
    20-second pause after each lookup. Progress is shown through tqdm and a
    summary line is printed at the end.

    Returns:
        A dict keyed by website whose values are the lists returned by
        fetch_archived_urls.
    """
    archive_index = {}

    for site in tqdm(websites):
        site_snapshots = fetch_archived_urls(
            site, START_DATE_RANGE, END_DATE_RANGE, "timestamp:6"
        )
        archive_index[site] = site_snapshots
        # Pause between consecutive lookups.
        sleep(20)

    processed_total = len(archive_index)
    print(f"Processed: {processed_total} URLs")

    return archive_index

In [None]:
# Human-readable dump of the website -> snapshot mapping.
def print_dictionary_of_urls(dictory_of_urls):
    """Print every website followed by its (URL, date) entries.

    Each website gets a "Website: ..." header line, one "URL: ..., Date: ..."
    line per entry, and a trailing blank line.
    """
    for website in dictory_of_urls:
        print(f"Website: {website}")
        for entry in dictory_of_urls[website]:
            # Each entry is a two-item sequence: the URL and its date.
            url, date = entry
            print(f"URL: {url}, Date: {date}")
        print()

#print_dictionary_of_urls(dictory_of_urls)

In [None]:
def createJSON(filename, dictionary):
    """Write ``dictionary`` to ``filename`` as JSON indented by four spaces.

    An existing file at ``filename`` is overwritten.
    """
    with open(filename, 'w') as json_log:
        # Serialize to text first, then write it out in one go.
        serialized = json.dumps(dictionary, indent = 4)
        json_log.write(serialized)

In [None]:
def remJSON(directory=r"V:\Uni\Thesis\Code\Thesis"):
    """Delete every JSON file that sits directly inside ``directory``.

    The extension is matched case-insensitively, so both ``x.json`` and
    ``x.JSON`` are removed regardless of whether the platform's file system
    is case-sensitive. Sub-directories are never searched, and entries that
    are not regular files are skipped. A missing directory is not an error:
    nothing matches and nothing is removed.

    Args:
        directory: Folder to clean. The default is the original author's
            thesis folder, kept for backward compatibility; pass an explicit
            path when running the notebook elsewhere.
    """
    # Escape the directory so glob metacharacters in its name are taken
    # literally; the character classes make the extension case-insensitive.
    pattern = os.path.join(glob.escape(directory), "*.[jJ][sS][oO][nN]")
    for path in glob.glob(pattern):
        if os.path.isfile(path):
            os.remove(path)

In [None]:
def create_dictionary_of_urls(websites):
    """
    Look up the archived URLs of every website and persist the result.

    The mapping produced by process_websites is written to
    "dictionary_of_urls_1000.json" through createJSON. Nothing is returned.
    """
    # Gather the mapping and hand it straight to the JSON writer.
    createJSON("dictionary_of_urls_1000.json", process_websites(websites))

In [None]:
# Build and store the snapshot index for the selected French websites.
# NOTE: process_websites pauses 20 s after every website, so a run over the
# default TOTAL_NUMBER_OF_WEBSITES (1000) spends more than five hours sleeping.
create_dictionary_of_urls(fetch_french_websites())

# Validate that every archived URL contains its website's domain

In [None]:
# Sanity check: every archived URL should contain the domain it was fetched for.
with open('dictionary_of_urls_1000.json') as json_file:
    data = json.load(json_file)

# Stays True only if no mismatching snapshot is found.
valid = True

for web in data:
    for link in data[web]:
        # After the JSON round trip each entry is a [url, date] list, so the
        # domain has to be searched for inside the URL string (link[0]);
        # `web in link` would test list membership and is practically never true.
        if web.lower() not in link[0].lower():
            valid = False
            print(link, web)
            # Report only the first mismatch of each website.
            break
print(valid)