In [1]:
import datetime
import json
from waybackpy import *
import glob, os, os.path

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tranco import Tranco

In [2]:
# Labels for user reactions — presumably the possible interactions with a
# consent banner; not referenced by the cells visible here (TODO confirm usage).
user_option = ['NOTHING', 'ACCEPT', 'REJECT', 'MANAGE']
# CSS selector listing the element types scanned when looking for a banner button.
GLOBAL_SELECTOR = "a, button, div, span, form, p"
# How many of the filtered Tranco domains are kept for crawling.
NUMBER_OF_WEBSITES = 3

# NOTE(review): USER_AGENT is defined but not passed to Selenium or waybackpy
# in the visible cells — confirm whether it should be.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0"
# Inclusive date window walked when building the per-month snapshot URLs.
START_DATE_FRANCE = datetime.date(2020,7, 14)
END_DATE_FRANCE = datetime.date(2020, 12, 25)
# Step between two sampled dates. With the start date above this lands once in
# every month of the window; for other start dates a fixed 31-day step can skip
# a short month (e.g. 31 Jan + 31 days = 3 Mar).
delta = datetime.timedelta(days=31)

# Date (YYYY-MM-DD) of the Tranco ranking that is requested.
DATE_TRANCO_LIST='2022-10-01'

In [30]:
# Initializing tranco-list.eu list (downloads are cached in ./.tranco).
t = Tranco(cache=True, cache_dir='.tranco')

# Get tranco list for the specific date
date_list = t.list(date=DATE_TRANCO_LIST)

# Keep only domains under the .fr TLD. A suffix test is required here: a plain
# substring test ('.fr' in web) would also accept names such as "x.from.io".
french_websites = [web for web in date_list.list if web.endswith('.fr')]
# Kept as a separate list for compatibility with code that reads this name.
french_websites_www = list(french_websites)

# The ranking is ordered, so the slice keeps the NUMBER_OF_WEBSITES top entries.
unformatted_websites = french_websites_www[0:NUMBER_OF_WEBSITES]

# Normalise every domain to its "www." form. Despite its name, this list holds
# NUMBER_OF_WEBSITES entries, not necessarily three.
first_three_websites = []
for website in unformatted_websites:
    if not website.startswith("www."):
        website = "www." + website
    first_three_websites.append(website)

print(first_three_websites)

['www.free.fr', 'www.google.fr', 'www.amazon.fr']


In [31]:
# Convert the date format to integer value (ymd)
def to_integer(dt_time):
    """Encode a date as the integer YYYYMMDD.

    Reads the ``year``, ``month`` and ``day`` attributes of ``dt_time``;
    for example 14 July 2020 becomes 20200714.
    """
    year_part = dt_time.year * 10000
    month_part = dt_time.month * 100
    return year_part + month_part + dt_time.day

In [32]:
# Create list of urls for crawling
def waybackify_url(url, closest_timestamp):
    """Build the web.archive.org address of ``url`` for ``closest_timestamp``.

    The result has the form ``https://web.archive.org/web/<timestamp>/<url>``.
    """
    return 'https://web.archive.org/web/{}/{}'.format(closest_timestamp, url)

# Maps each website to a list of (wayback_url, (year, month)) pairs, one pair
# per sampled date between START_DATE_FRANCE and END_DATE_FRANCE (inclusive).
loist = {}

for site in first_three_websites:
    dated_urls = []
    start_date = START_DATE_FRANCE
    while not start_date > END_DATE_FRANCE:
        # The sampled date is encoded as YYYYMMDD and used as the timestamp.
        timestamp = to_integer(start_date)
        url = waybackify_url(site, timestamp)
        year_month = (start_date.year, start_date.month)
        dated_urls.append((url, year_month))
        start_date += delta
    loist[site] = dated_urls

In [33]:
# Iterate over list of months for a specific website
def go_over_months(website, max_retries=5, pause_seconds=3):
    """Collect one archived snapshot URL per sampled month for ``website``.

    Walks from START_DATE_FRANCE to END_DATE_FRANCE (inclusive) in steps of
    ``delta`` and asks the Wayback API for the snapshot nearest to each
    year/month.

    Args:
        website: Domain or URL handed to ``waybackpy.Url``.
        max_retries: Maximum number of lookups attempted per month before that
            month is skipped (values below 1 are treated as 1).
        pause_seconds: Pause after every lookup, successful or not, to avoid
            hammering the API.

    Returns:
        A list of ``(archive_url, (year, month))`` tuples. Months whose lookup
        failed ``max_retries`` times are reported on stdout and left out.
    """
    # list of urls for the website for each month
    urls_of_dates = []
    # NOTE(review): save() presumably requests a brand-new capture of the site
    # before the lookups below — confirm that this side effect is intended.
    wayback_obj = Url(website).save()

    start_date = START_DATE_FRANCE
    attempts_per_month = max(1, max_retries)

    while start_date <= END_DATE_FRANCE:
        year_month = (start_date.year, start_date.month)
        # The API fails sporadically, so retry — but only a bounded number of
        # times and with a pause, so a persistent failure cannot spin forever.
        for _attempt in range(attempts_per_month):
            try:
                archive_url = wayback_obj.near(year=start_date.year, month=start_date.month).archive_url
            except Exception as e:
                print(e)
                sleep(pause_seconds)
                continue
            urls_of_dates.append((archive_url, year_month))
            print(website, archive_url, "Success.")
            # the API is rather slow so we need to give it some time to rest
            sleep(pause_seconds)
            break
        else:
            print(website, year_month, "Skipped after", attempts_per_month, "failed attempts.")
        # go to the next month
        start_date += delta

    return urls_of_dates

In [34]:
# For each website, visit urls of each months.
def go_over_websites(websites_list):
    """Map every website in ``websites_list`` to its ``go_over_months`` result."""
    return {site: go_over_months(site) for site in websites_list}

In [35]:
# Query the Wayback API for every selected website. This cell is slow: it
# performs network calls and pauses between them.
dictory_of_urls = go_over_websites(first_three_websites)

www.free.fr https://web.archive.org/web/20170616092025/http://www2004.free.fr/ Success.
www.free.fr https://web.archive.org/web/20170616092025/http://www2004.free.fr/ Success.
www.free.fr https://web.archive.org/web/20170616092025/http://www2004.free.fr/ Success.
www.free.fr https://web.archive.org/web/20170616092025/http://www2004.free.fr/ Success.
www.free.fr https://web.archive.org/web/20170616092025/http://www2004.free.fr/ Success.
www.free.fr https://web.archive.org/web/20170616092025/http://www2004.free.fr/ Success.
www.google.fr https://web.archive.org/web/20200729230705/https://www.google.fr/ Success.
www.google.fr https://web.archive.org/web/20200830010830/http://www.google.fr/ Success.
www.google.fr https://web.archive.org/web/20200929234438/http://www.google.fr/ Success.
www.google.fr https://web.archive.org/web/20201030002813/https://www.google.fr/ Success.
www.google.fr https://web.archive.org/web/20201129235416/https://www.google.fr/ Success.
www.google.fr https://web.arc

In [40]:
def print_dictionary_of_urls(dictory_of_urls):
    """Print each website followed by its (url, date) pairs and a blank line.

    ``dictory_of_urls`` maps a website name to an iterable of two-item
    ``(url, date)`` entries.
    """
    for site in dictory_of_urls:
        print("Website: {}".format(site))
        for snapshot_url, year_month in dictory_of_urls[site]:
            print("URL: {}, Date: {}".format(snapshot_url, year_month))
        print()

In [41]:
print_dictionary_of_urls(dictory_of_urls)

# TODO: validate this list by checking whether every link for a specific
# website contains the original name of that website (starting with "www.").
# Write the check results to a separate JSON file.

Website: www.free.fr
URL: https://web.archive.org/web/20170616092025/http://www2004.free.fr/, Date: (2020, 7)
URL: https://web.archive.org/web/20170616092025/http://www2004.free.fr/, Date: (2020, 8)
URL: https://web.archive.org/web/20170616092025/http://www2004.free.fr/, Date: (2020, 9)
URL: https://web.archive.org/web/20170616092025/http://www2004.free.fr/, Date: (2020, 10)
URL: https://web.archive.org/web/20170616092025/http://www2004.free.fr/, Date: (2020, 11)
URL: https://web.archive.org/web/20170616092025/http://www2004.free.fr/, Date: (2020, 12)

Website: www.google.fr
URL: https://web.archive.org/web/20200729230705/https://www.google.fr/, Date: (2020, 7)
URL: https://web.archive.org/web/20200830010830/http://www.google.fr/, Date: (2020, 8)
URL: https://web.archive.org/web/20200929234438/http://www.google.fr/, Date: (2020, 9)
URL: https://web.archive.org/web/20201030002813/https://www.google.fr/, Date: (2020, 10)
URL: https://web.archive.org/web/20201129235416/https://www.google.

In [None]:
# Preview the (year, month) pair stored with every snapshot URL of each website.
for website in dictory_of_urls:
    website_name = website
    print(website)
    for url in dictory_of_urls[website]:
        print(url[1])

# `driver` is only created by the Selenium setup cell further down, so on a
# fresh kernel it does not exist yet. Only shut down a browser session left
# over from an earlier run instead of failing with NameError.
if "driver" in globals():
    driver.quit()

In [None]:
# Selenium Firefox setup
options = FirefoxOptions()
# options.headless = True
# The geckodriver location is machine-specific: it can be overridden with the
# GECKODRIVER_PATH environment variable and falls back to the original path.
geckodriver_path = os.environ.get(
    "GECKODRIVER_PATH", r'C:\Program Files\Mozilla Firefox\geckodriver.exe'
)
service = FirefoxService(geckodriver_path)
driver = webdriver.Firefox(service=service, options=options)

In [None]:
def main():
    """Walk the pre-built ``loist`` mapping website by website.

    Intended to collect cookies for every month and write them to JSON; at the
    moment it only prints each website together with the (year, month) pair of
    every entry.
    """
    for website, dated_urls in loist.items():
        for dated_url in dated_urls:
            print(website, dated_url[1])

In [None]:
# Always close the browser, even when main() raises, so that no browser
# session is left running after a failed crawl.
try:
    main()
finally:
    driver.quit()

In [None]:
# Unused

In [None]:
# CAUTION: destructive housekeeping — permanently deletes every file matching
# "*.JSON" directly inside `mydir` (sub-folders are not searched).
# NOTE(review): the directory is a hard-coded, machine-specific path.
mydir = r"V:\Uni\Thesis\Code\Thesis"
filelist = glob.glob(os.path.join(mydir, "*.JSON"))
for f in filelist:
    os.remove(f)

In [None]:
# Persist the website -> snapshot-URL mapping as pretty-printed JSON.
# The (year, month) tuples are serialised as JSON arrays.
urls_log_path = "{}_all_urls.txt".format("Urls")
with open(urls_log_path, mode='w') as outfile:
    json.dump(dictory_of_urls, outfile, indent=4)

In [None]:
def _load_accept_words(path):
    """Read the accept-button vocabulary: one entry per line, blanks and '#' lines ignored."""
    with open(path, "r", encoding="utf-8") as words_file:
        lines = words_file.read().splitlines()
    # Element text is lower-cased before the lookup, so the vocabulary must be
    # lower-cased too or entries containing capitals could never match.
    candidates = (line.strip().lower() for line in lines)
    return {word for word in candidates if word and not word.startswith("#")}


def click_banner(driver, wait_seconds=20, accept_words_path="accept_words_languages.txt"):
    """Click the first element whose text is a known "accept" label.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        wait_seconds: Pause before scanning, giving the page time to render.
        accept_words_path: UTF-8 text file with one accept label per line;
            empty lines and lines starting with '#' are ignored.

    Returns:
        ``{'successful': True, 'error': 'none'}`` once an element was clicked.
        ``{'successful': False, 'error': 'click_error'}`` if at least one
        matching element was found but none could be clicked.
        ``{'successful': False, 'error': 'no_accept_button'}`` if nothing matched.
    """
    accept_words_list = _load_accept_words(accept_words_path)

    sleep(wait_seconds)

    click_failed = False
    for element in driver.find_elements(By.CSS_SELECTOR, GLOBAL_SELECTOR):
        try:
            label = element.text.lower().strip(" ✓›!\n")
        except Exception:
            # Reading the text of one element failed; an unrelated element
            # must not abort the whole scan, so move on to the next one.
            continue
        if label not in accept_words_list:
            continue
        try:
            print(element.text)
            element.click()
        except Exception as e:
            # The selector also matches wrapper elements carrying the same
            # text; keep looking for a candidate that can actually be clicked.
            print("Failed", e)
            click_failed = True
            continue
        return {'successful': True, 'error': 'none'}

    if click_failed:
        return {'successful': False, 'error': 'click_error'}
    return {'successful': False, 'error': 'no_accept_button'}