# Pulling pages and their mementos from Arquivo.pt

1. Search Arquivo.pt for the unique URLs archived under each of the specified domains
2. For each URL that was found, look up its stored versions (mementos)

In [1]:
import requests, json, time, re

# Sites to crawl, keyed by a short identifier. For each site:
#   "url"                        - value sent as `siteSearch` to the Arquivo.pt
#                                  text search.
#   "article_verification_regex" - matches article URLs only; group 1 is the
#                                  URL without its optional "?..." query string.
# The patterns are raw strings so that `\/`, `\.` and `\?` reach the regex
# engine untouched instead of being parsed as string escape sequences
# (`\/` is an invalid one and triggers a warning on recent Python versions).
websites = {
    "noticias_ao_minuto": {
        "url": "http://noticiasaominuto.com",
        "article_verification_regex": re.compile(
            r"^(https:\/\/noticiasaominuto\.com\/[a-z0-9-%]+\/[0-9]+\/[a-z0-9-%]+)(?:\?.*)?$"
        ),
    },
    "publico": {
        "url": "http://publico.pt",
        "article_verification_regex": re.compile(
            r"^(http:\/\/publico\.pt\/[0-9]+\/[0-9]+\/[0-9]+\/[a-z0-9-%]+\/[a-z0-9-%]+\/[a-z0-9-%]+)(?:\?.*)?$"
        ),
    },
}

# Bounds of the search window, sent as the `from` / `to` query parameters.
# NOTE(review): these look like 14-digit YYYYMMDDhhmmss timestamps - confirm
# against the Arquivo.pt API documentation.
start_domain = "20180101000000"
end_domain = "20211101000000"

# Run one Arquivo.pt text search per configured site and keep the decoded JSON
# answer under the site's identifier. Results are de-duplicated by URL
# (dedupField=url, dedupValue=1) and capped at 2000 items per site.
results = {}
for identifier, website in websites.items():
    # Passing the query as `params` lets requests URL-encode every value
    # (the site URL contains ":" and "/").
    search_response = requests.get(
        "https://arquivo.pt/textsearch",
        params={
            "q": "",
            "siteSearch": website["url"],
            "maxItems": 2000,
            "from": start_domain,
            "to": end_domain,
            "dedupValue": 1,
            "dedupField": "url",
        },
    )
    # Fail here with a clear HTTP error rather than later with a confusing
    # JSON decoding error or a missing "response_items" key.
    search_response.raise_for_status()
    results[identifier] = search_response.json()

In [2]:
# Number of search hits ("response_items") returned for each site.
amount_of_pages = dict(
    (site_id, len(search_result["response_items"]))
    for site_id, search_result in results.items()
)

print(f"Amount of pages per website: {amount_of_pages}")

Amount of pages per website: {'noticias_ao_minuto': 1244, 'publico': 487}


In [3]:
def _fetch_memento_timestamps(url):
    """Return the memento timestamps the Arquivo.pt CDX API lists for *url*.

    The request is repeated every 10 seconds for as long as the API answers
    with status 429 (limit of API calls). For any other error status a message
    is printed and None is returned.
    """
    while True:
        # Let requests build the query string: `url` may itself contain "?"
        # and "&", which would otherwise be read as extra CDX parameters.
        cdx_response = requests.get(
            "https://arquivo.pt/wayback/cdx",
            params={
                "url": url,
                "from": start_domain,
                "to": end_domain,
                "output": "json",
            },
        )
        if cdx_response.ok:
            break
        if cdx_response.status_code != 429:
            print(f"Invalid url (url: {url}, status_code: {cdx_response.status_code})")
            return None
        time.sleep(10)

    # The body is text with one JSON memento object per line. Blank lines are
    # skipped rather than blindly dropping the last line, so a body without a
    # trailing newline does not lose its final memento.
    return [
        json.loads(line)["timestamp"]
        for line in cdx_response.text.split("\n")
        if line.strip()
    ]


# Per-site count of mementos collected below.
total_amount_of_pages = {identifier: 0 for identifier in websites}

# output[site identifier][article URL without query string] -> list of timestamps
output = {}

# TODO add progress bar?
for identifier, result in results.items():
    print(f"---------- {identifier}")
    article_regex = websites[identifier]["article_verification_regex"]
    site_output = output[identifier] = {}

    for item in result["response_items"]:
        url = item["originalURL"]

        match = article_regex.match(url)
        if not match:
            print(f"NOT an article: {url}")
            continue

        # Group 1 is the URL without GET parameters; variants of the same
        # article that differ only in their query string share one entry.
        # The entry is created before the lookup, so a URL whose lookup fails
        # still appears in the output with an empty list.
        article_timestamps = site_output.setdefault(match.group(1), [])

        timestamps = _fetch_memento_timestamps(url)
        if timestamps is None:
            continue

        article_timestamps.extend(timestamps)
        total_amount_of_pages[identifier] += len(timestamps)

print("Amount of pages (including mementos): " + str(total_amount_of_pages))

with open("data.json", "w") as file:
    json.dump(output, file, indent=4)

---------- noticias_ao_minuto
NOT an article: https://noticiasaominuto.com/
NOT an article: https://noticiasaominuto.com/robots.txt
NOT an article: https://noticiasaominuto.com/manifest.json
---------- publico
NOT an article: http://publico.pt/
NOT an article: http://publico.pt/troika
NOT an article: http://publico.pt/2019/08/01/ciencia/entrevista/quantidade-emissoes-gases-estufa-permitimos-aviacao-absurda-1881906/embed?FromApp=1
NOT an article: http://publico.pt/a-europa-que-conta-podcast
NOT an article: http://publico.pt/acores/vila-franca-do-campo
NOT an article: http://publico.pt/autor/micael-sousa
NOT an article: http://publico.pt/aveiro/santa-maria-da-feira
NOT an article: http://publico.pt/aveiro/sever-do-vouga
NOT an article: http://publico.pt/aveiro/vagos
NOT an article: http://publico.pt/braga/vizela
NOT an article: http://publico.pt/europeias-2019/resultados
NOT an article: http://publico.pt/p3/vicios
NOT an article: http://publico.pt/roman-polanski
NOT an article: http://pu