In [2]:
import requests
from data import DECLARATIONS_DIR, SNAPSHOTS_DIR

In [3]:
import json

# Sort both listings: glob order depends on the filesystem, and the positional
# source ids assigned further down (as well as tie-breaking between equally
# good snapshot matches) would otherwise differ between machines.
# NOTE(review): a scrape cache produced under a different file ordering is
# keyed by stale ids and should be regenerated.
declaration_files = sorted(DECLARATIONS_DIR.glob("declarations/*.json"))
snapshot_files = sorted(SNAPSHOTS_DIR.glob("**/*.html"))

# Parse every declaration file once, reusing the listing built above.
declarations = []
for declaration_file in declaration_files:
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(declaration_file, "r", encoding="utf-8") as f:
        declarations.append(json.load(f))


# Fix the faulty X/Twitter API v1 declaration: every top-level key other than
# "name" and "documents" is moved under "documents", and the misspelled
# "Developper Utilities" key is corrected on the way.
for declaration in declarations:
    if declaration.get("name") != "X_Twitter-API-V1":
        continue
    # Collect the stray keys in file order *before* mutating the dict. A set
    # difference would iterate in a hash-dependent order that changes between
    # interpreter runs, reshuffling the documents and every id derived from
    # their position.
    extra_keys = [key for key in declaration if key not in ("name", "documents")]
    for key in extra_keys:
        corrected_key = "Developer Utilities" if key == "Developper Utilities" else key
        declaration["documents"][corrected_key] = declaration.pop(key)

In [4]:
from uuid import uuid4

def process_declaration(declaration):
    """Flatten one declaration into a list of source records.

    Each entry of ``declaration["documents"]`` is either a single fetch spec
    (a dict with a ``"fetch"`` URL) or a dict whose ``"combine"`` key holds a
    list of such specs. One record is produced per fetch URL.

    Returns:
        A list of ``{"name", "document", "url"}`` dicts, in document order.

    Raises:
        KeyError: if "name", "documents" or a spec's "fetch" key is missing.
    """
    service = declaration["name"]
    records = []
    for document, spec in declaration["documents"].items():
        # A plain spec is treated as a one-element "combine" list.
        fetch_specs = spec["combine"] if "combine" in spec else [spec]
        records.extend(
            {"name": service, "document": document, "url": part["fetch"]}
            for part in fetch_specs
        )
    return records

# Flatten all declarations into a single list of source records.
sources = [record for d in declarations for record in process_declaration(d)]

# The id of a source is simply its position in this list.
for index, record in enumerate(sources):
    record["id"] = index

print("num sources:", len(sources))
print(sources[500])

num sources: 1085
{'name': 'Facebook_Graph-API', 'document': 'Canvas Endpoint', 'url': 'https://developers.facebook.com/docs/graph-api/reference/canvas-footer/', 'id': 500}


In [7]:
def match_snapshot(source, candidates=None):
    """Find the snapshot file that best matches a source record.

    A candidate qualifies when its parent directory is named after the
    source's service (``source["name"]``), its file name starts with
    ``"<document> #"``, and every ``-``-separated word after the last ``#``
    of its stem occurs as a substring of ``source["url"]``. Among qualifying
    candidates the one with the most words wins; on a tie the earliest
    candidate is kept.

    Args:
        source: dict with "name", "document" and "url" keys.
        candidates: iterable of path objects to search. Defaults to the
            module-level ``snapshot_files`` list.

    Returns:
        The best matching path, or None when nothing qualifies.
    """
    if candidates is None:
        candidates = snapshot_files

    prefix = f"{source['document']} #"
    url = source["url"]
    best, best_len = None, 0
    for path in candidates:
        parts = path.parts
        if parts[-2] != source["name"] or not parts[-1].startswith(prefix):
            continue
        words = path.stem.split("#")[-1].split("-")
        # Strict ">" keeps the first of several equally long matches.
        if len(words) > best_len and all(word in url for word in words):
            best, best_len = path, len(words)
    return best

# Attach a snapshot file to every source that has one. Each snapshot is handed
# out at most once: a source whose best match was already taken by an earlier
# source is silently left without a "snapshot" key, while sources with no
# match at all are printed.
s = set()
for source in sources:
    snapshot = match_snapshot(source)
    if snapshot is None:
        print(source, snapshot)
    elif snapshot not in s:
        s.add(snapshot)
        source["snapshot"] = snapshot
match_snapshot(sources[500])

{'name': 'Google_Transparency-Report', 'document': 'Transparency Report', 'url': 'https://storage.googleapis.com/transparencyreport/report-downloads/pdf-report-27_2023-8-28_2023-9-10_en_v1.pdf', 'id': 13} None
{'name': 'X_Twitter-API-V1', 'document': 'Developer Utilities', 'url': 'https://developer.twitter.com/en/docs/twitter-api/v1/developer-utilities/rate-limit-status/overview', 'id': 188} None
{'name': 'X_Twitter-API-V1', 'document': 'Developer Utilities', 'url': 'https://developer.twitter.com/en/docs/twitter-api/v1/developer-utilities/rate-limit-status/api-reference/get-application-rate_limit_status', 'id': 189} None
{'name': 'X_Twitter-API-V1', 'document': 'Developer Utilities', 'url': 'https://developer.twitter.com/en/docs/twitter-api/v1/developer-utilities/supported-languages/overview', 'id': 190} None
{'name': 'X_Twitter-API-V1', 'document': 'Developer Utilities', 'url': 'https://developer.twitter.com/en/docs/twitter-api/v1/developer-utilities/supported-languages/api-reference/

PosixPath('/Users/eelco/dev/Hackathon-DSA/data/platform-docs-snapshots/Facebook_Graph-API/Canvas Endpoint #docs-graph-api-reference-canvas-footer.html')

In [6]:
# Size of `s`, the set of snapshot files filled by the matching loop, i.e. the
# number of distinct snapshots that were assigned to a source.
len(s)

839

# Postlight Scraper
Postlight Parser is an open-source parser that extracts the important content from a URL. See https://github.com/postlight/parser

We host the Postlight parser API locally and use it to scrape every URL listed in the declarations repo.


1. Install npm and yarn: https://classic.yarnpkg.com/lang/en/docs/install/#debian-stable
2. Run the postlight parser API
```
git clone https://github.com/postlight/parser-api.git
cd parser-api
yarn install
yarn serve
```

In [5]:
from typing import Dict

import requests

POSTLIGHT_API_URL = "http://localhost:3000"

def scrape_with_postlight(url: str, timeout: float = 60.0) -> Dict[str, object]:
    """Ask the locally hosted Postlight parser API to parse `url`.

    Args:
        url: address of the page to parse.
        timeout: seconds to wait for the parser API before giving up. Without
            a timeout, requests waits indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body of the response. Values are not all strings:
        a failed parse comes back as a payload such as
        ``{'error': True, 'message': ..., 'failed': True}``.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
        ValueError: if the response body is not valid JSON.
    """
    response = requests.get(
        POSTLIGHT_API_URL + "/parser", params={"url": url}, timeout=timeout
    )
    return response.json()


In [7]:
from data import DATA_DIR

# Cache file for the Postlight scrape results; when it exists, the scrape is
# skipped and the results are loaded from it instead.
postlight_scraped_file = DATA_DIR / "postlight_scraped.json"

In [9]:
from tqdm.notebook import tqdm
import time

# Scrape every source once and cache the results on disk; later runs load the
# cache instead of hitting the parser again.
if not postlight_scraped_file.is_file():
    scraped = {}

    for source in tqdm(sources):
        time.sleep(0.1)  # short pause between consecutive parser requests
        try:
            # Named `parsed` (not `s`) so the module-level snapshot set `s`
            # is not overwritten by this cell.
            parsed = scrape_with_postlight(source["url"])
            scraped[source["id"]] = parsed
        except Exception:
            # Best effort: report the failure and continue with the next URL.
            print(f"could not parse {source['url']}")

    with open(postlight_scraped_file, "w", encoding="utf-8") as f:
        json.dump(scraped, f)
else:
    with open(postlight_scraped_file, "r", encoding="utf-8") as f:
        # JSON object keys are always strings, so the integer source ids come
        # back as "0", "1", ... Convert them to int again so that lookups
        # such as scraped[900] behave the same for cached and fresh results.
        scraped = {
            int(source_id): result for source_id, result in json.load(f).items()
        }

In [16]:
# Print every scrape result that has no "content" field (for example the error
# payloads returned by the parser).
# TODO: very short documents (e.g. len(content) < 100) probably did not parse
# properly either; inspect those as well.
# The loop variable is deliberately not called `s`, which is the name of the
# snapshot set built earlier in the notebook.
for scrape_result in scraped.values():
    if "content" not in scrape_result:
        print(scrape_result)

{'error': True, 'message': 'Resource returned a response status code of 500 and resource was instructed to reject non-200 status codes.', 'failed': True}


In [127]:
!pip install pypandoc

Collecting pypandoc
  Downloading pypandoc-1.12-py3-none-any.whl (20 kB)
Installing collected packages: pypandoc
Successfully installed pypandoc-1.12

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [139]:
from bs4 import BeautifulSoup

def html_to_plaintext(html: str) -> str:
    """Return the text of every <p> element in `html`, joined by blank lines.

    Text that is not inside a <p> element is dropped.
    """
    document = BeautifulSoup(html, features="lxml")
    paragraphs = (paragraph.get_text() for paragraph in document.find_all("p"))
    return "\n\n".join(paragraphs)
    
def convert_html_to_markdown(html: str) -> str:
    """Convert an HTML string to Markdown with Pandoc (through pypandoc).

    Args:
        html: HTML markup to convert.

    Returns:
        The Markdown text produced by Pandoc.
    """
    # Local import: the function otherwise relies on `pypandoc` having been
    # imported by some earlier cell and fails with NameError on a fresh kernel.
    import pypandoc

    return pypandoc.convert_text(html, 'md', format='html')


In [143]:
# Spot-check the Markdown conversion on a single scraped document.
# NOTE(review): 900 looks like an arbitrary sample id; this assumes `scraped`
# is keyed by integer source ids — confirm.
print(convert_html_to_markdown(scraped[900]["content"]))

::: is-table-default
If you have been working with the v1.1
[statuses/sample](/content/developer-twitter/en/docs/twitter-api/v1/tweets/sample-realtime/overview/get_statuses_sample)
endpoint, the goal of this guide is to help you understand the
similarities and differences between the standard and Twitter API v2
sample stream endpoints.

-   **Differences**
    -   Endpoint URLs
    -   App and Project requirements
    -   Authentication method
    -   Response data format
    -   Request parameters
    -   Availability of recovery and redundancy features

### Differences 

#### Endpoint URLs

-   Standard v1.1 endpoints:
    -   [https://stream.twitter.com/1.1/statuses/sample]{.code-inline}
-   Twitter API v2 endpoint:
    -   [https://api.twitter.com/2/tweets/sample/stream]{.code-inline}

#### App and Project requirements

The Twitter API v2 endpoints require that you use credentials from
a [developer
App](https://aem-staging.twitter.biz/content/developer-twitter/en/docs/apps.html) th

In [144]:
from IPython.display import HTML

# Render the scraped HTML of the same sample document (id 900) inline, for
# comparison with its Markdown conversion.
display(HTML(scraped[900]["content"]))

Standard v1.1 parameter,Details
Delimited,"With the v1.1 endpoint, setting this to the string length indicates that statuses should be delimited in the stream, so that clients know how many bytes to read before the end of the status message. This functionality is not available with Twitter API v2"
Stall_warnings,"With the v1.1 endpoint, setting this parameter to the string true will cause periodic messages to be delivered if the client is in danger of being disconnected. With Twitter API v2, stall warnings are sent by default with the new line sent every so often."
