# About

In this notebook we try to collect data from various sources, combining different methods and approaches. This is experimental.

In [1]:
import cudf
import gzip
import requests
import json

from io import BytesIO
from urllib.parse import quote_plus
from warcio.archiveiterator import ArchiveIterator

In [14]:
# Base URL of the Common Crawl Index server. HTTPS is used so index queries
# are not sent over plaintext HTTP. Keep the trailing slash: the index name
# is appended directly to this string when the query URL is built.
SERVER = 'https://index.commoncrawl.org/'

# The Common Crawl index you want to query
INDEX_NAME = 'CC-MAIN-2023-50'      # Replace with the latest index name

In [15]:
# URL to look up in the index; replace it with your own target
target_url = "bundestag.de"

In [16]:
# Identify this client with a descriptive User-Agent string. Doing so is
# advisable for your own applications and follows the conventions outlined
# in RFC 7231. A simple one is enough here:
myagent = "cc-get-started/1.0 (Example data retrieval script; info@bundestag-mine.de)"

In [17]:
# Function to search the Common Crawl Index
def search_cc_index(url, timeout=30):
    """Query the Common Crawl index server for records matching ``url``.

    The response body is printed in full, then parsed as one JSON document
    per line.

    Args:
        url: URL to look up. It is URL-encoded with ``quote_plus`` before
            being placed in the query string.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Defaults to 30.

    Returns:
        A list with one parsed JSON object per non-blank response line when
        the server answers with HTTP 200 (an empty list if the body holds no
        lines), or ``None`` for any other status code.

    Raises:
        json.JSONDecodeError: If a non-blank line of a 200 response is not
            valid JSON.
    """
    encoded_url = quote_plus(url)
    index_url = f'{SERVER}{INDEX_NAME}-index?url={encoded_url}&output=json'
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(
        index_url,
        headers={'user-agent': myagent},
        timeout=timeout,
    )
    print("Response from server:\r\n", response.text)
    if response.status_code != 200:
        return None

    # Skip blank lines so an empty body or a stray newline yields no record
    # instead of crashing json.loads on an empty string.
    return [
        json.loads(line)
        for line in response.text.split('\n')
        if line.strip()
    ]

In [18]:
# Function to fetch content from Common Crawl
def fetch_page_from_cc(records, timeout=60):
    """Fetch the content of the first WARC ``response`` record among ``records``.

    For each index record, the byte range given by its offset and length is
    requested from ``https://data.commoncrawl.org/`` and the returned WARC
    data is scanned for a record of type ``'response'``.

    Args:
        records: Iterable of index records (mappings) providing the keys
            ``'offset'``, ``'length'`` and ``'filename'``.
        timeout: Seconds to wait for the data server; passed straight to
            ``requests.get``. Defaults to 60.

    Returns:
        The bytes read from the content stream of the first ``'response'``
        WARC record found. ``None`` if a request is answered with anything
        other than HTTP 206 (remaining records are then not tried), or if no
        record yields a ``'response'`` entry.
    """
    for record in records:
        offset, length = int(record['offset']), int(record['length'])
        s3_url = f'https://data.commoncrawl.org/{record["filename"]}'

        # HTTP byte ranges are inclusive on both ends, hence the -1
        byte_range = f'bytes={offset}-{offset+length-1}'

        # `stream=True` gives access to the raw byte stream, which is needed
        # because the data is gzip compressed. A streamed response keeps its
        # connection open until it is closed, so the `with` block makes sure
        # it is released on every exit path, including the early returns.
        with requests.get(
            s3_url,
            headers={'user-agent': myagent, 'Range': byte_range},
            stream=True,
            timeout=timeout,
        ) as response:
            if response.status_code != 206:
                print(f"Failed to fetch data: {response.status_code}")
                return None

            # `ArchiveIterator` is built directly on `response.raw` and
            # handles the gzipped WARC content
            for warc_record in ArchiveIterator(response.raw):
                if warc_record.rec_type == 'response':
                    # The body is read in full before the `with` block
                    # closes the underlying connection.
                    return warc_record.content_stream().read()

    print("No valid WARC record found in the given records")
    return None

In [19]:
# Look the target URL up in the Common Crawl index
records = search_cc_index(target_url)
if not records:
    print(f"No records found for {target_url}")
else:
    print(f"Found {len(records)} records for {target_url}")

    # Hand all index records to the fetcher and keep whatever page content
    # it returns
    content = fetch_page_from_cc(records)
    if content:
        print(f"Successfully fetched content for {target_url}")
        print(content)
        # From here on, 'content' can be processed as needed,
        # e.g. with something like Beautiful Soup

Response from server:
 {"urlkey": "de,bundestag)/", "timestamp": "20231130200123", "url": "https://www.bundestag.de/", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "KFAJ23C7T3E2VYJR6PI43CEM56JVSLGS", "length": "36632", "offset": "751532779", "filename": "crawl-data/CC-MAIN-2023-50/segments/1700679100232.63/warc/CC-MAIN-20231130193829-20231130223829-00584.warc.gz", "languages": "deu,ltz", "encoding": "UTF-8"}
{"urlkey": "de,bundestag)/", "timestamp": "20231201232215", "url": "http://www.bundestag.de", "mime": "unk", "mime-detected": "application/octet-stream", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "451", "offset": "330930", "filename": "crawl-data/CC-MAIN-2023-50/segments/1700679100308.37/robotstxt/CC-MAIN-20231201215122-20231202005122-00240.warc.gz", "redirect": "https://www.bundestag.de/"}
{"urlkey": "de,bundestag)/", "timestamp": "20231201232216", "url": "https://www.bundestag.de/", "mime": "text/html", "mime-detecte