In [1]:
import pandas as pd
import numpy as np
import scrapper

### Test: asynchronous link crawler (aiohttp + aiofiles)

In [9]:
import asyncio
import logging
import re
import sys
from typing import IO
import urllib.error
import urllib.parse

import aiofiles
import aiohttp
from aiohttp import ClientSession
import pathlib
import sys

# Captures the value of every double-quoted href attribute (non-greedy, so
# each match stops at the first closing quote).
HREF_RE = re.compile(r'href="(.*?)"')

# Configure the root logger: timestamped records at DEBUG and above, on stderr.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG,
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)s:%(name)s: %(message)s",
)

# Disable the chardet prober's logger (presumably to keep its DEBUG output
# out of the notebook now that the root level is DEBUG).
logging.getLogger("chardet.charsetprober").disabled = True

# Logger used by the crawler coroutines defined below.
logger = logging.getLogger("areq")

async def fetch_html(url: str, session: ClientSession, **kwargs) -> str:
    """Fetch `url` with a GET request and return the response body as text.

    Args:
        url: Address to request.
        session: Open aiohttp session used to issue the request.
        **kwargs: Forwarded unchanged to `session.request()`.

    Returns:
        The response body as decoded by `resp.text()`.

    Raises:
        aiohttp.ClientResponseError: If the response status is 400 or higher
            (raised by `resp.raise_for_status()`).
        Exception: Anything else raised by `session.request()` or
            `resp.text()` propagates to the caller unchanged.
    """
    # Using the request as an async context manager guarantees the response
    # (and its pooled connection) is released even when raise_for_status() or
    # text() raises, or when the task is cancelled while reading the body.
    async with session.request(method="GET", url=url, **kwargs) as resp:
        resp.raise_for_status()
        logger.info("Got response [%s] for URL: %s", resp.status, url)
        return await resp.text()

async def parse(url: str, session: ClientSession, **kwargs) -> set:
    """Return the set of absolute link targets found in the page at `url`.

    The page is downloaded with `fetch_html()` (which receives `session` and
    `kwargs`), every `HREF_RE` match is resolved against `url`, and the
    resulting absolute URLs are collected. Any exception raised while
    fetching is logged and an empty set is returned instead of propagating.
    """
    links = set()

    try:
        page = await fetch_html(url=url, session=session, **kwargs)
    except (
        aiohttp.ClientError,
        aiohttp.http_exceptions.HttpProcessingError,
    ) as err:
        # aiohttp errors may carry `status` / `message`; log None when absent.
        status = getattr(err, "status", None)
        message = getattr(err, "message", None)
        logger.error("aiohttp exception for %s [%s]: %s", url, status, message)
        return links
    except Exception as err:
        details = getattr(err, "__dict__", {})
        logger.exception("Non-aiohttp exception occured:  %s", details)
        return links

    # Fetch succeeded: resolve each href relative to the page it came from.
    for href in HREF_RE.findall(page):
        try:
            absolute = urllib.parse.urljoin(url, href)
        except (urllib.error.URLError, ValueError):
            # Skip hrefs that cannot be joined, but keep the traceback.
            logger.exception("Error parsing URL: %s", href)
            continue
        links.add(absolute)

    logger.info("Found %d links for %s", len(links), url)
    return links

async def write_one(file: IO, url: str, **kwargs) -> None:
    """Append the links found on `url` to `file` as tab-separated rows.

    Args:
        file: Target handed directly to `aiofiles.open()` and opened in
            append mode (the notebook passes a filesystem path).
        url: Source page to crawl; written as the first column of each row.
        **kwargs: Forwarded to `parse()`; must include `session`.

    Returns:
        None. Nothing is written when `parse()` yields no links.
    """
    res = await parse(url=url, **kwargs)
    if not res:
        return None

    # Build every row up front and issue a single write. Several write_one
    # tasks append to the same file concurrently, so one write per source URL
    # keeps that URL's rows together and avoids an awaited round-trip per link.
    rows = "".join(f"{url}\t{p}\n" for p in res)
    async with aiofiles.open(file, "a") as f:
        await f.write(rows)

    # Logged after the file is closed, i.e. once the rows have been flushed.
    logger.info("Wrote results for source URL: %s", url)

async def bulk_crawl_and_write(file: IO, urls: set, **kwargs) -> None:
    """Crawl every URL in `urls` concurrently, appending results to `file`.

    A single `ClientSession` is opened and shared by one `write_one()`
    coroutine per URL; `kwargs` are forwarded to each of them. The session
    is closed once all coroutines have finished.
    """
    async with ClientSession() as session:
        jobs = [
            write_one(file=file, url=target, session=session, **kwargs)
            for target in urls
        ]
        await asyncio.gather(*jobs)



In [16]:


assert sys.version_info >= (3, 7), "Script requires Python 3.7+."
here = pathlib.Path('./').parent

with open(here.joinpath("urls.txt")) as infile:
    urls = set(map(str.strip, infile))

outpath = here.joinpath("foundurls.txt")
with open(outpath, "w") as outfile:
    outfile.write("source_url\tparsed_url\n")

await bulk_crawl_and_write(file=outpath, urls=urls)

22:36:08 INFO:areq: Got response [200] for URL: https://1.1.1.1/
22:36:08 INFO:areq: Found 13 links for https://1.1.1.1/
22:36:08 INFO:areq: Wrote results for source URL: https://1.1.1.1/
22:36:08 INFO:areq: Got response [200] for URL: https://www.ietf.org/rfc/rfc2616.txt
22:36:08 INFO:areq: Found 0 links for https://www.ietf.org/rfc/rfc2616.txt
22:36:08 INFO:areq: Got response [200] for URL: https://regex101.com/
22:36:08 ERROR:areq: aiohttp exception for https://docs.python.org/3/this-url-will-404.html [404]: Not Found
22:36:08 INFO:areq: Found 24 links for https://regex101.com/
22:36:08 INFO:areq: Got response [200] for URL: https://www.mediamatters.org/
22:36:08 INFO:areq: Found 109 links for https://www.mediamatters.org/
22:36:08 INFO:areq: Wrote results for source URL: https://regex101.com/
22:36:08 INFO:areq: Wrote results for source URL: https://www.mediamatters.org/
22:36:08 INFO:areq: Got response [200] for URL: https://www.bloomberg.com/markets/economics
22:36:08 INFO:areq: 

In [20]:
import datetime 

In [29]:
# Show the current local time as a POSIX timestamp (float seconds since the epoch).
print(datetime.datetime.now().timestamp())

1579820609.106105
