In [1]:
import pandas as pd
import numpy as np
import scrapper
import re

In [4]:
%load_ext autoreload
%autoreload 2

### Test

In [None]:
import asyncio
import logging
import re
import sys
from typing import IO
import urllib.error
import urllib.parse

import aiofiles
import aiohttp
from aiohttp import ClientSession
import pathlib
import sys

# Send records of level DEBUG and above to stderr, prefixed with an HH:MM:SS timestamp,
# the level name and the logger name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(name)s: %(message)s",
    level=logging.DEBUG,
    datefmt="%H:%M:%S",
    stream=sys.stderr,
)
# Logger used by the crawl helpers defined further down in this cell.
logger = logging.getLogger("areq")
# Switch the "chardet.charsetprober" logger off entirely.
logging.getLogger("chardet.charsetprober").disabled = True

# Captures the value of every double-quoted href="..." attribute (non-greedy match).
HREF_RE = re.compile(r'href="(.*?)"')

async def fetch_html(url: str, session: ClientSession, **kwargs) -> str:
    """GET `url` through `session` and return the response body as text.

    The request runs inside an ``async with`` block so the response is
    released when the block exits, including when the status check or the
    body read/decode raises.

    Args:
        url: Address to request.
        session: Session object used to issue the request.
        **kwargs: Passed through unchanged to `session.request()`.

    Returns:
        The response body as returned by `resp.text()`.

    Raises:
        Whatever `session.request()`, `resp.raise_for_status()` or
        `resp.text()` raise; nothing is caught here.
    """
    async with session.request(method="GET", url=url, **kwargs) as resp:
        # Fail before reading the body when the server answered with an error status.
        resp.raise_for_status()
        logger.info("Got response [%s] for URL: %s", resp.status, url)
        return await resp.text()

async def parse(url: str, session: ClientSession, **kwargs) -> set:
    """Return the set of absolute links referenced by the page at `url`.

    The page is downloaded with `fetch_html()`; every match of `HREF_RE` in
    the HTML is resolved against `url` with `urllib.parse.urljoin`. Any
    exception raised while fetching is logged and an empty set is returned
    instead of propagating the error.
    """
    links = set()
    try:
        page = await fetch_html(url=url, session=session, **kwargs)
    except (
        aiohttp.ClientError,
        aiohttp.http_exceptions.HttpProcessingError,
    ) as exc:
        # Not every aiohttp exception carries status/message, hence getattr with a default.
        logger.error(
            "aiohttp exception for %s [%s]: %s",
            url,
            getattr(exc, "status", None),
            getattr(exc, "message", None),
        )
        return links
    except Exception as exc:
        logger.exception(
            "Non-aiohttp exception occured:  %s", getattr(exc, "__dict__", {})
        )
        return links

    for href in HREF_RE.findall(page):
        try:
            absolute = urllib.parse.urljoin(url, href)
        except (urllib.error.URLError, ValueError):
            # Log the offending link and carry on with the remaining ones.
            logger.exception("Error parsing URL: %s", href)
            continue
        links.add(absolute)
    logger.info("Found %d links for %s", len(links), url)
    return links

async def write_one(file: IO, url: str, **kwargs) -> None:
    """Append the links found on `url` to `file` as tab-separated lines.

    Args:
        file: Target handed straight to `aiofiles.open(file, "a")`, i.e. it
            is used as a path rather than as an already-open handle.
        url: Page to crawl; also written as the first column of each line.
        **kwargs: Forwarded to `parse()`.

    Returns:
        None. Nothing is written when `parse()` yields no links.
    """
    res = await parse(url=url, **kwargs)
    if not res:
        return None
    # Several write_one() tasks append to the same file concurrently. Building the
    # whole payload first and issuing a single write keeps one URL's lines together
    # and avoids one awaited write call per link.
    payload = "".join(f"{url}\t{p}\n" for p in res)
    async with aiofiles.open(file, "a") as f:
        await f.write(payload)
    logger.info("Wrote results for source URL: %s", url)

async def bulk_crawl_and_write(file: IO, urls: set, **kwargs) -> None:
    """Run `write_one()` for every entry of `urls` concurrently.

    A single `ClientSession` is opened and shared by all the coroutines;
    `file` and any extra keyword arguments are forwarded to each
    `write_one()` call.
    """
    async with ClientSession() as session:
        jobs = [
            write_one(file=file, url=target, session=session, **kwargs)
            for target in urls
        ]
        # Wait until every crawl-and-write coroutine has finished.
        await asyncio.gather(*jobs)



In [None]:


assert sys.version_info >= (3, 7), "Script requires Python 3.7+."
here = pathlib.Path('./').parent

with open(here.joinpath("urls.txt")) as infile:
    urls = set(map(str.strip, infile))

outpath = here.joinpath("foundurls.txt")
with open(outpath, "w") as outfile:
    outfile.write("source_url\tparsed_url\n")

await bulk_crawl_and_write(file=outpath, urls=urls)

In [32]:
# Let's test the domain function.
# `urls` starts as an empty tuple and is then replaced by the lines of urls.txt
# (a list with one entry per line, line endings removed).
urls=()
with open('./urls.txt','r') as f:
    urls = f.read().splitlines()

In [33]:
urls

['https://regex101.com/',
 'https://docs.python.org/3/this-url-will-404.html',
 'https://www.nytimes.com/guides/',
 'https://www.mediamatters.org/',
 'https://1.1.1.1/',
 'https://www.politico.com/tipsheets/morning-money',
 'https://www.bloomberg.com/markets/economics',
 'https://www.ietf.org/rfc/rfc2616.txt']

In [34]:
test = scrapper.extract_domain(urls)

In [35]:
test

['regex101.com']

In [38]:
# Captures the host part of a URL, skipping an optional scheme, an optional
# "user@" prefix and an optional leading "www.".
# re.MULTILINE lets `^` match at the start of every line; without it, findall() on
# the newline-joined string could only ever match the very first URL.
regex = re.compile(
    r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)",
    re.MULTILINE,
)
d = regex.findall('\n'.join(urls))

In [42]:
# NOTE(review): overwrites the first entry of `urls` in place with an empty string,
# presumably to exercise the "no match" branch of the loop in the next cell — confirm.
# The list stays modified until the cell that loads urls.txt is run again.
urls[0] =''

In [47]:
# Collect the first regex match of every URL into a set; URLs with no match are skipped.
caca = set()
for url in urls:
    d = regex.findall(url)
    if not d:
        continue
    caca.add(d[0])

In [56]:
# The request logged for this call is /search?q=SHARETHELOVE*+LUKO&num=100&hl=en, so the
# three arguments end up as the `q`, `num` and `hl` query parameters respectively.
keyword, html = scrapper.get_google_search_results('SHARETHELOVE*+LUKO', 100, 'en')
# NOTE(review): judging by the .head() output further down, the parsed result is a table
# with keyword, rank, title, link and domain columns — confirm against scrapper.parse_results.
found_results = scrapper.parse_results(html, keyword)

22:47:45 DEBUG:urllib3.connectionpool: Starting new HTTPS connection (1): www.google.com:443
22:47:45 DEBUG:urllib3.connectionpool: https://www.google.com:443 "GET /search?q=SHARETHELOVE*+LUKO&num=100&hl=en HTTP/1.1" 200 None
22:47:45 INFO:luko_scrapper: Got response [200] for URL: https://www.google.com/search?q=SHARETHELOVE*+LUKO&num=100&hl=en


In [57]:
found_results.head()

Unnamed: 0,keyword,rank,title,link,domain
0,SHARETHELOVE*+LUKO,1,Code promo Luko Parrainage 10€ de réduction,https://www.1parrainage.com/offre_parrainage_L...,1parrainage.com
1,SHARETHELOVE*+LUKO,2,Avis et Témoignages clients | Luko,https://www.luko.eu/fr/page/avis-clients-temoi...,luko.eu
2,SHARETHELOVE*+LUKO,3,Luko - Avis | Facebook,https://fr-fr.facebook.com/getluko/reviews/,fr-fr.facebook.com
3,SHARETHELOVE*+LUKO,4,Luko - Reviews | Facebook,https://www.facebook.com/getluko/reviews/,facebook.com
4,SHARETHELOVE*+LUKO,5,Luko - Codes Promo et Parrainages,https://mes-parrainages.fr/luko,mes-parrainages.fr


In [67]:
# Keep a single link per domain: the first one found in each group.
urls = (
    found_results
    .groupby('domain')['link']
    .first()
    .values
)

In [68]:
urls

array(['https://www.1parrainage.com/offre_parrainage_Luko.php',
       'https://99designs.ca/product-packaging-design/contests/big-silk-needs-trendy-luxurious-packaging-942371',
       'https://www.appannie.com/fr/apps/google-play/app/com.getluko.cover.app/',
       'https://www.assurland.com/assurance-blog/assurance-habitation-actualite/habitation-l-assurtech-luko-vient-de-lever-2-millions-d-euros_131204.html',
       'https://www.bestadsontv.com/profile/176542/Luke-OReilly',
       'http://buildingtomorrowsgeneration.org/mzm/crimewatch-pa-bucks-county.html',
       'https://www.dealabs.com/codes-promo/code-vivatech-sur-lassurance-luko-1605353',
       'https://dribbble.com/madacs_a/buckets/409581-cards-buttons-selectors',
       'https://eyeknowhiphop.wordpress.com/tag/luko/',
       'https://www.facebook.com/getluko/reviews/',
       'https://fr-fr.facebook.com/getluko/reviews/',
       'http://gabukass.blogas.lt/28-skyrius-jausmai-kile-is-niekur-317.html',
       'http://hkcitylife