In [1]:
import pandas as pd
import numpy as np
import scrapper
import re
import asyncio
import logging
import re
import sys
from typing import IO
import urllib.error
import urllib.parse

import aiofiles
import aiohttp
from aiohttp import ClientSession
import pathlib
import sys

In [2]:
%load_ext autoreload
%autoreload 2

### Test

In [3]:


# Root logging configuration: timestamped records at DEBUG level and above,
# written to stderr.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG,
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)s:%(name)s: %(message)s",
)

# Logger used by the crawler coroutines defined below.
logger = logging.getLogger("areq")

# Switch the charset prober's logger off entirely.
logging.getLogger("chardet.charsetprober").disabled = True

# Non-greedy capture of the text between the double quotes of href="...".
HREF_RE = re.compile(r'href="(.*?)"')

async def fetch_html(url: str, session: ClientSession, **kwargs) -> str:
    """Fetch `url` with a GET request and return the response body as text.

    The request is issued inside ``async with`` so the response is released
    on every exit path, including when ``raise_for_status()`` raises or the
    body cannot be read.

    Args:
        url: Address to request.
        session: Client session used to issue the request.
        **kwargs: Passed through unchanged to ``session.request()``.

    Returns:
        The response body as returned by ``resp.text()``.

    Raises:
        aiohttp.ClientResponseError: Raised by ``raise_for_status()`` for
            HTTP status codes of 400 and above.
    """
    async with session.request(method="GET", url=url, **kwargs) as resp:
        resp.raise_for_status()
        logger.info("Got response [%s] for URL: %s", resp.status, url)
        return await resp.text()

async def parse(url: str, session: ClientSession, **kwargs) -> set:
    """Collect the absolute link targets found in the HTML of `url`.

    The page is downloaded with `fetch_html`; every ``href="..."`` value
    matched by `HREF_RE` is entity-decoded and resolved against `url`.

    Args:
        url: Page to download and scan.
        session: Client session forwarded to `fetch_html`.
        **kwargs: Forwarded to `fetch_html`.

    Returns:
        A set of absolute URLs. The set is empty when the download fails:
        download errors are logged here and never propagated to the caller.
    """
    # Local import so this block does not depend on the file-level import cell.
    from html import unescape

    found = set()
    try:
        page = await fetch_html(url=url, session=session, **kwargs)
    except (
        aiohttp.ClientError,
        aiohttp.http_exceptions.HttpProcessingError,
    ) as e:
        logger.error(
            "aiohttp exception for %s [%s]: %s",
            url,
            getattr(e, "status", None),
            getattr(e, "message", None),
        )
        return found
    except Exception as e:
        # Deliberate best-effort: one failing page must not abort the batch.
        logger.exception(
            "Non-aiohttp exception occured:  %s", getattr(e, "__dict__", {})
        )
        return found

    for link in HREF_RE.findall(page):
        try:
            # Attribute values are HTML-escaped in page source ("&amp;" stands
            # for "&"); decode them so the stored URL is the real target.
            abslink = urllib.parse.urljoin(url, unescape(link))
        except (urllib.error.URLError, ValueError):
            logger.exception("Error parsing URL: %s", link)
        else:
            found.add(abslink)
    logger.info("Found %d links for %s", len(found), url)
    return found

async def write_one(file: IO, url: str, **kwargs) -> None:
    """Append the links found at `url` to `file`, one tab-separated row each.

    Each row has the form ``<url>\\t<link>\\n``. Nothing is written (and the
    file is not opened) when `parse` returns no links.

    Args:
        file: Target handed to ``aiofiles.open(file, "a")``; the notebook
            passes a filesystem path here despite the ``IO`` annotation.
        url: Source page whose links are written.
        **kwargs: Forwarded to `parse` (must include ``session``).
    """
    res = await parse(url=url, **kwargs)
    if not res:
        return None
    # Build all rows first and issue a single awaited write instead of one
    # per link.
    # NOTE(review): several write_one coroutines append to the same file via
    # bulk_crawl_and_write; a single write presumably also keeps the rows of
    # one source URL together — confirm if row grouping matters.
    rows = "".join(f"{url}\t{p}\n" for p in res)
    async with aiofiles.open(file, "a") as f:
        await f.write(rows)
        logger.info("Wrote results for source URL: %s", url)

async def bulk_crawl_and_write(file: IO, urls: set, **kwargs) -> None:
    """Run `write_one` concurrently for every entry of `urls`.

    A single `ClientSession` is opened for the whole batch and shared by all
    the coroutines; it is closed once `asyncio.gather` has finished.

    Args:
        file: Output target forwarded to each `write_one` call.
        urls: Source URLs to crawl.
        **kwargs: Extra keyword arguments forwarded to each `write_one` call.
    """
    async with ClientSession() as session:
        pending = [
            write_one(file=file, url=target, session=session, **kwargs)
            for target in urls
        ]
        await asyncio.gather(*pending)



In [None]:


assert sys.version_info >= (3, 7), "Script requires Python 3.7+."
here = pathlib.Path('./').parent

with open(here.joinpath("urls.txt")) as infile:
    urls = set(map(str.strip, infile))

outpath = here.joinpath("foundurls.txt")
with open(outpath, "w") as outfile:
    outfile.write("source_url\tparsed_url\n")

await bulk_crawl_and_write(file=outpath, urls=urls)

In [32]:
# Load the sample URLs (one per line) used to test the domain function.
with open('./urls.txt','r') as f:
    urls = f.read().splitlines()

In [33]:
urls

['https://regex101.com/',
 'https://docs.python.org/3/this-url-will-404.html',
 'https://www.nytimes.com/guides/',
 'https://www.mediamatters.org/',
 'https://1.1.1.1/',
 'https://www.politico.com/tipsheets/morning-money',
 'https://www.bloomberg.com/markets/economics',
 'https://www.ietf.org/rfc/rfc2616.txt']

In [34]:
# Run the project's domain extractor over the sample URL list.
# NOTE(review): the recorded output shows a single domain for eight input
# URLs — confirm whether that is the intended result.
test = scrapper.extract_domain(urls)

In [35]:
test

['regex101.com']

In [38]:
# Domain extractor: optional scheme, optional "user@", optional "www.", then
# capture everything up to the first ":", "/", "?" or newline.
# re.MULTILINE makes "^" anchor at the start of every line, so running the
# pattern over the newline-joined URL list yields one domain per URL instead
# of only the first one. Single-URL inputs behave exactly as before.
regex = re.compile(
    r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)",
    re.MULTILINE,
)
d = regex.findall('\n'.join(urls))

In [42]:
# Overwrite the first entry of the list with an empty string, in place.
# NOTE(review): presumably done to see how the loop in the following cell
# copes with an empty URL — confirm; re-running earlier cells restores it.
urls[0] =''

In [47]:
# Gather the distinct first matches of `regex` over all URLs.
caca = set()
for url in urls:
    d = regex.findall(url)
    # d[:1] is empty when nothing matched, so update() then adds nothing.
    caca.update(d[:1])

In [3]:
# Run a Google search through the scrapper helper, then parse the returned
# page into `found_results`.
# NOTE(review): 100 and 'en' presumably map to the `num` and `hl` query
# parameters visible in the logged request URL — confirm against scrapper.
keyword, html = scrapper.get_google_search_results('SHARETHELOVE*+LUKO', 100, 'en')
found_results = scrapper.parse_results(html, keyword)

23:11:49 DEBUG:urllib3.connectionpool: Starting new HTTPS connection (1): www.google.com:443
23:11:49 DEBUG:urllib3.connectionpool: https://www.google.com:443 "GET /search?q=SHARETHELOVE*+LUKO&num=100&hl=en HTTP/1.1" 200 None
23:11:50 INFO:luko_scrapper: Got response [200] for URL: https://www.google.com/search?q=SHARETHELOVE*+LUKO&num=100&hl=en


In [4]:
found_results.head()

Unnamed: 0,keyword,rank,title,link,domain
0,SHARETHELOVE*+LUKO,1,Code promo Luko Parrainage 10€ de réduction,https://www.1parrainage.com/offre_parrainage_L...,1parrainage.com
1,SHARETHELOVE*+LUKO,2,Avis et Témoignages clients | Luko,https://www.luko.eu/fr/page/avis-clients-temoi...,luko.eu
2,SHARETHELOVE*+LUKO,3,Luko - Publications | Facebook,https://fr-fr.facebook.com/getluko/posts/,fr-fr.facebook.com
3,SHARETHELOVE*+LUKO,4,Luko - Avis | Facebook,https://fr-fr.facebook.com/getluko/reviews/,fr-fr.facebook.com
4,SHARETHELOVE*+LUKO,5,Luko - Codes Promo et Parrainages,https://mes-parrainages.fr/luko,mes-parrainages.fr


In [58]:
# Map each domain to the list of result links found for it. The grouped
# aggregation already yields a Series, so no extra pd.Series wrapper is needed.
urls = found_results.groupby('domain')['link'].agg(list).to_dict()

In [23]:
# Check the interpreter version before doing any network or file work.
assert sys.version_info >= (3, 7), "Script requires Python 3.7+."

# NOTE(review): not referenced anywhere else in the visible notebook —
# confirm whether the scrapper module is meant to receive it.
RESTRICTED_DOMAINS =['facebook.com', 'twitter.com']

keyword, html = scrapper.get_google_search_results('SHARETHELOVE*+LUKO', 100, 'en')
found_results = scrapper.parse_results(html, keyword)
# domain -> list of result links for that domain
sel_data = found_results.groupby('domain')['link'].agg(list).to_dict()

# The path is relative to the working directory: `__file__` is not defined
# inside a notebook cell, so it cannot be anchored to this file's location.
outpath_res = pathlib.Path("./res.txt")
# Start a fresh results file containing only the ';##;'-separated header row.
with open(outpath_res, "w") as outfile:
    outfile.write("timestamp;##;source_url;##;domain;##;parsed_url;##;code;##;processed\n")

00:53:58 DEBUG:urllib3.connectionpool: Starting new HTTPS connection (1): www.google.com:443
00:53:58 DEBUG:urllib3.connectionpool: https://www.google.com:443 "GET /search?q=SHARETHELOVE*+LUKO&num=100&hl=en HTTP/1.1" 200 None
00:53:59 INFO:luko_scrapper: Got response [200] for URL: https://www.google.com/search?q=SHARETHELOVE*+LUKO&num=100&hl=en


In [24]:
sel_data

{'1parrainage.com': ['https://www.1parrainage.com/offre_parrainage_Luko.php'],
 '99designs.ca': ['https://99designs.ca/product-packaging-design/contests/big-silk-needs-trendy-luxurious-packaging-942371'],
 'appannie.com': ['https://www.appannie.com/fr/apps/google-play/app/com.getluko.cover.app/'],
 'assurland.com': ['https://www.assurland.com/assurance-blog/assurance-habitation-actualite/habitation-l-assurtech-luko-vient-de-lever-2-millions-d-euros_131204.html'],
 'bestadsontv.com': ['https://www.bestadsontv.com/profile/176542/Luke-OReilly'],
 'bigfooty.com': ['https://www.bigfooty.com/forum/threads/2019-draft-general-discussion.1193150/page-22'],
 'dealabs.com': ['https://www.dealabs.com/codes-promo/code-vivatech-sur-lassurance-luko-1605353'],
 'deskgram.org': ['https://deskgram.org/lukas_boe'],
 'dribbble.com': ['https://dribbble.com/madacs_a/buckets/409581-cards-buttons-selectors'],
 'esuba.eu': ['https://www.esuba.eu/'],
 'eyeknowhiphop.wordpress.com': ['https://eyeknowhiphop.wordp

In [34]:
# Crawl the collected links through the scrapper module's own
# bulk_crawl_and_write (not the notebook-level function of the same name)
# and keep its return value.
# NOTE(review): arguments are positional and their meaning is defined in
# scrapper, which is not visible here; the recorded value of `resultat` is a
# per-domain dict with 'url' and 'code' entries.
resultat = await scrapper.bulk_crawl_and_write(outpath_res, sel_data,sel_data.values(), set())

00:57:32 INFO:luko_scrapper: urls input ['https://www.1parrainage.com/offre_parrainage_Luko.php']
00:57:32 INFO:luko_scrapper: urls input ['https://99designs.ca/product-packaging-design/contests/big-silk-needs-trendy-luxurious-packaging-942371']
00:57:32 INFO:luko_scrapper: urls input ['https://www.appannie.com/fr/apps/google-play/app/com.getluko.cover.app/']
00:57:32 INFO:luko_scrapper: urls input ['https://www.assurland.com/assurance-blog/assurance-habitation-actualite/habitation-l-assurtech-luko-vient-de-lever-2-millions-d-euros_131204.html']
00:57:32 INFO:luko_scrapper: urls input ['https://www.bestadsontv.com/profile/176542/Luke-OReilly']
00:57:32 INFO:luko_scrapper: urls input ['https://www.bigfooty.com/forum/threads/2019-draft-general-discussion.1193150/page-22']
00:57:32 INFO:luko_scrapper: urls input ['https://www.dealabs.com/codes-promo/code-vivatech-sur-lassurance-luko-1605353']
00:57:32 INFO:luko_scrapper: urls input ['https://deskgram.org/lukas_boe']
00:57:32 INFO:luko_scr

00:57:33 INFO:luko_scrapper: Wrote results for code found: 0 for url: wapmight.tv
00:57:33 INFO:luko_scrapper: Wrote results for url found: 0 for url: wapmight.tv
00:57:33 ERROR:luko_scrapper: aiohttp exception for https://gymlifes.com/tag/lol%F0%9F%98%82%F0%9F%98%82 [403]: Forbidden
00:57:33 INFO:luko_scrapper: Got response [200] for URL: https://www.youtube.com/watch?v=anZnTR88mUI&list=LL98DiA57wG9clAEfTFrQV7A&index=427
00:57:33 INFO:luko_scrapper: Wrote results for code found: 0 for url: gymlifes.com
00:57:33 INFO:luko_scrapper: Wrote results for url found: 0 for url: gymlifes.com
00:57:33 INFO:luko_scrapper: Got response [200] for URL: https://www.super-parrain.com/offres/luko-1/parrainage-luko-1/annonces/blio
00:57:33 INFO:luko_scrapper: Found 18 links for https://www.super-parrain.com/offres/luko-1/parrainage-luko-1/annonces/blio
00:57:33 INFO:luko_scrapper: Found 1 links for https://www.super-parrain.com/offres/luko-1/parrainage-luko-1/annonces/blio
00:57:33 ERROR:luko_scrapper:

00:57:33 INFO:luko_scrapper: Wrote results for code found: 0 for url: hustleliving.com.au
00:57:33 INFO:luko_scrapper: Wrote results for url found: 88 for url: hustleliving.com.au
00:57:33 INFO:luko_scrapper: Wrote results for code found: 0 for url: spinninrecords.com
00:57:33 INFO:luko_scrapper: Wrote results for url found: 53 for url: spinninrecords.com
00:57:33 INFO:luko_scrapper: Wrote results for code found: 1 for url: xn--nobanque-b1a.fr
00:57:33 INFO:luko_scrapper: Wrote results for url found: 56 for url: xn--nobanque-b1a.fr
00:57:33 INFO:luko_scrapper: Got response [200] for URL: https://fr-fr.facebook.com/getluko/reviews/
00:57:33 INFO:luko_scrapper: Got response [200] for URL: https://www.viveos.net/rev/alive+remix+empire+of+the+sun
00:57:33 INFO:luko_scrapper: Got response [200] for URL: https://www.assurland.com/assurance-blog/assurance-habitation-actualite/habitation-l-assurtech-luko-vient-de-lever-2-millions-d-euros_131204.html
00:57:33 INFO:luko_scrapper: Found 76 links 

00:57:33 INFO:luko_scrapper: Wrote results for url found: 7 for url: lewisbnb.fr
00:57:33 INFO:luko_scrapper: Found 4 links for https://www.pinterest.de/pin/296111744231603196/
00:57:33 INFO:luko_scrapper: Found 0 links for https://www.pinterest.de/pin/296111744231603196/
00:57:33 INFO:luko_scrapper: Found 4 links for https://www.pinterest.fr/pin/568790627911755235/
00:57:33 INFO:luko_scrapper: Found 0 links for https://www.pinterest.fr/pin/568790627911755235/
00:57:33 INFO:luko_scrapper: Found 4 links for https://www.pinterest.es/pin/442760207087350803/
00:57:33 INFO:luko_scrapper: Found 0 links for https://www.pinterest.es/pin/442760207087350803/
00:57:33 INFO:luko_scrapper: Wrote results for code found: 0 for url: pinterest.de
00:57:33 INFO:luko_scrapper: Wrote results for url found: 4 for url: pinterest.de
00:57:33 INFO:luko_scrapper: Wrote results for code found: 0 for url: pinterest.fr
00:57:33 INFO:luko_scrapper: Wrote results for url found: 4 for url: pinterest.fr
00:57:33 INFO

00:57:34 INFO:luko_scrapper: Wrote results for url found: 152 for url: twugi.com
00:57:34 INFO:luko_scrapper: Got response [200] for URL: https://www.bestadsontv.com/profile/176542/Luke-OReilly
00:57:34 INFO:luko_scrapper: Found 52 links for https://www.bestadsontv.com/profile/176542/Luke-OReilly
00:57:34 INFO:luko_scrapper: Found 0 links for https://www.bestadsontv.com/profile/176542/Luke-OReilly
00:57:34 INFO:luko_scrapper: Got response [200] for URL: https://www.pinterest.com.mx/pin/251920172879911502/
00:57:34 INFO:luko_scrapper: Wrote results for code found: 0 for url: bestadsontv.com
00:57:34 INFO:luko_scrapper: Wrote results for url found: 52 for url: bestadsontv.com
00:57:34 INFO:luko_scrapper: Got response [200] for URL: https://www.pinterest.dk/pin/493003490430150025/
00:57:34 INFO:luko_scrapper: Got response [200] for URL: https://www.bigfooty.com/forum/threads/2019-draft-general-discussion.1193150/page-22
00:57:34 INFO:luko_scrapper: Got response [200] for URL: https://fr-f

In [35]:
resultat

{'1parrainage.com': {'url': 114, 'code': 6},
 '99designs.ca': {'url': 53, 'code': 0},
 'appannie.com': {'url': 0.0, 'code': 0.0},
 'assurland.com': {'url': 76, 'code': 1},
 'bestadsontv.com': {'url': 52, 'code': 0},
 'bigfooty.com': {'url': 363, 'code': 0},
 'dealabs.com': {'url': 211, 'code': 2},
 'deskgram.org': {'url': 0.0, 'code': 0.0},
 'dribbble.com': {'url': 137, 'code': 0},
 'esuba.eu': {'url': 163, 'code': 0},
 'eyeknowhiphop.wordpress.com': {'url': 68, 'code': 0},
 'fr-fr.facebook.com': {'url': 112, 'code': 3},
 'gabukass.blogas.lt': {'url': 803, 'code': 0},
 'gramho.com': {'url': 332, 'code': 0},
 'gymlifes.com': {'url': 0.0, 'code': 0.0},
 'hkcitylife.com': {'url': 39, 'code': 0},
 'hustleliving.com.au': {'url': 88, 'code': 0},
 'iiaku.com': {'url': 13, 'code': 1},
 'insstoreweb.com': {'url': 332, 'code': 0},
 'insta-stalker.me': {'url': 266, 'code': 0},
 'instasaver.org': {'url': 3, 'code': 0},
 'instazu.com': {'url': 9, 'code': 0},
 'issuu.com': {'url': 55, 'code': 0},
 '

In [36]:
# Flatten the per-domain link lists into one set of distinct URLs and pass it,
# together with the crawl result and the results path, to the batch helper.
res = scrapper.process_batch_res(
    {link for links in sel_data.values() for link in links},
    resultat,
    outpath_res,
)

In [37]:
res[2]

{'1parrainage.com': ['https://twitter.com/share?url=http%3A%2F%2Fwww.1parrainage.com%2Fparrain_definit.php%3Fid_par%3D29836%26id%3D1671%26v%3D1580515053&amp;related=twitterapi%2Ctwitter&amp;hashtags=1parrainage.com&amp;text=Je+vous+parraine+sur+Luko%2C+10%E2%82%AC+de+r%C3%A9duction%2C+avec+1parrainage.com',
  'https://www.1parrainage.com/parrainage-offerte-y.php',
  'https://twitter.com/share?url=http%3A%2F%2Fwww.1parrainage.com%2Fparrain_definit.php%3Fid_par%3D45982%26id%3D1671%26v%3D1580514840&amp;related=twitterapi%2Ctwitter&amp;hashtags=1parrainage.com&amp;text=Je+vous+parraine+sur+Luko%2C+10%E2%82%AC+de+r%C3%A9duction%2C+avec+1parrainage.com',
  'https://www.1parrainage.com/parrainage-offerte-s.php',
  'https://twitter.com/share?url=http%3A%2F%2Fwww.1parrainage.com%2Fparrain_definit.php%3Fid_par%3D40188%26id%3D1671%26v%3D1580514840&amp;related=twitterapi%2Ctwitter&amp;hashtags=1parrainage.com&amp;text=Je+vous+parraine+sur+Luko%2C+10%E2%82%AC+de+r%C3%A9duction%2C+avec+1parrainage.c

In [39]:
# ';##;' is longer than one character, so pandas interprets it as a regular
# expression separator, which requires the python parsing engine.
data = pd.read_csv(
    outpath_res.resolve(),
    sep=';##;',
    header=0,
    engine='python',
)

In [40]:
data.head()

Unnamed: 0,timestamp,source_url,domain,parsed_url,code,processed
0,1580515000.0,https://www.instasaver.org/hashtag/rakhdi,instasaver.org,https://www.instasaver.org/favicon.ico,,False
1,1580515000.0,https://www.instasaver.org/hashtag/rakhdi,instasaver.org,https://www.instasaver.org/,,False
2,1580515000.0,https://www.instasaver.org/hashtag/rakhdi,instasaver.org,https://www.instasaver.org/hashtag/rakhdi,,False
3,1580515000.0,https://muxylukomu.ga/La_nostra_zia_coreografo...,muxylukomu.ga,https://muxylukomu.ga/favicon.ico,,False
4,1580515000.0,https://www.super-parrain.com/offres/luko-1/pa...,super-parrain.com,https://www.super-parrain.com/legals,SHARETHELOVE+1S71P4,False


In [41]:
# Export the parsed results to a CSV file in the working directory.
# NOTE: with its default settings, to_csv also writes the row index as an
# unnamed first column.
data.to_csv('./sample_for_benoit.csv')