In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer
import html2text
from smart_open import smart_open
from time import time
from tqdm import tqdm
import concurrent.futures

import json

from itertools import islice

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
10,application_1563990975713_0011,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
input_file = "s3://onai-ml-dev-eu-west-1/web_crawler/data/seed_urls/company-urls.csv"
count_total = 0
count_200 = 0
count_has_about_url = 0
count_has_description = 0

TIMEOUT = 10
STATUS_CODE_OK = 200


def send_request(url):
    try:
        response = requests.get(url, timeout=TIMEOUT)
    except requests.exceptions.MissingSchema or requests.exceptions.InvalidSchema:
        url = "http://" + url
        response = requests.get(url, timeout=TIMEOUT)

    if response.status_code != STATUS_CODE_OK:
        return response.status_code, None
    else:
        return response.status_code, response


def pre_process_text(text):
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    return text


def parse_all_links_and_meta_fields(html):
    links = set()
    metas = []
    for line in BeautifulSoup(html, "html.parser", parse_only=SoupStrainer(['a', 'meta'])):
        if line.name == 'a' and line.has_attr('href'):
            link = line['href']
            links.add(link)
        if line.name == 'meta':
            metas.append(line)
    return links, metas


def extract_about_us_urls(base_url, links):
    about_urls = set()
    for link in links:
        if 'about' in link.lower():
            if link.startswith("http"):
                about_urls.add(link)
            else:
                if link.startswith("/"):
                    about_urls.add(base_url + link)
                else:
                    about_urls.add(base_url + '/' + link)

    return about_urls


def extract_description_from_meta(metas):
    description = ''
    for meta in metas:
        if meta.has_attr('name') and meta['name'] == 'description' and meta.has_attr('content'):
            description = pre_process_text(meta['content'])
    return description


def extract_text_from_about_us_urls(about_urls):
    about_text = ''
    if len(about_urls) > 0:
        about_url = min(list(about_urls), key=len)

        about_response_code, about_response = send_request(about_url)
        if about_response_code == STATUS_CODE_OK:
            about_text = extract_text_from_html(about_response.text)
    return about_text


def extract_text_from_html(html):
    parser = html2text.HTML2Text()
    parser.wrap_links = False
    parser.skip_internal_links = True
    parser.inline_links = True
    parser.ignore_anchors = True
    parser.ignore_images = True
    parser.ignore_emphasis = True
    parser.ignore_links = True
    return pre_process_text(parser.handle(html))

def extract_text_from_url(url):
    try:
        response_code, response = send_request(url)
    except:
        response_code, response = -1, None

    about_us_text = ''
    about_us_url = ''
    text = ''
    meta_description = ''

    if response_code == STATUS_CODE_OK:
        return response_code, extract_text_from_html(response.text)
    return response_code, None

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
extract_text_from_url_udf = F.udf(extract_text_from_url, T.StructType(
    [T.StructField("response_code", T.IntegerType()),
     T.StructField("response_text", T.StringType())
    ]
))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
pandas_df = pd.read_csv(input_file, header=0, sep='\t').astype(str)
df = spark.createDataFrame(pandas_df).repartition(1000)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
links_and_meta_df = (df
                     .select("name", extract_text_from_url_udf("url").alias("request_results"))
                     .select("name",
                             F.col("request_results.response_code").alias("response_code"),
                             F.col("request_results.response_text").alias("response_text")
                            )
                    )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
(links_and_meta_df
 .repartition(100)
 .write
 .parquet("s3://onai-ml-dev-eu-west-1/web_crawler/data/raw_links_and_meta", mode="overwrite")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
tst = spark.read.load("s3://onai-ml-dev-eu-west-1/web_crawler/data/raw_links_and_meta")

In [24]:
tst.show(20, False)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
tst.count()