### Import libs

In [2]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests

import os
import time
import glob
import itertools
import lxml

### Download pages

Function for HTTP requests

In [15]:
def send_http_request(url, method='GET', body='', timeout=30):
    """Send an HTTP request and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    method : str, default 'GET'
        HTTP verb handed to ``requests.request``.
    body : str, default ''
        Request payload. It is sent as the request body only when non-empty,
        so the default call still issues a body-less request.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so a stalled connection cannot hang the notebook forever.

    Returns
    -------
    str
        ``response.text``. The HTTP status code is not checked.
    """
    response = requests.request(
        method=method,
        url=url,
        data=body or None,  # `body` used to be accepted but silently dropped
        timeout=timeout,
    )
    return response.text

In [21]:
def post_from_url_generator(base_url, sleep_time=5):
    """Yield the raw response text of consecutive listing pages.

    Page numbers 1..99998 are appended to ``base_url`` and every resulting URL
    is requested with POST through ``send_http_request``. Iteration stops at
    the first empty response.

    Parameters
    ----------
    base_url : str
        URL prefix; the page number is appended directly to it.
    sleep_time : float, default 5
        Pause in seconds after each non-empty response, before it is yielded.

    Yields
    ------
    str
        Response text of one listing page.
    """
    for page_number in range(1, 99999):
        url = f"{base_url}{page_number}"
        response_text = send_http_request(url, method='POST')
        if not response_text:
            # Nothing more will be requested, so skip the pause and stop now.
            break
        time.sleep(sleep_time)
        yield response_text

Example use

In [22]:
# URL prefix of the listing endpoint; the generator appends the page number to it.
base_url = "https://blog.prokulski.science/index.php/wp-json/nv/v1/posts/page/"
gen = post_from_url_generator(base_url)
# Fetch only the first listing page; raises StopIteration if that response is empty.
page = next(gen)
# To fetch several pages at once instead: [next(gen) for x in range(5)]

### Parse pages to extract post URLs

Extract post URLs from a listing page

In [23]:
def get_post_urls_from_page(page):
    """Collect the link target of every ``h2 > a`` anchor found in ``page``.

    ``page`` is parsed with ``html.parser``. Each ``href`` value is converted
    to ``str`` and cleaned in two steps: every backslash followed by a double
    quote is removed first, then every remaining backslash.

    Returns a list of cleaned URL strings, in the order ``select`` returns
    the anchors.
    """
    anchors = BeautifulSoup(page, 'html.parser').select("h2 > a")
    return [
        str(anchor['href']).replace("\\\"", "").replace("\\", "")
        for anchor in anchors
    ]

Example use

In [25]:
# Display the post URLs extracted from the listing page stored in `page`.
get_post_urls_from_page(page)

### Download posts

Download all posts and store them on disk (in the `pages/` directory)

In [None]:
# Every listing page yields a *list* of URLs. Flatten them so `post_urls` is a
# single flat list of strings; appending the per-page lists produced a list of
# lists, and the loop below then passed a list to `requests.get`.
post_urls = list(itertools.chain.from_iterable(
    get_post_urls_from_page(x) for x in post_from_url_generator(base_url)
))

# Make sure the output directory exists before the first file is written.
os.makedirs("pages", exist_ok=True)

for url in post_urls:
    # Keep the raw response bytes: the file is opened in binary mode, so a
    # `str` (response.text) cannot be written to it.
    post_page = requests.get(url).content
    # Turn the URL path into a flat file name. rstrip("/") removes only a
    # trailing slash, whereas the former [:-1] also chopped a real character
    # from URLs that do not end with "/".
    post_title = (
        url.replace("https://blog.prokulski.science/index.php/", "")
        .rstrip("/")
        .replace("/", "_")
    )
    print(f'Saving post page in file: {post_title}.html')
    with open("pages/"+post_title+".html", "wb") as outfile:
        outfile.write(post_page)



### Parse posts with BeautifulSoup

Load web pages from disk

In [30]:
def read_posts_from_disk(path: str, parser: str='html.parser'):
    """Load the saved post pages matching a glob pattern and parse them.

    Parameters
    ----------
    path : str
        Glob pattern handed to ``glob.glob``.
    parser : str, default 'html.parser'
        Parser name passed to ``BeautifulSoup``.

    Returns
    -------
    list of tuple
        One ``(post_date, post_name, soup)`` tuple per matched file, ordered
        by file path. ``post_date`` is the first 10 characters of the file
        name with ``_`` replaced by ``-``; ``post_name`` is the file name with
        every ``.html`` occurrence removed; ``soup`` is the parsed document.
    """
    posts = []
    # glob.glob returns matches in arbitrary order; sort them so the result
    # (and anything derived from its order) is the same on every run.
    for file in sorted(glob.glob(path)):
        filename = os.path.basename(file)
        post_date = filename[:10].replace("_", "-")
        post_name = filename.replace(".html", "")
        with open(file, "rb") as input_file:
            soup = BeautifulSoup(input_file.read(), parser)
        posts.append((post_date, post_name, soup))
    return posts

In [None]:
# Parse the saved pages whose file name starts with "2005", using the lxml parser.
# NOTE(review): the "2005*" prefix limits the load to a subset of pages/ — confirm
# this is intended and not a leftover from testing.
posts = read_posts_from_disk("pages/2005*", parser='lxml')

Functions for extracting attributes from a BeautifulSoup web page

In [None]:
def get_tags(soup):
    """Return the ``.string`` of every ``.nv-tags-list > a`` link in ``soup``.

    The result is a list with one entry per matched link, in the order
    ``select`` returns them.
    """
    return [link.string for link in soup.select(".nv-tags-list > a")]


def get_listings_count(soup) -> int:
    """Return how many ``.crayon-code > .crayon-pre`` elements ``soup`` contains."""
    return len(soup.select(".crayon-code > .crayon-pre"))


def get_code_lines_count(soup) -> int:
    """Return how many ``.crayon-line`` elements sit directly under
    ``.crayon-code > .crayon-pre`` in ``soup``."""
    return len(soup.select(".crayon-code > .crayon-pre > .crayon-line"))


def get_tables_count(soup) -> int:
    """Count the ``<table>`` elements in ``soup`` that carry at least one of
    the table classes listed in the selector."""
    table_selector = "table:is(.table, .table-striped, .table-hover, .table-condensed, .table-responsive)"
    return len(soup.select(table_selector))


def get_images_count(soup) -> int:
    """Count the ``<img>`` elements inside the first post-content container.

    The container is the first element matching
    ``div.nv-content-wrap.entry-content``; images outside it are not counted.

    Returns
    -------
    int
        Number of images found, or 0 when the page has no such container
        (this used to raise ``IndexError``, unlike the other counters, which
        report 0 for a page without matches).
    """
    content_wrappers = soup.select("div.nv-content-wrap.entry-content")
    if not content_wrappers:
        return 0
    # find_all is the current name of the legacy findChildren alias.
    return len(content_wrappers[0].find_all("img"))


def get_comments(soup):
    """Extract author, date and text of the leading comments in ``soup``.

    Only the first two ``.nv-comment-article`` elements are processed.
    NOTE(review): the ``[:2]` limit looks like a leftover from testing —
    confirm whether all comments should be returned.

    Returns
    -------
    list of dict
        One dict per processed comment with the keys ``'author'`` (text of the
        first ``.comment-author .author`` match), ``'date'`` (the first
        ``time.entry-date.published`` text, with every ``" o"`` removed, parsed
        by ``pd.to_datetime``) and ``'comment'`` (text of the first ``<p>``
        directly inside the comment content div).

    Raises
    ------
    IndexError
        If a processed comment lacks any of the three elements.
    """
    def _first(node, selector):
        # First match of `selector` below `node`; IndexError when there is none.
        return node.select(selector)[0]

    return [
        {
            'author': _first(article, ".comment-author .author").get_text(),
            'date': pd.to_datetime(
                _first(article, "time.entry-date.published").text.replace(" o", "")
            ),
            'comment': _first(
                article, "div.nv-comment-content.comment.nv-content-wrap > p"
            ).get_text(),
        }
        for article in soup.select(".nv-comment-article")[:2]
    ]


Extract features from each parsed web page and collect them into DataFrames

In [None]:
# Build one record per post (features) and one per post (its comments),
# then create both DataFrames in a single step each.
rows_list = []
comments_list = []
# Ids are 1-based and follow the order of `posts`.
for i, (date, title, soup) in enumerate(posts, start=1):
    # Extract the comments once and reuse the result for both tables;
    # get_comments used to run twice per post.
    comments = get_comments(soup)

    rows_list.append({
        'id': i,
        'title': title,
        'post_date': date,
        'code_lines': get_code_lines_count(soup),
        'listings_num': get_listings_count(soup),
        'tables_count': get_tables_count(soup),
        'tags': get_tags(soup),
        'comments': comments,
    })
    comments_list.append({'id': i, 'comments': comments})

post_df = pd.DataFrame(rows_list)
comment_df = pd.DataFrame(comments_list)

In [None]:
# Preview the first rows of the per-post feature table.
post_df.head()

In [None]:
# Preview the first rows of the per-post comments table.
comment_df.head()