In [56]:
import json
from bs4 import BeautifulSoup
import requests as rq
import os
from pathlib import Path
import csv
import uuid
import pandas as pd

In [10]:
OUT_DIR = Path("data/").joinpath("arg-min")

In [55]:
def scrape_page_for_links(page_num) -> list:
    
    
    if page_num == 1:
        URL = "https://www.argmin.net/"
    
    else: 
        URL = f"https://www.argmin.net/page{page_num}/"
    
    response = rq.get(URL)
    
    soup = BeautifulSoup(response.text, "html.parser")
    
    # post-link 
    links = soup.find_all("a", class_ = "post-link" )
    
    links = ["https://www.argmin.net"+link.get("href") for link in links]
    
    # post-meta
    dates = soup.find_all("span", class_ = "post-meta" )
    dates = [date.text for date in dates]
    
    out = []
    
    for i in range(len(links)):
        out.append([links[i], dates[i][:-2]])
        
    print(f"page {page_num} - scraped {len(out)} blogposts.")
    
    return out


def get_blogposts():
    
    # get the links for the blogposts and store them in a text file
    os.makedirs(OUT_DIR, exist_ok = True)
    
    print("Scraping blogposts...")
    
    blogposts = []
    
    for i in range(1, 10):
        
        posts = scrape_page_for_links(i)
        
        if len(posts) == 0:
            continue
        
        blogposts.extend(posts)
        
        
        
    with open(OUT_DIR.joinpath("blogposts.csv"), "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['url', 'created_at'])
        writer.writerows(blogposts)
        
        
    print(f"Collected {len(blogposts)} blogposts.")
        
        
get_blogposts()

Scraping blogposts...
page 1 - scraped 10 blogposts.
page 2 - scraped 10 blogposts.
page 3 - scraped 10 blogposts.
page 4 - scraped 10 blogposts.
page 5 - scraped 8 blogposts.
page 6 - scraped 9 blogposts.
page 7 - scraped 0 blogposts.
page 8 - scraped 0 blogposts.
page 9 - scraped 0 blogposts.
Collected 57 blogposts.


In [63]:
def scrape_post(url, summary_csv):
    
    try:
        resp = rq.get(url)
        
    except Exception as e:
        print("Error while scraping post: ", e)
        return None, None    
    
    soup = BeautifulSoup(resp.text, "html.parser")
    # post-header
    
    title = soup.find("div", class_ = "post-header")
    
    if title is None:
        print("Error while scraping post: ", url)
        return None, None
    
    title = title.find("h1").text
    
    # post-content
    
    content = soup.find("article", class_ = "post-content")
    
    out_id = uuid.uuid4()
    os.makedirs(OUT_DIR.joinpath("posts"), exist_ok = True)
    out_file = OUT_DIR.joinpath(f"posts/{out_id}.txt")
    
    with open(out_file, "w") as f:
        f.write(str(content))
        
    summary_csv.writerow([out_id, url, title])
    
    
def scrape_posts(posts_csv_fp):

    posts = pd.read_csv(posts_csv_fp)
        
    with open(OUT_DIR.joinpath("_temp_posts.csv"), "w") as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'url', 'title'])
    
        for _, post in posts.iterrows():
            scrape_post(post.url, writer)
    
scrape_posts(OUT_DIR.joinpath("blogposts.csv"))

# combine the two csv files
posts = pd.read_csv(OUT_DIR.joinpath("_temp_posts.csv"))
post_info = pd.read_csv(OUT_DIR.joinpath("blogposts.csv"))

posts.merge(post_info, on = "url").to_csv(OUT_DIR.joinpath("blogposts.csv"), index = False)

os.remove(OUT_DIR.joinpath("_temp_posts.csv"))
    
    