# Description

This notebook has two parts. First, the Towards Data Science archive pages are scraped to retrieve post URLs. Then the posts themselves are scraped from that list of URLs.

In [4]:
#For date ranges
import calendar

#Data manipulation
import pandas as pd

#Web scraping
import requests
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Location of the chromedriver binary on the local machine; load_page() passes
# this path to webdriver.Chrome.
chromedriver = "/Applications/chromedriver" 
# Also publish the path through the process environment.
# NOTE(review): load_page() already hands the path to webdriver.Chrome
# explicitly, so this variable may be redundant - confirm before removing.
os.environ["webdriver.chrome.driver"] = chromedriver
# Shared Chrome options, reused for every driver that load_page() starts.
chrome_options = Options()
#Headless so that you aren't driven mad with a new window every 7 seconds
chrome_options.add_argument("--headless")

# Archive Pages

In [1]:
def date_tuples(year, min_month, max_month, stop_date=(7, 6)):
    """
    
    Returns a (year, month, day) list of tuples.
    
    Every element is a tuple of strings, with month and day zero-padded
    to two digits, e.g. ('2020', '01', '05').
    
    Parameters
    ----------
    year : int or str
        Calendar year. It is used both for the output and for working out
        how many days each month has (so leap years are handled).
    min_month, max_month : int
        First and last month (1-12) to include, both inclusive.
    stop_date : (int, int) or None
        (month, day) cutoff. Within that month, days after `day` are
        skipped; other months are unaffected. The default (7, 6) truncates
        July after the 6th. Pass None to emit every day of every month.
    
    Returns
    -------
    list of (str, str, str)
    """
    year = int(year)

    if stop_date is None:
        stop_month, stop_day = None, None
    else:
        stop_month, stop_day = stop_date

    date_range = []

    for m in range(min_month, max_month + 1):
        # monthrange returns (weekday of the 1st, number of days in the month)
        days_in_month = calendar.monthrange(year, m)[1]

        # +1 so the last day of the month is included
        for d in range(1, days_in_month + 1):
            if m == stop_month and d > stop_day:
                break
            # :02d pads single digits and leaves months/days >= 10 untouched
            date_range.append((f'{year}', f'{m:02d}', f'{d:02d}'))

    return date_range

In [2]:
def retrieve_days_links(year, month, day, timeout=30):
    """
    
    Returns the post links listed on one daily archive page.
    
    The archive page for the given date is downloaded, every anchor that
    has an href and an empty class is serialised to a string, and the
    post URL is cut out of the anchors that contain the '---------'
    marker.
    
    Parameters
    ----------
    year, month, day : str or int
        Date parts; they are interpolated into the archive URL as given.
    timeout : int or float
        Seconds to wait for the archive server before requests raises,
        so a stalled connection cannot hang the whole crawl.
    
    Returns
    -------
    list of [link, 'month/day/year'] pairs (possibly empty).
    """
    # NOTE(review): date_tuples() yields zero-padded strings such as '01', so
    # this comparison with the integer 1 is only true when a caller passes an
    # int. Left as is because switching day 1 to the monthly archive would
    # change the scraped data - confirm the intended behaviour.
    if day == 1:
        daily_archive = f'https://towardsdatascience.com/archive/{year}/{month}'
    else:
        daily_archive = f'https://towardsdatascience.com/archive/{year}/{month}/{day}'

    response = requests.get(daily_archive, timeout=timeout).text
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed. The string slicing below depends on how anchors are
    # serialised, so the parser is deliberately not changed here.
    soup = BeautifulSoup(response)

    anchor_markup = [str(a) for a in soup.find_all('a', class_ = "", href = True)]

    split_by_dashes = [markup.split('---------') for markup in anchor_markup
                       if '---------' in markup]

    links = []
    for segments in split_by_dashes:
        try:
            # presumably the 4th segment carries the post URL attribute;
            # verify against the current archive markup
            links.append(segments[3].split('=')[1].split('?')[0].strip('"'))
        except IndexError:
            # Anchor does not have the expected shape: skip it rather than
            # losing every link found on this page.
            continue

    links_with_date = [[link, f'{month}/{day}/{year}'] for link in links]

    return links_with_date

In [5]:
# Build the (year, month, day) tuples for months 1 through 7 of 2020.
date_range = date_tuples(2020, 1, 7)

In [6]:
# Show the first and last five tuples to check where the range starts and stops.
print(date_range[:5])
print(date_range[-5:])

[('2020', '01', '01'), ('2020', '01', '02'), ('2020', '01', '03'), ('2020', '01', '04'), ('2020', '01', '05')]
[('2020', '07', '02'), ('2020', '07', '03'), ('2020', '07', '04'), ('2020', '07', '05'), ('2020', '07', '06')]


In [929]:
# One entry per date in date_range: the [link, date] pairs found for that day.
full_article_list = []

for date_parts in date_range:
    # date_parts is a (year, month, day) tuple
    full_article_list.append(retrieve_days_links(*date_parts))

In [930]:
#Sanity check for date range
print(full_article_list[:1][0][0])
print(full_article_list[-1:][0][0])

['https://towardsdatascience.com/making-python-programs-blazingly-fast-c1cd79bd1b32', '01/01/2020']
['https://towardsdatascience.com/ten-python-development-skills-998a52f8f7c0', '07/06/2020']


# Actual Articles

In [7]:
def load_page(link):
    """
    
    Retrieves an article with Selenium and outputs a Beautifulsoup object.
    
    Dynamic content may not load when an article is opened and closed almost
    instantly, so the page is given 1 second to load. Additionally, some
    elements (e.g., videos) did not render until they were viewed. Since these
    elements are always nested in 'figure' tags, the driver locates and
    scrolls to each one. After that, 1 more second is allowed to pass before
    the page is turned into soup.
    
    The browser is always shut down before returning, including when loading
    or scrolling raises, so failed articles do not leave Chrome processes
    behind.
    
    Parameters
    ----------
    link : str
        URL of the article to open.
    
    Returns
    -------
    BeautifulSoup
        The rendered page source parsed with html5lib.
    """
    # Imported here so this cell is self-contained; By ships with selenium,
    # which the notebook already imports.
    from selenium.webdriver.common.by import By

    #Activate headless driver 
    # NOTE(review): newer Selenium 4 releases no longer accept the driver path
    # as a positional argument - confirm against the installed version.
    driver = webdriver.Chrome(chromedriver, options=chrome_options)

    try:
        driver.get(link)

        time.sleep(1)

        #Locate + scroll to figures
        #Code adapted from https://stackoverflow.com/questions/48006078/how-to-scroll-down-in-python-selenium-step-by-step
        # find_elements(By...) works on Selenium 3 and 4; the older
        # find_elements_by_tag_name helper was removed in Selenium 4.3.
        figures = driver.find_elements(By.TAG_NAME, 'figure')
        for figure in figures:
            driver.execute_script("arguments[0].scrollIntoView();", figure)

        time.sleep(1)

        return BeautifulSoup(driver.page_source, 'html5lib')
    finally:
        # quit() ends the whole driver session; close() only closes the window.
        driver.quit()

In [8]:
def find_claps(soup):
    """
    
    Returns number of claps article received.
    
    Every button in the page is checked for the word 'claps'. For a
    matching button, the text between the button markup's first '>' and
    the word 'claps' is taken as the clap count. If several buttons match,
    the last one wins.
    
    Returns
    -------
    str or int
        The clap text exactly as it appears in the markup (a string, not
        converted to a number), or the integer 0 when no button mentions
        claps, a matching button has an unexpected shape, or `soup` is None.
    
    """
    if soup is None:
        return 0

    claps = 0

    for button in soup.find_all('button'):
        markup = str(button)
        if 'claps' not in markup:
            continue

        pieces = markup.split('claps')[0].split('>')
        if len(pieces) < 2:
            # No '>' before the word 'claps': not the markup we expect.
            return 0
        claps = pieces[1]

    return claps
        
    

In [885]:
def find_sections(soup):
    """
    
    Returns number of sections in article.
    
    A section is counted for every h1 and every h2 tag found in `soup`;
    the result is the two counts added together.
    
    NOTE(review): the original note says the minimum is 1 because every
    article has an h1 title - nothing here enforces that, so a soup with
    no h1/h2 tags yields 0.
    
    """
    return sum(1 for tag_name in ('h1', 'h2') for _ in soup.find_all(tag_name))

In [704]:
def find_codeblocks(soup):
    """
    
    Returns the combined count of static and interactive codeblocks.
    
    Static codeblocks are 'pre' tags. Interactive ones are iframes nested
    inside 'figure' tags. Videos live in iframes too, so an iframe is only
    counted as a codeblock when the text 'YouTube' does not appear in its
    markup. Known limitation: a video embedded from anywhere other than
    YouTube is therefore counted as a codeblock.
    
    """
    pre_blocks = sum(1 for _ in soup.find_all('pre'))

    embedded_blocks = sum(
        1
        for figure in soup.find_all('figure')
        for frame in figure.find_all('iframe')
        if 'YouTube' not in str(frame)
    )

    return pre_blocks + embedded_blocks

In [705]:
def find_videos(soup):
    """
    
    Returns number of videos in article.
    
    Counts the iframes nested in 'figure' tags whose markup contains the
    text 'YouTube', so only videos embedded from YouTube are detected.
    
    """
    return sum(
        1
        for figure in soup.find_all('figure')
        for frame in figure.find_all('iframe')
        if 'YouTube' in str(frame)
    )

In [847]:
def parse_article(link):
    
    """
    
    Scrapes one article and returns its features as a one-row DataFrame.
    
    `link` is indexed as link[0] (the article URL, passed to load_page) and
    link[1] (a date value, passed to pd.to_datetime). The claps are looked
    up in the whole page; every other feature is read from the page's
    'article' element.
    
    The returned frame has the columns date, title, post, num_sections,
    num_images, num_codeblocks, num_videos and claps. The title is the text
    of the first h1, so an article without any h1 raises IndexError.
    
    """
    
    page = load_page(link[0])
    
    body = page.find('article')
    
    row = pd.DataFrame(columns = ['date', 'title', 'post', 'num_sections', 
                                  'num_images', 'num_codeblocks'],
                       index = [0])
    
    #Publication date
    row['date'] = pd.to_datetime(link[1])
    
    #Body text: all paragraphs joined with single spaces
    row['post'] = ' '.join(paragraph.text for paragraph in body.find_all('p'))
    
    #Title is the first h1; sections are counted over h1 + h2
    row['title'] = body.find_all('h1')[0].text
    row['num_sections'] = find_sections(body)
    
    #Code blocks
    row['num_codeblocks'] = find_codeblocks(body)
    
    #Images
    row['num_images'] = len(body.find_all('figure', class_='paragraph-image'))
    
    #Videos
    row['num_videos'] = find_videos(body)
    
    #Claps are searched for in the full page, not just the article element
    row['claps'] = find_claps(page)
    
    return row

In [11]:
def scrape_posts(days):
    """
    
    Returns a dataframe consisting of scraped posts.
    
    The function references 'full_article_list' and scrapes every post of
    the entries at indices 0 through `days` (inclusive) using the
    'parse_article' function above. If `days` is larger than the last
    available index, all available entries are scraped instead of raising
    IndexError once the scraping is done.
    
    Articles that fail to parse are skipped and reported with a printed
    message; they do not abort the run.
    
    Parameters
    ----------
    days : int
        Last index of 'full_article_list' to scrape (inclusive).
    
    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed article; an empty DataFrame when
        nothing could be scraped.
    
    """
    frames = []

    # Slicing keeps the inclusive 0..days range but cannot run past the end of
    # the list; max() stops a negative value from slicing from the tail.
    selected_days = full_article_list[:max(days + 1, 0)]

    for daily_links in selected_days:
        for article in daily_links:
            try:
                frames.append(parse_article(article))
            #articles from TDSTeam have no explicit title and threw IndexErrors. Decided to just exclude since they were
            #links to other articles with descriptions, rather than an article in itself.
            except Exception as exc:
                # Exception (not a bare except) so Ctrl-C still stops a long
                # scrape, and the skip is visible instead of silent.
                print(f'Skipping {article!r}: {exc!r}')

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas 2.x, and appending row by row is quadratic.
    return pd.concat(frames)

In [10]:
# Report how many daily entries full_article_list holds.
print(f'{len(full_article_list)} days of articles.')

In [None]:
# Scrape every collected day. scrape_posts treats its argument as the last
# index (inclusive), so pass len - 1 instead of a hard-coded number that can
# point past the end of full_article_list.
posts = scrape_posts(len(full_article_list) - 1)

In [919]:
#posts.to_pickle('./Data/medium_2020_posts.pkl')