# Data Collection

## Workspace Setup

In [None]:
# webdriver_manager provides ChromeDriverManager, which the scraping
# functions below use to install the driver for Chrome
!pip install webdriver_manager

In [None]:
# general libraries
import os
import sys
import glob
import random
from time import sleep

# datasets and numerical manipulation libraries
import numpy as np
import pandas as pd

# libraries for scraping and selecting information
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# create a set of options for the webdriver
# NOTE: Chrome switches must start with two plain ASCII hyphens ("--");
# a typographic dash (" — ") is not a valid switch prefix, so flags
# written that way (e.g. headless mode) are not applied
options = Options()
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')
options.add_argument('--incognito')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--headless')
#options.add_argument('--start-maximized')

## Utility Functions

<div class="alert alert-block alert-info">
<b>NOTES:</b>

- In order to access the articles published during a certain period of time, we need to know how the date is represented in the URL of the page. For example, if we want all the articles published by Towards Data Science on February 22, 2022 we type: 
    - <em>towardsdatascience.com/archive/2022/02/22</em>

- We create functions that specify the month and the day, to pass to the webdriver.

- The first publications in Towards Data Science date from 2010; the first year in which publications are grouped by month is 2015. In the first couple of years there are months with no published articles.
    
   </div>

In [None]:
def how_many_days(year, month):

    '''
    Function to get the number of days in a month.
    It takes into account whether the year is a leap year, using the
    full Gregorian rule: divisible by 4, except century years, which
    must also be divisible by 400 (so 2000 is a leap year, 1900 is not).

    INPUT:
        year (int) - the year
        month (int) - an integer between 1 and 12 for the month
    OUTPUT:
        n_days (int) - number of days in the month
    RAISES:
        ValueError - if month is not between 1 and 12
    '''
    if month in (1, 3, 5, 7, 8, 10, 12):
        return 31

    if month in (4, 6, 9, 11):
        return 30

    if month == 2:
        # a year divisible by 4 is a leap year unless it is a century
        # year that is not divisible by 400
        is_leap = (year % 4 == 0) and (year % 100 != 0 or year % 400 == 0)
        return 29 if is_leap else 28

    # fail loudly instead of hitting an unbound local variable
    raise ValueError(f'month must be between 1 and 12, got {month!r}')

In [None]:
def url_date_strings(year, month, day):

    '''
    Function to build the date part of the url used in the web request.

    Single-character month and day values are left-padded with a zero.
    If day is 0 the string contains the year and the month only.
    If both day and month are 0, the string contains the year only.

    INPUT:
        year (int) - represents the year
        month (int) - the month as an integer from 1 to 12, or 0 for none
        day (int) - the day of the month, or 0 for none

    OUTPUT:
       date (str) in format YYYY/MM/DD, YYYY/MM or YYYY
    '''

    def two_chars(value):
        # prefix a zero only when the text form is exactly one character
        text = str(value)
        return f'0{text}' if len(text) == 1 else text

    # year only
    if month == 0 and day == 0:
        return f'{year}'

    # year and month
    if day == 0:
        return f'{year}/{two_chars(month)}'

    # full date
    return f'{year}/{two_chars(month)}/{two_chars(day)}'
    

In [None]:
def url_to_retrieve(publication, year, months, days):

    '''
    Builds the archive urls for every (month, day) combination requested.
    Months are the outer loop and days the inner loop, so the urls are
    ordered by month first and by day second.

    If the whole year is desired enter [0] for both months and days.
    If full month(s) are desired, enter [0] for days.

    INPUT:
        publication (str) - publication name to be included in url
        year (int) - the desired year
        months (list) - list of months to retrieve from
        days (list) - list of days to retrieve from
    OUTPUT:
        urls (list) - urls for the specified days, months or years
    '''

    archive_root = f'https://{publication}.com/archive/'

    return [
        archive_root + url_date_strings(year, month, day)
        for month in months
        for day in days
    ]

## Functions to Collect the Data

In [None]:
def retrieve_data(publication, year, months, days):
    '''
    Scrape the archive pages for the requested dates and, for every post
    listed there, also open the post itself to read its full subtitle.

    The function performs the following steps:
        - builds the archive urls for the requested dates
        - loads each archive page and creates a BeautifulSoup object
        - extracts the information shown for each post
        - loads each post page to retrieve the full subtitle
        - stores all the extracted information in a list

    The browser is always closed when the function ends, even if an
    error is raised while scraping.

    INPUT:
        publication (str) - name of publication as it appears on url
        year (int) - a year between 2010 and 2022
        months(list) - a list of months to scrape data from
        days (list) - days to scrape data from
    OUTPUT:
        data_stories (list) - list of lists, one per post, holding
            [author_url, user_id, date, reading_time, title, subtitle,
             full_subtitle, claps, responses, story_url]
            Missing title/subtitle/full_subtitle are marked with '-',
            missing claps with 0 and missing responses with '0 responses'.
    '''

    # css classes that identify the elements of interest
    story_class = 'streamItem streamItem--postPreview js-streamItem'
    author_class = 'postMetaInline u-floatLeft u-sm-maxWidthFullWidth'
    claps_class = ('button button--chromeless u-baseColor--buttonNormal '
                   'js-multirecommendCountButton u-disablePointerEvents')
    responses_class = 'button button--chromeless u-baseColor--buttonNormal'
    story_link_class = ('button button--smaller button--chromeless '
                        'u-baseColor--buttonNormal')

    # instantiate the driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # empty list to collect the information for each post
    data_stories = []

    # list of urls for the specified dates
    list_urls = url_to_retrieve(publication, year, months, days)

    try:
        for url in list_urls:
            # load the archive page and parse its source
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # find all the stories published during the specified period
            stories = soup.find_all('div', class_=story_class)

            for story in stories:
                author_box = story.find('div', class_=author_class)
                author_link = author_box.find('a')
                author_url = author_link['href']
                user_id = author_link['data-user-id']
                date = author_box.find('time')['datetime']
                reading_time = author_box.find('span', class_='readingTime')['title']

                title_tag = story.find('h3')
                title = title_tag.text if title_tag is not None else '-'
                subtitle_tag = story.find('h4')
                subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

                claps_button = story.find('button', class_=claps_class)
                claps = claps_button.text if claps_button is not None else 0

                responses_link = story.find('a', class_=responses_class)
                if responses_link is not None:
                    responses = responses_link.text
                else:
                    responses = '0 responses'

                story_url = story.find('a', class_=story_link_class)['href']

                # load the story itself; the archive page was already parsed
                # into `soup`, so navigating away does not affect this loop
                driver.get(story_url)
                soup_story = BeautifulSoup(driver.page_source, 'lxml')

                # retrieve the full subtitle, place a marker if none is found
                full_subtitle_tag = soup_story.find('h2', class_='pw-subtitle-paragraph')
                if full_subtitle_tag is not None:
                    full_subtitle = full_subtitle_tag.text
                else:
                    full_subtitle = '-'

                # collect all the information and add it to the list
                data_stories.append([author_url, user_id, date, reading_time,
                                     title, subtitle, full_subtitle, claps,
                                     responses, story_url])

            # take a break between archive pages
            sleep(np.random.randint(1, 10))
    finally:
        # always close the browser so no Chrome process is left behind
        driver.quit()

    return data_stories

## Collect and Save the Data

In [None]:
# publication name as it appears in the archive url
publication = 'towardsdatascience'

# column names for the monthly dataframes, one per field of the rows
# returned by retrieve_metadata
data_cols = [
    'author_url',
    'user_id',
    'date',
    'reading_time',
    'title',
    'subtitle',
    'claps',
    'responses',
    'story_url',
]

def retrieve_publications(publication, year):

    '''
    The function cycles through the days and months of one year to extract
    the information. The data of each month is saved in its own csv file,
    data/tds_<year>_<month>.csv; the data folder is created if missing.

    NOTE: the function relies on how_many_days, retrieve_metadata and
    data_cols, which must all be defined before it is called.

    INPUT:
        publication (str) - name of publication as it appears on url
        year (int) - a year between 2010 and 2022
    OUTPUT:
        none - data is saved in a file
    '''

    # make sure the output folder exists before any scraping starts, so a
    # month of scraped data is not lost to a failing to_csv call
    os.makedirs('data', exist_ok=True)

    for month in range(1, 13):
        # the number of days in the particular month
        num_days = how_many_days(year, month)
        # scrape all the blogs from that month
        all_month = retrieve_metadata(publication, year, [month],
                                      range(1, num_days + 1))
        # create a dataframe from the retrieved data
        month_df = pd.DataFrame(all_month, columns=data_cols)
        # save the dataframe into a file
        month_df.to_csv(f'data/tds_{year}_{month}.csv', index=False)

In [None]:
# scrape every month of 2022; each month is written to its own csv file.
# NOTE: retrieve_publications calls retrieve_metadata, which is defined in
# a later cell and must already be defined when this call runs
retrieve_publications(publication, 2022)

## Collect data in stages - Alternative approach

In [None]:
# retrieve metadata from daily archive pages
def retrieve_metadata(publication, year, months, days):
    '''
    Scrape the archive pages for the requested dates and collect the
    information shown there for each post. Individual posts are not
    opened, so subtitles may be truncated.

    The function performs the following steps:
        - builds the archive urls for the requested dates
        - loads each archive page and creates a BeautifulSoup object
        - extracts information about each post
        - stores all the extracted information in a list

    The function pauses for a random 1-5 seconds after each archive page.
    The browser is always closed when the function ends, even if an
    error is raised while scraping.

    INPUT:
        publication (str) - name of publication as it appears on url
        year (int) - a year between 2010 and 2022
        months(list) - a list of months to scrape data from
        days (list) - days to scrape data from
    OUTPUT:
        data_stories (list) - list of lists, one per post, holding
            [author_url, user_id, date, reading_time, title,
             trunc_subtitle, claps, responses, story_url]
            Missing title/subtitle are marked with '-', missing claps
            with 0 and missing responses with '0 responses'.
    '''

    # css classes that identify the elements of interest
    story_class = 'streamItem streamItem--postPreview js-streamItem'
    author_class = 'postMetaInline u-floatLeft u-sm-maxWidthFullWidth'
    claps_class = ('button button--chromeless u-baseColor--buttonNormal '
                   'js-multirecommendCountButton u-disablePointerEvents')
    responses_class = 'button button--chromeless u-baseColor--buttonNormal'
    story_link_class = ('button button--smaller button--chromeless '
                        'u-baseColor--buttonNormal')

    # instantiate the driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # empty list to collect the posts information
    data_stories = []

    # list of urls for the specified dates
    list_urls = url_to_retrieve(publication, year, months, days)

    try:
        for url in list_urls:
            # load the archive page and parse its source
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # find all the stories published during the specified period
            stories = soup.find_all('div', class_=story_class)

            for story in stories:
                author_box = story.find('div', class_=author_class)
                author_link = author_box.find('a')
                author_url = author_link['href']
                user_id = author_link['data-user-id']
                date = author_box.find('time')['datetime']
                reading_time = author_box.find('span', class_='readingTime')['title']

                title_tag = story.find('h3')
                title = title_tag.text if title_tag is not None else '-'
                # some subtitles are truncated in the archive page
                subtitle_tag = story.find('h4')
                trunc_subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

                claps_button = story.find('button', class_=claps_class)
                claps = claps_button.text if claps_button is not None else 0

                responses_link = story.find('a', class_=responses_class)
                if responses_link is not None:
                    responses = responses_link.text
                else:
                    responses = '0 responses'

                story_url = story.find('a', class_=story_link_class)['href']

                # collect all the information and add it to the list
                data_stories.append([author_url, user_id, date, reading_time,
                                     title, trunc_subtitle, claps, responses,
                                     story_url])

            # take a break once per archive page: parsing the stories of a
            # page sends no request, so pausing per story only slows the run
            sleep(np.random.randint(1, 6))
    finally:
        # always close the browser so no Chrome process is left behind
        driver.quit()

    return data_stories

In [None]:
def retrieve_subtitle(stories_list, filename):
    '''
    Scrape the title and the full subtitle of each post and save them.

    The function performs the following steps:
        - loads each of the provided urls
        - creates a BeautifulSoup object
        - extracts title and subtitle for each post
        - stores all the extracted information in a list
        - saves the data in a csv file with columns url, title, subtitle

    A missing title or subtitle is marked with '-', so a single page
    without the expected tags does not abort the whole run.
    The browser is always closed when the function ends, even if an
    error is raised while scraping.

    INPUT:
        stories_list (iterable of str) - individual post urls
        filename (str) - name of file where the data is saved
    OUTPUT:
        none - data is saved in a file
    '''

    # instantiate the driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # empty list to collect the posts information
    data_stories = []

    try:
        for url in stories_list:
            # load the post and parse its source
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # place a marker when the page has no <h1> title
            title_tag = soup.find('h1')
            title = title_tag.text if title_tag is not None else '-'

            # the subtitle is looked up as an <h2> first, then as an <h3>
            subtitle_tag = (soup.find('h2', class_='pw-subtitle-paragraph')
                            or soup.find('h3', class_='pw-subtitle-paragraph'))
            subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

            # collect all the information and add it to the list
            data_stories.append([url, title, subtitle])
    finally:
        # always close the browser so no Chrome process is left behind
        driver.quit()

    # create a dataframe from the retrieved data
    subtitles_df = pd.DataFrame(data_stories, columns=['url', 'title', 'subtitle'])
    # save the dataframe into a file
    subtitles_df.to_csv(filename, index=False)

    # take a break once the file has been written
    sleep(np.random.randint(1, 15))

In [None]:
def retrieve_all_subtitles(year, months):

    '''
    For each requested month, the function reads tds_<year>_<month>.csv,
    takes its story_url column and passes the urls to retrieve_subtitle,
    which scrapes the title and the full subtitle of each post and saves
    them to sub_tds_<year>_<month>.csv.

    INPUT:
        year (int) - a year between 2010 and 2022
        months (list) - specified months as a list
    OUTPUT:
        none - data is saved to a file
    '''

    for month in months:
        # input file with the post urls and output file for the subtitles
        monthly_csv = f'tds_{year}_{month}.csv'
        output_csv = f'sub_tds_{year}_{month}.csv'

        # urls of all the posts collected for that month
        story_urls = pd.read_csv(monthly_csv).story_url

        # extract the subtitles and save the data to a file
        retrieve_subtitle(story_urls, output_csv)

## Combine Files 

In [None]:
# create a csv file that combines all the individual downloads
# glob returns the files in arbitrary order, so the paths are sorted to
# make the row order of the merged file reproducible between runs
monthly_files = sorted(glob.glob(os.path.join('data/data_raw/data_monthly', '*.csv')))

merged_dfs = pd.concat([pd.read_csv(csv_file, index_col=None) for csv_file in monthly_files],
                       axis=0, ignore_index=True)

merged_dfs.to_csv('data/data_raw/merged.csv', index=False)

In [None]:
# reload the merged file from disk and preview its first rows
dfs = pd.read_csv('data/data_raw/merged.csv', index_col=None)
dfs.head()

In [None]:
# (number of rows, number of columns) of the merged data
dfs.shape