In [1]:
import sys
sys.path.append('../..')

import os
from urllib import request
import bs4
import pandas as pd
import numpy as np
from tabula import read_pdf
from datetime import datetime

from source.bs4.scraper import Scraper

# Directory holding the data files: resolved next to this file when
# `__file__` is defined, otherwise (e.g. in an interactive/notebook session)
# taken relative to the current working directory.
if '__file__' in locals():
    data_files = os.path.join(os.path.dirname(__file__), 'data_files')
else:
    data_files = 'data_files'

In [4]:
class NYCCovidDataScraper(Scraper):
    """Extract category breakdowns from NYC DOH COVID-19 daily summary PDFs.

    ``parse`` takes the HTML of the archive page, finds the links to the
    daily data summary PDFs, reads the first page of each PDF with tabula and
    returns the "Age Group", "Age 50 and over", "Sex" and "Borough" sections
    as one long DataFrame.

    How ``page`` and ``csv_path`` are consumed (fetching, caching, ``load``)
    is decided by the ``Scraper`` base class, which is defined elsewhere.
    """

    csv_path = os.path.join(data_files, 'nyc-covid-daily-data-summary.csv')
    page = 'https://www1.nyc.gov/site/doh/covid/covid-19-data-archive.page'
    basePath = 'https://www1.nyc.gov'

    # The link text carries no year; every parsed date is placed in this year.
    _YEAR = 2020
    # Accepted link-text layouts, tried from most to least specific.
    _DATE_FORMATS = ('%B %d %I:%M %p', '%B %d %I %p', '%B %d')
    # (section header in the PDF table, number of rows belonging to it)
    _CATEGORIES = (
        ('Age Group', 6),
        ('Age 50 and over', 2),
        ('Sex', 3),
        ('Borough', 6),
    )
    _COLUMNS = ['date', 'category', 'key', 'value', 'abs', 'pct']

    def parseDate(self, asof):
        """Turn a link caption into a ``datetime`` in year ``_YEAR``.

        The tokens ``'(as of'``, ``'.'`` and ``')'`` are removed first, then
        the remainder is matched against ``_DATE_FORMATS`` in order.

        Raises:
            ValueError: if the text matches none of the known formats.
        """
        for token in ('(as of', '.', ')'):
            asof = asof.replace(token, '')
        asof = asof.strip()

        for fmt in self._DATE_FORMATS:
            try:
                # Parse the year together with the rest: strptime defaults to
                # 1900 (not a leap year), so "February 29" would be rejected
                # if the year were only patched in afterwards.
                return datetime.strptime(f'{asof} {self._YEAR}', f'{fmt} %Y')
            except ValueError:
                continue
        raise ValueError(f'Unrecognised date text: {asof!r}')

    def parseLink(self, link):
        """Return ``(href, datetime)`` for an anchor tag, dating it by its text."""
        href = link.get('href')
        text = link.get_text()
        return (href, self.parseDate(text))

    def slice(self, df, val, num_rows=1, skip_row=True):
        """Cut the rows belonging to section ``val`` out of a key/value table.

        The section is located by the first row whose ``'key'`` equals
        ``val``. With ``skip_row`` true the rows labelled ``idx+1`` through
        ``idx+num_rows`` are returned (the header row is dropped); otherwise
        the rows ``idx`` through ``idx+num_rows`` are returned. Label
        arithmetic assumes an integer index, as produced by a freshly read
        table.

        The result gains three columns: ``'category'`` (= ``val``),
        ``'abs'`` (first run of digits in ``'value'``) and ``'pct'`` (digits
        directly before a ``%``), the latter two as extracted strings/NaN.
        If ``val`` is not present, an empty frame with ``df``'s columns is
        returned.
        """
        idx = df.index[df['key'] == val]
        if idx.empty:
            return pd.DataFrame(columns=df.columns)
        idx = idx[0]

        start = idx + 1 if skip_row else idx
        # .loc slicing is label-based and inclusive on both ends.
        slice_df = df.loc[start:idx + num_rows].copy()
        slice_df['category'] = val
        # e.g. "1726 (2%)" -> abs="1726", pct="2"; "207" -> abs="207", pct=NaN
        val_splits = slice_df['value'].str.extract(
            r'(?P<abs>\d+)([\s|(]+(?P<pct>\d+)%)?')
        slice_df['abs'] = val_splits['abs']
        slice_df['pct'] = val_splits['pct']
        return slice_df

    def parse(self, source):
        """Parse the archive page HTML into a long table of daily breakdowns.

        Args:
            source: HTML of the archive page.

        Returns:
            DataFrame with columns ``date, category, key, value, abs, pct``;
            empty (but with those columns) when no summary PDF is linked.
        """
        soup = bs4.BeautifulSoup(source, 'lxml')
        # href=True skips anchors without an href (link.get('href') is None
        # for those and the substring test would raise TypeError). Dates are
        # only parsed for the summary PDFs that are actually used.
        summary_pages = [
            self.parseLink(link)
            for link in soup.find_all('a', href=True)
            if '/assets/doh/downloads/pdf/imm/' in link['href']
            and 'daily-data-summary-0' in link['href']
        ]

        # Collect the pieces and concatenate once instead of re-copying a
        # growing frame on every iteration.
        frames = []
        for page, dt in summary_pages:
            print(f'{page}')
            tables = read_pdf(f'{self.basePath}{page}', output_format='dataframe',
                              pages='1', stream=True)
            df = tables[0]
            df.columns = ['key', 'value']
            df['key'] = df['key'].str.replace('- ', '')
            df['date'] = dt

            frames.extend(self.slice(df, category, num_rows)
                          for category, num_rows in self._CATEGORIES)

        if not frames:
            return pd.DataFrame(columns=self._COLUMNS)
        # reindex (rather than [...]) so missing sections cannot raise KeyError.
        return pd.concat(frames).reindex(columns=self._COLUMNS)
    
# Run the scraper. `load` is inherited from Scraper (defined elsewhere);
# presumably refresh=True forces a fresh download and saveToCsv=False skips
# writing csv_path -- confirm against Scraper.load.
scraper = NYCCovidDataScraper().load(refresh=True, saveToCsv=False)

/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-04092020-2.pdf


In [5]:
# Display the value returned by load(); note that `scraper` holds that result
# (rendered as a table below), not the scraper object itself.
scraper

Unnamed: 0,date,category,key,value,abs,pct
2,2020-04-09 17:00:00,Age Group,0 to 17,1726 (2%),1726,2.0
3,2020-04-09 17:00:00,Age Group,18 to 44,33906 (39%),33906,39.0
4,2020-04-09 17:00:00,Age Group,45 to 64,31472 (36%),31472,36.0
5,2020-04-09 17:00:00,Age Group,65 to 74,10990 (13%),10990,13.0
6,2020-04-09 17:00:00,Age Group,75 and over,9424 (11%),9424,11.0
7,2020-04-09 17:00:00,Age Group,Unknown,207,207,
9,2020-04-09 17:00:00,Age 50 and over,Yes,44394 (51%),44394,51.0
10,2020-04-09 17:00:00,Age 50 and over,No,43124 (49%),43124,49.0
12,2020-04-09 17:00:00,Sex,Female,40135 (46%),40135,46.0
13,2020-04-09 17:00:00,Sex,Male,47193 (54%),47193,54.0
