In [None]:
# Data manipulation libraries
import numpy as np
import pandas as pd
from collections import defaultdict

import matplotlib.pyplot as plt
# System libraries
import glob
import os
import sys

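# Library for reading through pdf
# pip install pymupdf   (PyMuPDF is the package that provides the `fitz` module)
# NOTE: do not `pip install fitz` - that is an unrelated PyPI package and can break `import fitz`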
import fitz

**Data Collection**  
A bash script downloads the PDF reports from the [Google COVID-19 Community Mobility Reports site](https://www.google.com/covid19/mobility/) and stores them in the `mobility_pdfs/` folder.

In [None]:
%%bash

# Download the Google mobility report PDFs (US states + selected countries)
# into the mobility_pdfs/ folder.
mkdir -p mobility_pdfs/
cd mobility_pdfs/ || exit 1
# hacky way to create a list of states in USA (multi-word names use "_" as in the report file names)
states_list="Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada New_Hampshire New_Jersey New_Mexico New_York North_Carolina North_Dakota Ohio Oklahoma Oregon Pennsylvania Rhode_Island South_Carolina South_Dakota Tennessee Texas Utah Vermont Virginia Washington West_Virginia Wisconsin Wyoming"
# Shell assignments must not have spaces around "=": with spaces bash tries to
# run a command named after the variable and the variable stays empty.
# country_list is informational only; the loop below uses the two-letter codes.
country_list="US Spain Italy France Germany"
country_list_short="US ES IT FR DE"
date="2020-04-05"
base_url="https://www.gstatic.com/covid19/mobility"

# Get the pdfs for the mobility information of the states
# -f makes curl fail on HTTP errors instead of saving the error page as a .pdf
for state in $states_list ; do
    curl -fsS -O "${base_url}/${date}_US_${state}_Mobility_Report_en.pdf" \
        || echo "download failed: US ${state}" >&2
done
# Get the pdf for the US country
curl -fsS -o "${date}_US_US_Mobility_Report_en.pdf" "${base_url}/${date}_US_Mobility_Report_en.pdf" \
    || echo "download failed: US" >&2

for country in $country_list_short ; do
    curl -fsS -O "${base_url}/${date}_${country}_Mobility_Report_en.pdf" \
        || echo "download failed: ${country}" >&2
done

**PDF Parser**  
This helper, adapted from a script found online, parses the raw content stream of a plot embedded in a page and recovers the plotted data points.

In [None]:
def parse_streaming_data(stream):
    """Extract the plotted points from the decoded content stream of one plot.

    The stream is scanned line by line:

    * lines ending in `` l`` contribute an ``(x, y)`` point taken from their
      first two tokens;
    * lines ending in `` m`` are counted as the start of a new patch;
    * every other line (including `` cm`` transformation-matrix lines) is
      ignored, because only the raw, untransformed coordinates are returned.

    Parameters
    ----------
    stream : str
        Decoded text of the content stream.

    Returns
    -------
    dict
        ``data``     -- ``(n, 2)`` numpy array of points. When the last point
                        has ``x == 0`` the y values are measured relative to
                        that point, rescaled by ``100/60`` and rows with
                        ``y == 0`` are dropped; otherwise the raw points are
                        returned unchanged.
        ``npatches`` -- number of `` m`` lines seen.
        ``good``     -- True only when the last point has ``x == 0`` and the
                        stream holds exactly one patch.

    A stream without any `` l`` line yields an empty ``(0, 2)`` array and
    ``good=False`` instead of raising ``IndexError``.
    """
    points = []
    npatches = 0
    for line in stream.splitlines():
        if line.endswith(" l"):
            x, y = map(float, line.split()[:2])
            points.append([x, y])
        elif line.endswith(" m"):
            npatches += 1

    if not points:
        # Nothing was drawn: report an unusable plot rather than crashing.
        return dict(data=np.empty((0, 2)), npatches=npatches, good=False)

    data_raw = np.array(points)
    # The last point is used as the reference (baseline) of the plot.
    basex, basey = data_raw[-1]
    good = False
    if basex == 0.:
        data_raw[:, 1] = basey - data_raw[:, 1]
        # presumably 60 plot units correspond to 100 percentage points -- TODO confirm
        data_raw[:, 1] *= 100/60.
        data_raw = data_raw[data_raw[:, 1] != 0.]
        if npatches == 1:
            good = True
    return dict(data=np.array(data_raw), npatches=npatches, good=good)

**PDF Parser**  
The method below parses a PDF page by iterating over its text lines and, based on the specified conditions, collects the matching information.

In [None]:
def parse_page(doc, ipage, verbose=False):
    """Extract the per-county mobility series from one page of a report PDF.

    The page text is scanned for county names, category headings and the
    "... compared to baseline" headline figures; the readable plots of the
    page are then paired, in order, with the (county, category) entries that
    have a headline figure.

    Parameters
    ----------
    doc : document object
        Must provide ``getPageText``, ``getPageXObjectList`` and ``xrefStream``
        (the ``fitz.Document`` API used throughout this notebook).
    ipage : int
        Zero-based page index.
    verbose : bool, optional
        When True, print the intermediate parsing state for debugging.

    Returns
    -------
    list of dict
        One dict per paired plot with the keys ``county``, ``category``,
        ``change`` (headline percentage from the text), ``values``, ``dates``
        and ``changecalc`` (value read from the right-most point of the plot).
        An empty list is returned when nothing could be paired.
    """
    # Set of the categories required
    categories_list = [
        "Retail & recreation",
        "Grocery & pharmacy",
        "Parks",
        "Transit stations",
        "Workplace",
        "Residential",
    ]

    counties = []
    curr_county = None
    curr_category = None
    data = defaultdict(lambda: defaultdict(list))
    pagetext = doc.getPageText(ipage)
    lines = pagetext.splitlines()
    # Candidate x-axis tick labels: three-token lines among the last ten lines.
    tickdates = list(filter(lambda x: len(x.split()) == 3, set(lines[-10:])))
    for line in lines:
        # Removing unwanted data from the page
        if ("* Not enough data") in line: continue
        if ("needs a significant volume of data") in line: continue

        # If found the category line, add it to the dictionary, else keep iterating over.
        if any(line.startswith(category) for category in categories_list):
            curr_category = line
        elif curr_category:
            data[curr_county][curr_category].append(line)

        # Filtering data to find the county information
        if (all(category not in line for category in categories_list)
            and ("compared to baseline" not in line)
            and ("Not enough data" not in line)
           ):
            # Only two counties per page
            if len(data.keys()) == 2: break
            counties.append(line)
            curr_county = line

    if verbose:
        print(ipage, counties, "\n")

    # Reduce the collected lines to {county: {category: headline percentage}}.
    # newdata is initialised unconditionally so an empty page cannot raise NameError.
    newdata = {}
    for county in data:
        newdata[county] = {}
        for category in data[county]:
            # Skipping the ones with no data. We get to know that based on the space and * in the Pdf
            if category.endswith(" "): continue
            temp = [x for x in data[county][category] if "compared to baseline" in x]
            if not temp: continue
            percent = int(temp[0].split()[0].replace("%", ""))
            newdata[county][category.strip()] = percent
    data = newdata

    # Create a list of counties and the available categories for the given county
    tomatch = []
    for county in counties:
        # .get: a county line that never collected any category has no entry in data
        county_data = data.get(county, {})
        for category in categories_list:
            if category in county_data:
                tomatch.append([county, category, county_data[category]])

    # Get the readable plots from the page ( Since there are broken and empty plots in the page)
    readableplots = []
    xrefs = sorted(doc.getPageXObjectList(ipage), key=lambda x: int(x[1].replace("X", "")))
    for xref in xrefs:
        stream = doc.xrefStream(xref[0]).decode()
        info = parse_streaming_data(stream)
        if not info["good"]: continue
        readableplots.append(info)

    if verbose:
        print(data)
        print(len(tomatch), "entries to match,", len(readableplots), "readable plots")

    # NOTE(review): entries and plots are paired purely by position; when the
    # two counts differ the surplus is silently dropped by zip -- confirm that
    # this is intended.
    pairs = list(zip(tomatch, readableplots))
    if not pairs:
        return []

    # Parse the dates as text and then based on min to max value, create a range of dates.
    # This depends only on the page's tick labels, so it is built once per page.
    ts = [pd.Timestamp(x.split(None, 1)[-1] + ", 2020") for x in tickdates]
    low, high = min(ts), max(ts)
    dr = [str(x).split()[0] for x in pd.date_range(low, high, freq="D")]
    # presumably the plot's x axis spans 0..200 units -- TODO confirm
    lutpairs = list(zip(np.linspace(0, 200, len(dr)), dr))

    ret = []
    for m, g in pairs:
        xs = g["data"][:, 0]
        ys = g["data"][:, 1]
        # Value at the right-most x; among ties keep the one with the largest magnitude.
        maxys = ys[np.where(xs == xs.max())[0]]
        maxy = maxys[np.argmax(np.abs(maxys))]

        dates = []
        values = []
        asort = xs.argsort()
        xs = xs[asort]
        ys = ys[asort]
        for x, y in zip(xs, ys):
            # Nearest-neighbour lookup of the date for this x position.
            date = min(lutpairs, key=lambda v: abs(v[0] - x))[1]
            dates.append(date)
            values.append(round(y, 3))

        ret.append(dict(
            county=m[0], category=m[1], change=m[2],
            values=values,
            dates=dates,
            changecalc=maxy,
        ))
    return ret

In [None]:
# Create the dataframe for the county and state data.
def parse_state(state, date="2020-04-05", pdf_dir="mobility_pdfs"):
    """Parse every county page of one US state's mobility report into a DataFrame.

    Parameters
    ----------
    state : str
        State name exactly as it appears in the report file name
        (e.g. ``"Texas"``, ``"New_York"``).
    date : str, optional
        Report date used in the file name.
    pdf_dir : str, optional
        Folder holding the downloaded PDFs. Defaults to ``mobility_pdfs``,
        the folder the download cell writes to.

    Returns
    -------
    pandas.DataFrame
        Columns ``state, county, category, change, changecalc, dates, values,
        page``; empty (but with those columns) when no plot could be parsed.
    """
    columns = ["state", "county", "category", "change", "changecalc", "dates", "values", "page"]
    doc = fitz.Document(f"{pdf_dir}/{date}_US_{state}_Mobility_Report_en.pdf")
    data = []
    try:
        # 2 because we are skipping the first 2 pages from the PDF.
        # The last page is skipped as well (range stops at pageCount-1).
        for i in range(2, doc.pageCount-1):
            for entry in parse_page(doc, i):
                entry["state"] = state
                entry["page"] = i
                data.append(entry)
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()
    # Passing columns keeps the frame well-formed even when data is empty.
    df = pd.DataFrame(data, columns=columns)
    ncounties = df['county'].nunique()
    print(f"Parsed {len(df)} plots for {ncounties} counties in {state}")
    return df

In [None]:
# Create the dataframe for the specific state
string = "Texas"
df = parse_state(string)
csv_name = string +"_mobility.csv"
df.to_csv(csv_name)
# Display the frame that was just parsed instead of re-parsing the whole PDF a second time.
df.head(100)

In [None]:
#Script for scraping the initial two pages of the pdf file
def parse_page_total(doc, ipage, verbose=False):
    """Extract the region-wide mobility series from one of the first two pages.

    Parameters
    ----------
    doc : document object
        Must provide ``getPageText``, ``getPageXObjectList`` and ``xrefStream``
        (the ``fitz.Document`` API used throughout this notebook).
    ipage : int
        Zero-based page index.
    verbose : bool, optional
        When True, print the number of entries and readable plots found.

    Returns
    -------
    list of dict
        One dict per category with the keys ``category``, ``change`` (headline
        percentage from the text), ``values``, ``dates`` and ``changecalc``
        (value read from the right-most point of the plot). An empty list is
        returned when the number of readable plots does not match the number
        of categories with a headline figure.
    """
    category_list = [
        "Retail & recreation",
        "Grocery & pharmacy",
        "Parks",
        "Transit stations",
        "Workplaces",  # They have workplaces there instead of workplace
        "Residential",
    ]

    curr_category = None
    data = defaultdict(list)
    pagetext = doc.getPageText(ipage)
    lines = pagetext.splitlines()
    tickdates = []
    for line in lines:

        if ("* Not enough data") in line: continue
        if ("needs a significant volume of data") in line: continue
        # Added this condition to check for the extra text in the right of the page
        if 'Mobility trends ' in line or 'hubs' in line: continue
        # If found the category line, add it to the dictionary, else keep iterating over.
        if any(line.startswith(category) for category in category_list):
            curr_category = line
        # Checking for x axis in the details
        elif line[:3] in ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'):
            tickdates.append(line)
        # Only signed values are of interest; startswith also copes with empty lines.
        elif not line.startswith(('+', '-')):
            continue
        elif curr_category:
            data[curr_category].append(line)

    newdata = {}
    for category in data:
        # Skipping the ones with no data. We get to know that based on the space and * in the Pdf
        if category.endswith(" "): continue
        # The first signed value after the heading is taken as the headline figure.
        temp = data[category][0]
        percent = int(temp.split()[0].replace("%", ""))
        newdata[category.strip()] = percent
    data = newdata

    tomatch = []
    # Create a list of the available categories, in the canonical order
    for category in category_list:
        if category in data:
            tomatch.append([category, data[category]])

    # Get the readable plots from the page ( Since there are broken and empty plots in the page)
    readableplots = []
    xrefs = sorted(doc.getPageXObjectList(ipage), key=lambda x: int(x[1].replace("X", "")))
    for xref in xrefs:
        stream = doc.xrefStream(xref[0]).decode()
        info = parse_streaming_data(stream)
        if not info["good"]:
            continue
        readableplots.append(info)

    if verbose:
        print(len(tomatch), "entries to match,", len(readableplots), "readable plots")

    ret = []

    # Plots are paired with categories by position, so the counts must agree.
    if len(tomatch) != len(readableplots) or not tomatch:
        return ret

    # Parse the dates as text and then based on min to max value, create a range of dates.
    # This depends only on the page's tick labels, so it is built once per page.
    ts = [pd.Timestamp(x.split(None, 1)[-1] + ", 2020") for x in tickdates]
    low, high = min(ts), max(ts)
    dr = [str(x).split()[0] for x in pd.date_range(low, high, freq="D")]
    # presumably the plot's x axis spans 0..200 units -- TODO confirm
    lutpairs = list(zip(np.linspace(0, 200, len(dr)), dr))

    for m, g in zip(tomatch, readableplots):
        xs = g["data"][:, 0]
        ys = g["data"][:, 1]
        # Value at the right-most x; among ties keep the one with the largest magnitude.
        maxys = ys[np.where(xs == xs.max())[0]]
        maxy = maxys[np.argmax(np.abs(maxys))]

        dates = []
        values = []
        asort = xs.argsort()
        xs = xs[asort]
        ys = ys[asort]
        for x, y in zip(xs, ys):
            # Nearest-neighbour lookup of the date for this x position.
            date = min(lutpairs, key=lambda v: abs(v[0] - x))[1]
            dates.append(date)
            values.append(round(y, 3))

        ret.append(dict(
            category=m[0], change=m[1],
            values=values,
            dates=dates,
            changecalc=maxy,
        ))
    return ret


# Create the dataframe for the country data.

def parse_country(country, date="2020-04-05", pdf_dir="mobility_pdfs"):
    """Parse the two summary pages of one country's mobility report.

    Parameters
    ----------
    country : str
        Two-letter country code used in the report file name (e.g. ``"ES"``).
    date : str, optional
        Report date used in the file name.
    pdf_dir : str, optional
        Folder holding the downloaded PDFs.

    Returns
    -------
    pandas.DataFrame
        One row per parsed plot; ``state`` holds the country code and
        ``county`` is the constant ``'total'``.
    """
    doc = fitz.Document(f"{pdf_dir}/{date}_{country}_Mobility_Report_en.pdf")
    data = []
    try:
        # Only the first two pages are parsed.
        for i in range(2):
            for entry in parse_page_total(doc, i):
                # The country code fills the "state" column (the original
                # referenced an undefined name `state` here).
                entry['state'] = country
                entry['page'] = i
                entry['county'] = 'total'
                data.append(entry)
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()
    df = pd.DataFrame(data)
    return df

In [None]:
# Parse the country-level reports with parse_country (the original called an
# undefined `parse_state_total` and never created df_US).
# Extend country_codes once the codes for the other countries are known.
country_codes = ["US", "ES", "IT", "FR", "DE"]
country_frames = {code: parse_country(code) for code in country_codes}

# Keep the individual frames available under their original names.
df_US = country_frames["US"]
df_ES = country_frames["ES"]
df_IT = country_frames["IT"]
df_FR = country_frames["FR"]
df_DE = country_frames["DE"]

combined = [country_frames[code] for code in country_codes]
df_comb = pd.concat(combined)
df_comb.head(100)