In [None]:
import json
import os
import sys
import requests
import time
from io import BytesIO

# for regular expression
import re

# Presumably the absolute path to the repository root on the original machine.
# NOTE(review): no cell visible here reads this value -- it may have been meant
# for a sys.path entry so that `helpers` resolves; confirm, or remove if unused.
ROUTETOROOTDIR = '/home/dssg-cfa/notebooks/dssg-cfa-public/'
from helpers import write_urls as wu
from helpers import dest_fn_from_url as df
from helpers import json_extraction as je
from helpers import check_gazette_filenames as cf


# Text files that hold the final destination URLs -- point these at wherever
# you'd like the URL lists to be saved.
_url_list_dir = "/home/dssg-cfa/final_dest_urls/"
final_dest_file_gazeti = _url_list_dir + "final_dest_urls_gazeti.txt"
final_dest_file_ca = _url_list_dir + "final_dest_urls_ca.txt"

# Path to save output to.
filepath_out = "/home/dssg-cfa/ke-gazettes/"

1. **GET URLS**

Notes: 
* Write out to files, rather than saving in local variable, to avoid having to do this again if something goes wrong
* Note that each of these took us around 20 minutes to run per 1,000 gazettes -- a possible future improvement would be to run these using multiprocessing. 

In [None]:
# See the "write_urls" helper for documentation of write_dest_urls.
# Both sources are written out for the same year range.
year_range = dict(yr_start=2000, yr_end=2009)

wu.write_dest_urls("gazeti", final_dest_file_gazeti, **year_range)
wu.write_dest_urls("connected_africa", final_dest_file_ca, **year_range)

In [None]:
# Load the saved URL lists back into memory, one stripped entry per line.
with open(final_dest_file_ca) as ca_file:
    final_dest_ca = list(map(str.strip, ca_file))

with open(final_dest_file_gazeti) as gazeti_file:
    final_dest_gazeti = list(map(str.strip, gazeti_file))

2. **Send each URL to Microsoft Read API (or other OCR service)** 

We do this in batches (iterating through sublists) to enable some level of manual error checking. 

The below will run as a loop, printing progress updates as it runs. (Multiprocessing does not improve performance here, since there is a limit on calls per second in any Microsoft Cognitive Services subscription.) 

The below **saves failed gazette URLs to a temporary file**, to be processed later. When the Read API fails on a PDF, our script prints a message describing the error, then saves the URL to the list of failed URLs. At the end of the loop, these failed URLs are saved to a file. Please review step 3 below for how we get these PDFs. 

Duplicate filenames will not be sent to the Read API, so you will not be charged. 

In [None]:
# INITIALIZE
# File that the URLs which failed OCR are appended to after each batch.
# It is a relative path, so it resolves against the kernel's working directory.
failed_outfile = "failures.txt"

Instructions: 
* Set `img_url_list` to the full list that you want to process. (We separated these between Gazeti and Connected Africa, but you may wish to concatenate the two.) 
* Use the `start_idx` and `end_idx` variables to process only a small number of elements at a time. 
    + E.g., to test the functions on one URL, set `start_idx = 0` and `end_idx = 1`. 
    + After processing this URL, you may wish to test the function on 10 URLs. To do this, change the values of the variables: `start_idx = 1` and `end_idx = 11`. 
    + Keep going, with bigger numbers as you grow more confident that things are working. 

In [None]:
# Choose which URL list to process. Swap the commented line to switch sources
# (or concatenate the two lists to process both in one pass).
# img_url_list = final_dest_gazeti
img_url_list = final_dest_ca

# Toggle these to loop through the list.
# The window is half-open, [start_idx, end_idx), following Python slice
# semantics: end_idx is exclusive, so len(img_url_list) selects through the
# final URL. (Using len(img_url_list) - 1 here silently skipped the last URL.)
start_idx = 0
end_idx = len(img_url_list)
fin_url_sublist = img_url_list[start_idx:end_idx]

In [None]:
# Process one sub-list of URLs.
# Fresh accumulators for this batch; they are handed to bulk_ocr alongside
# the URLs (presumably filled in by it -- see json_extraction).
duplicates, failures = [], []

je.bulk_ocr(fin_url_sublist, duplicates, failures, flag="url")

# After each batch, append this batch's failed URLs to the failures file.
# If you wish to examine duplicates, you may wish to save those as well.
with open(failed_outfile, "a") as failed_fh:
    for failed_url in failures:
        failed_fh.write("%s\n" % failed_url)


3. **Get JSONs for Gazettes that the Read API failed on in the above code.**

Often, the Read API will fail on a Gazette for one of three reasons: 
* The URL redirects rather than pointing directly to the Gazette. 
* The PDF contains pages that are larger than 17x17 inches, which is the maximum size that the Read API will process. 
* The URL itself is invalid -- there is no PDF for a given Gazette stored in the database. 

The first two of these issues can be solved by downloading the PDF data directly, resizing each page as necessary, and then sending that data as bytes directly to the Read API. We do not do this for all of our PDFs because it is much more computationally expensive than just passing a URL to the Read API. 

In [None]:
# Collects URLs passed as the third argument to the retry call, in case
# you'd like to analyze the invalid ones afterwards.
invalid_urls = []

# Read back every failed URL recorded so far, one stripped entry per line.
with open(failed_outfile) as failed_fh:
    failed_urls = list(map(str.strip, failed_fh))

In [None]:
# Retry every URL recorded in the failures file (loaded into `failed_urls`).
# The in-memory `failures` list only holds the most recent batch and does not
# exist after a kernel restart, so passing it would miss earlier failures.
# NOTE(review): flag = "pdf" presumably selects the download-and-send-bytes
# path described in the step 3 notes -- confirm in helpers/json_extraction.
je.bulk_ocr(
    failed_urls, 
    duplicates, 
    invalid_urls, 
    flag = "pdf"
)