In [3]:
import os
import json
import requests

file_out_mapping = "filename_map_to_database.txt"

from helpers import check_gazette_filenames as cf
from helpers import json_extraction as je
from helpers import write_urls as wu 
from helpers import dest_fn_from_url as df
from helpers import create_db_mapping as db

This notebook walks the user through calling functions that ensure, as far as possible, that all Gazettes in our filesystem are named accurately, that we have no duplicates, and that every text version of a Gazette can be linked back to where it was originally stored in CfA's database. 

The **first step** is to rename files to correct potential issues. Some manual error checking is recommended, and this process could definitely be improved.

The **second step** is to create a mapping between all files in our database and all files in the original CfA database. It is common for one file in our database to map to multiple files in the original databases. 

### Step One: rename files to accurately reflect contents

Naming issues are described in the "check_gazette_filenames" file.

Note that `check_filename`, called below, will print information and instructions. Manually checking that its output is reasonable is highly recommended. The cell below will prompt you to approve or deny each suggested rename, so that files correctly reflect their contents. 

* The functions for checking special issues (i.e., whether it is a special issue or not) are extremely reliable. 
* The date functions are relatively reliable, but we recommend at minimum ensuring that the dates displayed are valid dates.
* The volume/issue numbering checks return errors sometimes. We'd recommend manually confirming these -- cross-checking suggested renames with the gazettes in the original database -- especially if the issue or volume numbering only differs by one character (e.g., "cii" vs. "ci"). 

Note that, as the `rename_gazettes` code stands, gazettes that are incorrectly labelled as special issues are **automatically renamed** (without prompting). Change this if you would like to be more cautious. 

In [None]:
def in_date_range(f, yr_start, yr_end):
    '''
    Report whether the filename f mentions any year from yr_start through
    yr_end, with yr_end itself included.

    The check is a plain substring match on each year's decimal digits, so
    the digits may appear anywhere in f. Returns False when yr_start is
    greater than yr_end, because no years are tested.
    '''
    return any(str(year) in f for year in range(yr_start, yr_end + 1))

# Directory that is scanned for gazette files (note the trailing slash:
# names are appended to it directly when opening).
filepath = "/home/dssg-cfa/ke-gazettes/"

# Keep only the directory entries whose name mentions a year from 2000
# through 2009.
filename_list = list(
    filter(lambda name: in_date_range(name, 2000, 2009), os.listdir(filepath))
)

# Parse each selected file as JSON and hand the filename plus parsed
# content to the filename checker.
for fn in filename_list:
    with open(filepath + fn) as gazette_file:
        gazette_data = json.load(gazette_file)
    cf.check_filename(fn, gazette_data)

### Step Two: create a mapping between filenames in our ("DSSG") database and the original CfA databases.

This should associate unique filenames in our system with unique document IDs and other identifying information in the CfA systems. This can easily be flipped to access information the other way around. 

We do this by: 
* Getting and saving an OCR version of only the first page of all Gazettes
* For each Gazette, we then: 
   + save the metadata
   + get the true filename, using the same process as was used to access all filenames (above)

We end up with a dictionary with the following structure: 
* **Key**: Name in our database
* **Value**: a dictionary with: 
    + src_database: source database(s) (list) 
    + names_in_db: names under which the file is stored in the database(s) (list) 
    + checksums: if src_database includes connected-africa, then the file will have a checksum, or hash value, associated. This is unique to the content, but different PDF scans of the same Gazette may have different hash values. (list)
    + docid: if src_database includes connected-africa, then the file will be identified by a unique document id. This is unique to the file in the CfA database; no two files, even if they have the same name and/or same content, have the same document id. (list) 
    + docnums: if src_database includes gazeti, then the file will be identified by a unique document number. (These have the additional benefit that you can build a URL that redirects to the document PDF out of this number -- see `get_url_gazeti` below.) (list)  
* **Additional keys**: 
    + "failed_to_map_from_cfa_db": maps to a dictionary whose keys are names in the original CfA database and whose values are the corresponding filenames converted to our standard format. For these files, neither the filename taken directly from the OCR'd Gazette nor the filename derived from how it is stored in the database matches any filename in our database. 
    + "empty_files": files that are empty in the original database
    
**We save this map for future reference.**

In [None]:
# Call this to write destination URLs with metadata, once per source
# database. Each writer receives its own output path; the original cell
# passed an undefined name (`filepath_out_url`) to both calls, which raises
# NameError on a fresh kernel.

filepath_out_url_ca = "/home/dssg-cfa/final_dest_urls/dest_urls_ca_metadata"
wu.write_ca_urls_metadata(filepath_out_url_ca, yr_start=2000)

filepath_out_url_gaz = "/home/dssg-cfa/final_dest_urls/dest_urls_gazeti_metadata"
wu.write_gazeti_urls_metadata(filepath_out_url_gaz, yr_start=2000)

In [None]:
# Bulk OCR of first pages (resizing as needed), beginning with the
# connected-africa database.
# `failures_ca` is handed to the OCR helper and printed afterwards;
# presumably the helper appends failed items to it -- confirm in
# helpers/json_extraction.
failures_ca = list()
je.bulk_ocr_first_pg(filepath_out_url_ca, "connected-africa", failures_ca)
print(failures_ca)

In [None]:
# Repeat the first-page bulk OCR for the Gazeti database.
# The path is assigned again here, so this cell does not rely on the
# URL-writing cell having been run.
filepath_out_url_gaz = "/home/dssg-cfa/final_dest_urls/dest_urls_gazeti_metadata"
# Passed to the OCR helper and printed afterwards; presumably filled with
# failed items by the helper -- confirm in helpers/json_extraction.
failures_gaz = list()
je.bulk_ocr_first_pg(filepath_out_url_gaz, "gazeti", failures_gaz)
print(failures_gaz)

In [2]:
# Build the filename mapping described in the Step Two notes above.
# Per those notes, the helper draws on the first page of each Gazette,
# the metadata stored when the URLs were grabbed, and the "check_filename"
# functions. The result is kept in `fn_mapping`, which the final cell
# writes to disk.
# NOTE(review): the helper takes no arguments here, so it presumably reads
# its inputs from locations fixed inside helpers/create_db_mapping -- confirm.

fn_mapping = db.get_info()

Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find issue number; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'No'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find issue number; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find issue number; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; check manually
Unable to find string 'Vol'; chec

In [4]:
# Persist the mapping: serialise `fn_mapping` as JSON into the path held in
# `file_out_mapping`, overwriting any existing file at that path.
with open(file_out_mapping, 'w') as mapping_file:
    json.dump(fn_mapping, mapping_file)