Network Analysis of Vaccination Strategies  
Copyright (C) 2020 by The RAND Corporation  
See LICENSE and README.md for information on usage and licensing

# Policy Search Tool

This notebook implements a simple search tool based on sub-string matching against the raw text version of the downloaded documents.

Author: Gavin Hartnett

Last Updated: Dec 31, 2020

In [1]:
## imports
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import collections
from IPython.display import display, HTML
from datetime import date

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

import matplotlib.style as style 
style.use('seaborn-paper')

## Set-up

In [2]:
## load the document paths
doc_path = '../web_scraping/documents/parsed_text/'

doc_types = [
    'AF_afcec/AF_afcec/',
    'AF_epubs/bases/',
    'AF_epubs/departmental/',
    'AF_epubs/dru/',
    'AF_epubs/foa/',
    'AF_epubs/majcom/',
    'AF_epubs/natlguard/',
    'AF_epubs/numberedAFB/',
    'AF_epubs/space_force/',    
    'AF_epubs/units/',
    'CRS_Reports/CRS_Reports/',
    'DoD/admin_instructions/',
    'DoD/directives/',
    'DoD/dtms/',
    'DoD/instructions/',
    'DoD/manuals/',
    'EO/EO/',
    'GAO/GAO'
]

paths = [doc_path + doc_type for doc_type in doc_types]
docs_per_type = {'AF_epubs':0, 'AF_afcec':0, 'CRS_Reports':0, 'DoD':0, 'EO':0, 'GAO':0}
file_paths = []
fnames = []
for doc_type in doc_types:
    path = doc_path + doc_type    
    fs = sorted(os.listdir(path))
    file_paths.extend([path + f for f in fs])
    fnames.extend([f for f in fs])
    if 'AF_epubs' in doc_type:
        docs_per_type['AF_epubs'] += len(fs)
    if 'AF_afcec' in doc_type:
        docs_per_type['AF_afcec'] += len(fs)        
    if 'CRS_Reports' in doc_type:
        docs_per_type['CRS_Reports'] += len(fs)  
    if 'DoD' in doc_type:
        docs_per_type['DoD'] += len(fs)        
    if 'EO' in doc_type:
        docs_per_type['EO'] += len(fs)
    if 'GAO' in doc_type:
        docs_per_type['GAO'] += len(fs)
        
print('found %i documents' %len(file_paths))
print(docs_per_type)
fnames_pdf = [f[:-4] + '.pdf' for f in fnames]

found 37466 documents
{'AF_epubs': 6824, 'AF_afcec': 1136, 'CRS_Reports': 8709, 'DoD': 1219, 'EO': 998, 'GAO': 18580}


In [3]:
## AFCEC
df_afcec = pd.read_csv('../web_scraping/link_scrapers/logs/AF_afcec_afterDL.csv')
df_afcec['doc type'] = len(df_afcec)*['AF_afcec']

## AF E-Pubs
df_afepubs = pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_bases_afterDL.csv')
df_afepubs = pd.concat([df_afepubs, pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_departmental_afterDL.csv')])
df_afepubs = pd.concat([df_afepubs, pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_dru_afterDL.csv')])
df_afepubs = pd.concat([df_afepubs, pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_foa_afterDL.csv')])
df_afepubs = pd.concat([df_afepubs, pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_majcom_afterDL.csv')])
df_afepubs = pd.concat([df_afepubs, pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_natlguard_afterDL.csv')])
df_afepubs = pd.concat([df_afepubs, pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_numberedAFB_afterDL.csv')])
df_afepubs = pd.concat([df_afepubs, pd.read_csv('../web_scraping/link_scrapers/logs/AF_epubs_units_afterDL.csv')])
df_afepubs['doc type'] = len(df_afepubs)*['AF_epubs']

## CRS Reports
df_crs = pd.read_csv('../web_scraping/link_scrapers/logs/CRS_Reports.csv')
df_crs['doc type'] = len(df_crs)*['CRS_Reports']

## DoD
df_dod = pd.read_csv('../web_scraping/link_scrapers/logs/DoD_admin_instructions_afterDL.csv')
df_dod = pd.concat([df_dod, pd.read_csv('../web_scraping/link_scrapers/logs/DoD_admin_instructions_afterDL.csv')])
df_dod = pd.concat([df_dod, pd.read_csv('../web_scraping/link_scrapers/logs/DoD_directives_afterDL.csv')])
df_dod = pd.concat([df_dod, pd.read_csv('../web_scraping/link_scrapers/logs/DoD_dtms_afterDL.csv')])
df_dod = pd.concat([df_dod, pd.read_csv('../web_scraping/link_scrapers/logs/DoD_instructions_afterDL.csv')])
df_dod = pd.concat([df_dod, pd.read_csv('../web_scraping/link_scrapers/logs/DoD_manuals_afterDL.csv')])
df_dod['doc type'] = len(df_dod)*['DoD']

## EO
df_eo = pd.read_csv('../web_scraping/link_scrapers/logs/EO.csv')
df_eo['doc type'] = len(df_eo)*['EO']

## GAO
df_gao = pd.read_csv('../web_scraping/link_scrapers/logs/GAO_afterDL.csv')
df_gao['doc type'] = len(df_gao)*['GAO']

## concatenate to form a master list
df = pd.concat([df_afcec, df_afepubs, df_crs, df_dod, df_eo, df_gao])

In [4]:
len(df)

38132

The number of documents and the number of successful downloads does not match, perhaps because of password-protection.

In [5]:
print('number of successful downloads: %i' %np.sum(df['download success']))
print('number of documents: %i' %len(file_paths))

number of successful downloads: 37782
number of documents: 37466


In [6]:
df.iloc[-1]

Title                                       Securities and Exchange Commission: Transaction Fee Pilot for NMS Stocks
source                                                                                                   GAO reports
source link                                                 https://www.gao.gov/reports-testimonies/month-in-review/
link                                                                       https://www.gao.gov/assets/700/697608.pdf
link date                                                                                           2021-01-29 15:26
downloaded on                                                                                       2021-01-30 16:32
download success                                                                                                True
file name                                     SecuritiesandExchangeCommissionTransactionFeePilotforNMSStocks_gha.pdf
file path           ../documents/pdfs/GAO/GAO/SecuritiesandExcha

Another issue to be mindful of is the fact in many cases the download only superficially succeeds in producing a pdf document. For example, a fairly common occurrence is that the file will be restricted, perhaps because it is FOUO, and as a result clicking on a link will take you to a pdf with some boilerplate text explaining that the document is restricted. My code isn't yet sophisticated enough to filter these out.

Below is a list of all the duplicated filenames, which for the most part consists of these types of boilerplate documents. In 3 cases it looks like a legitimate document appears more than once with the same name. For example, 'afi13-204v1.pdf' is a filename that exists at both the majcom and departmental levels. I checked that these are actually different documents.

In [7]:
fnames_duplicates = [(item, count) for item, count in collections.Counter(fnames_pdf).items() if count > 1]
fnames_duplicates

[('for_official_use_only.pdf', 7),
 ('fouo.pdf', 3),
 ('generic_(fouo).pdf', 3),
 ('generic_fouo.pdf', 5),
 ('generic_opr.pdf', 2),
 ('generic_opr1.pdf', 2),
 ('generic_restricted.pdf', 5),
 ('generic_stocked_and_issued.pdf', 3),
 ('physical.pdf', 5),
 ('releasability.pdf', 3),
 ('restricted_access.pdf', 8),
 ('stocked_and_issued.pdf', 3),
 ('135002p.pdf', 2),
 ('200017p.pdf', 2)]

Here is the main search function.

In [8]:
df.head(5)

Unnamed: 0,Title,source,source link,link,link date,downloaded on,download success,file name,file path,doc type
0,DoD extends deadline for Tenant Satisfaction Survey,Air Force Civil Engineering Center,https://www.afcec.af.mil/News/,http://www.afcec.af.mil/News/Article-Display/Article/2477052/dod-extends-deadline-for-tenant-satisfaction-survey/,2021-01-29 14:10,2021-01-29 21:23,True,DoD extends deadline for Tenant Satisfaction Survey .pdf,documents/parsed_text/AF_afcec/AF_afcec/DoD extends deadline for Tenant Satisfaction Survey .pdf,AF_afcec
1,"Dr. Martin Luther King Jr. Day: Remember, celebrate, act",Air Force Civil Engineering Center,https://www.afcec.af.mil/News/,http://www.afcec.af.mil/News/Article-Display/Article/2477066/dr-martin-luther-king-jr-day-remember-celebrate-act/,2021-01-29 14:10,2021-01-29 21:23,True,"Dr. Martin Luther King Jr. Day: Remember, celebrate, act.pdf","documents/parsed_text/AF_afcec/AF_afcec/Dr. Martin Luther King Jr. Day: Remember, celebrate, act.pdf",AF_afcec
2,SAF/IE Releases Installation Energy Strategic Plan for Energy Assurance,Air Force Civil Engineering Center,https://www.afcec.af.mil/News/,http://www.afcec.af.mil/News/Article-Display/Article/2477038/safie-releases-installation-energy-strategic-plan-for-energy-assurance/,2021-01-29 14:10,,False,SAF/IE Releases Installation Energy Strategic Plan for Energy Assurance.pdf,documents/parsed_text/AF_afcec/AF_afcec/SAF/IE Releases Installation Energy Strategic Plan for Energy Assurance.pdf,AF_afcec
3,Air Force to host Virtual Industry Exchange for Wright-Patterson AFB,Air Force Civil Engineering Center,https://www.afcec.af.mil/News/,http://www.afcec.af.mil/News/Article-Display/Article/2471393/air-force-to-host-virtual-industry-exchange-for-wright-patterson-afb/,2021-01-29 14:10,2021-01-29 21:23,True,Air Force to host Virtual Industry Exchange for Wright-Patterson AFB .pdf,documents/parsed_text/AF_afcec/AF_afcec/Air Force to host Virtual Industry Exchange for Wright-Patterson AFB .pdf,AF_afcec
4,Meet the 2021 I-WEPTAC MAWG Chairs,Air Force Civil Engineering Center,https://www.afcec.af.mil/News/,http://www.afcec.af.mil/News/Article-Display/Article/2477054/meet-the-2021-i-weptac-mawg-chairs/,2021-01-29 14:10,2021-01-29 21:23,True,Meet the 2021 I-WEPTAC MAWG Chairs.pdf,documents/parsed_text/AF_afcec/AF_afcec/Meet the 2021 I-WEPTAC MAWG Chairs.pdf,AF_afcec


In [9]:
def search(searchwords, case_insensitive=True):

    if case_insensitive:
        searchwords = [w.lower() for w in searchwords]
        
    ## initialize returned hit list
    
    ## use -1 for documents which weren't able to be searched
    #hitlist = [[-1]*len(df) for i in range(len(searchwords))]
    hitlist = [[0]*len(df) for i in range(len(searchwords))]

    ## keep a running list of how many times each word appears in the different types of documents
    ## i.e. DoD, AF, EO
    types = list(docs_per_type.keys())
    df_summary = pd.DataFrame()
    df_summary['total number of docs'] = list(docs_per_type.values())
    df_summary.index = types
    for iw in range(len(searchwords)):
        df_summary['hits: ' + searchwords[iw]] = [0]*len(types)
    
    ## convert search words to all lower case
    if case_insensitive:
        searchwords = [s.lower() for s in searchwords]

    ## loop over all documents
    for i in range(len(df)):
        
        ## load document text (not every entry in the df has a successfully downloaded document)
        doc_string = None
        doc_type = df['doc type'].iloc[i]
        
        path = df['file path'].iloc[i].replace('.pdf', '.txt').replace('/pdfs', '/parsed_text')
        
        ## I changed the directory structure without fixing how the path was recorded for some 
        ## doc sources, here's a quick hack to fix the problem
        if path[0:3] == '../':
            path = path[3:]
        path = '../web_scraping/' + path
        
        try:
            ## default is to convert all letters to lowercase
            if case_insensitive:
                doc_string = open(path, "rb").read().lower()
            else:
                doc_string = open(path, "rb").read()
        
        ## the above fails because some documents were not downloaded, and some were, but the pdf parser failed
        except:
            pass

        if doc_string:
            ## for each document, loop over all search words (also converted to lowercase)
            for iw in range(len(searchwords)):
                sub_str = str.encode(searchwords[iw])
                hits = doc_string.count(sub_str)
                hitlist[iw][i] = hits
                #print(hits)

                ## if there is a hit, record the type of document 
                if hits > 0:
                    df_summary['hits: ' + searchwords[iw]][doc_type] += hits
    
        #print(hitlist)
        #break
        
    ## convert hits into a dataframe
    dfhits = df.copy()
    for iw in range(len(searchwords)):
        dfhits['hits: ' + searchwords[iw]] = hitlist[iw]

    return dfhits, df_summary

## Example Search

In [12]:
search_terms = ['COVID', 'anti-satellite', 'South China Sea']

dfhits, df_summary = search(search_terms)

In [13]:
df_summary

Unnamed: 0,total number of docs,hits: covid,hits: anti-satellite,hits: south china sea
AF_epubs,6824,12,7,0
AF_afcec,1136,137,0,0
CRS_Reports,8709,16049,23,1003
DoD,1219,0,1,2
EO,998,124,0,0
GAO,18580,6,33,25
