<a href="https://colab.research.google.com/github/ScottTeran/ga_capstone/blob/main/code/01_extraction_vision_api_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text extraction using Google Cloud Vision API
Credit goes to Silvia Zeamer for the first 60% of this notebook. The other 40% goes to Caroline Schmidt for figuring out how to get JSON files back out of GCS and into txt format. 

In [1]:
# this has to be installed for each new runtime on Google Colab

# !pip install google-cloud-vision

In [None]:
# this has to be installed for each new runtime

# !pip install google-cloud-storage

In [None]:
# using this to check versions

# pip freeze

In [None]:
# Authenticate to Google Cloud by pointing GOOGLE_APPLICATION_CREDENTIALS at a
# credentials JSON file. Approach taken from
# [https://stackoverflow.com/questions/45501082/set-google-application-credentials-in-python-project-to-use-google-api]

import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/content/ra-lafferty-13f7704670eb.json" # path to my key file (JSON); it must exist at this path in the runtime

In [None]:
import json
import re

from google.cloud import vision
from google.cloud import storage

In [None]:
# this code is from Silvia Zeamer [https://towardsdatascience.com/how-to-extract-the-text-from-pdfs-using-python-and-the-google-cloud-vision-api-7a0a798adc13]
# also, code included from Vision API documentation [https://cloud.google.com/vision/docs/fulltext-annotations]

def async_detect_document(gcs_source_uri, gcs_destination_uri, timeout=420):
    '''
    Run asynchronous Vision API document text detection on a PDF stored
    in GCS and have the JSON results written back to GCS. Blocks until
    the operation has finished.

    gcs_source_uri: 'gs://bucket/path/file.pdf' URI of the PDF to read.
    gcs_destination_uri: 'gs://bucket/prefix/' URI prefix under which the
      output JSON files are written.
    timeout: maximum number of seconds to wait for the operation to
      finish. Pass a larger value for very long PDFs.

    Returns None. Any exception raised by operation.result() (timeout or
    a failed operation) propagates to the caller.
    '''

    mime_type = 'application/pdf'

    # how many PDF pages will go in each output JSON file (100 is max)
    batch_size = 100

    # the tool that annotates text in a PDF
    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # telling the Vision API that source type is PDF (mime_type) and where it can be found
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    # generate JSON files with up to batch_size pages worth of data each
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination,
        batch_size=batch_size)

    # an asynchronous request using input and output configs
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    # batch annotate files using the client and async_request set up above
    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    # result() has to be *called*: a bare `operation.result` only names the
    # method, so the function would return immediately without waiting and
    # without surfacing any error from the operation.
    operation.result(timeout=timeout)

In [None]:
# OCR the PDF and have the JSON results written under txt_files/ in the same bucket
async_detect_document(
    gcs_source_uri='gs://ra_lafferty_pdfs/the_devil_is_dead.pdf',
    gcs_destination_uri='gs://ra_lafferty_pdfs/txt_files/',
)

Waiting for the operation to finish.


---

In [None]:
# GCS prefix that holds the Vision API JSON output; defined once so the cells
# below don't have to repeat the URI (note from Caroline Schmidt)
base = 'gs://ra_lafferty_pdfs/txt_files/'

In [None]:
def make_blob_list(gcs_destination_uri, verbose=False):

  '''
  Returns a list of the blobs (GCS objects) found under a GCS URI.

  gcs_destination_uri: 'gs://bucket/prefix' URI. The prefix is optional;
    'gs://bucket' or 'gs://bucket/' lists every blob in the bucket.
  verbose: if True, print each blob name (for troubleshooting).

  Raises ValueError if the URI is not of the form 'gs://bucket[/prefix]'.
  '''

  # split the URI into bucket name and (optional) object prefix; validate
  # first so a malformed URI fails with a clear message before any API call
  match = re.match(r'gs://([^/]+)(?:/(.*))?', gcs_destination_uri)
  if match is None:
    raise ValueError(
        f"Expected a GCS URI like 'gs://bucket/prefix', got {gcs_destination_uri!r}")
  bucket_name = match.group(1)
  # an absent or empty prefix means "no prefix filter"
  prefix = match.group(2) or None

  # client to bundle configuration needed for API requests
  storage_client = storage.Client()

  # generate bucket var
  bucket = storage_client.get_bucket(bucket_name)

  # make blob list
  blob_list = list(bucket.list_blobs(prefix=prefix))

  # For troubleshooting purposes, print blob names
  if verbose:
    for blob in blob_list:
      print(blob.name)

  return blob_list

In [None]:
# List the OCR output blobs under `base` for inspection. write_to_text builds
# its own list from the URI, so it does not depend on this variable.
blob_list = make_blob_list(gcs_destination_uri=base, verbose=False)

In [None]:
def blob_to_text(blob, verbose=False):
  '''
  Downloads one blob, parses it as JSON and returns the text of every
  response in it that has a 'fullTextAnnotation', concatenated into a
  single string. Responses without that key are skipped.
  Set verbose=True to print the blob name and the response/text counts.
  '''

  if verbose:
    print(f'Now processing: {blob.name}')

  payload = json.loads(blob.download_as_string())
  all_responses = payload['responses']

  # keep only the responses that actually carry extracted text
  page_texts = []
  for response in all_responses:
    if 'fullTextAnnotation' in response:
      page_texts.append(response['fullTextAnnotation']['text'])

  if verbose:
    print('Response count:', len(all_responses))
    print('Texts count:', len(page_texts))

  return ''.join(page_texts)

In [None]:
def _natural_key(name):
  '''Sort key that compares the runs of digits in `name` by numeric value.'''
  # re.split with a capturing group alternates text, digits, text, ... so the
  # odd positions are always the digit runs
  parts = re.split(r'(\d+)', name)
  return [int(part) if i % 2 else part for i, part in enumerate(parts)]


def write_to_text(gcs_destination_uri, verbose=False, write=True, filename="lafferty.txt"):

  '''
  Accepts a GCS URI and returns one string containing the texts of all
  blobs found under it, joined in page order. By default the string is
  also written to disk as `filename` (UTF-8); set write=False to skip
  that. Optional verbose parameter for troubleshooting.

  NOTE: every blob under the URI is included. If output files from an
  earlier run are still there (e.g. two files that both start at page
  201), their pages end up in the text twice, so clear out stale output
  before running.
  '''

  blob_list = make_blob_list(gcs_destination_uri, verbose=verbose)

  # Listing order is alphabetical, which would put 'output-1001-to-1100'
  # ahead of 'output-101-to-200'; sort on the numbers so pages stay in order.
  blob_list = sorted(blob_list, key=lambda blob: _natural_key(blob.name))

  blob_texts = [blob_to_text(blob, verbose=verbose) for blob in blob_list]
  texts = ''.join(blob_texts)

  if write:
    # explicit encoding so the result does not depend on the platform default
    with open(filename, "w", encoding="utf-8") as f:
      f.write(texts)

  return texts

In [None]:
# Build the full text and save it to disk; when run in Google Colab the
# output file is written to 'content/'
t = write_to_text(gcs_destination_uri=base, verbose=True, write=True)

txt_files/output-1-to-100.json
txt_files/output-101-to-200.json
txt_files/output-201-to-226.json
txt_files/output-201-to-285.json
Now processing: txt_files/output-1-to-100.json
Response count: 100
Texts count: 97
Now processing: txt_files/output-101-to-200.json
Response count: 100
Texts count: 100
Now processing: txt_files/output-201-to-226.json
Response count: 26
Texts count: 26
Now processing: txt_files/output-201-to-285.json
Response count: 85
Texts count: 85
