In [14]:
import json
import os
import re

from google.cloud import storage
from google.cloud import vision

In [2]:
# Notebook-wide configuration.
# PROJECT_ID is captured from the gcloud CLI via IPython's `!` shell capture and is
# also used as the Cloud Storage bucket name by the helper functions further down.
PROJECT_ID = !gcloud config get-value project # returns SList
PROJECT_ID = PROJECT_ID[0] # gets first element in list -> str
# SERVICE_ACCOUNT and ZONE are not referenced by any other cell visible in this notebook.
SERVICE_ACCOUNT = "sa-objectdetection" # Replace with a name of your choice
# NOTE(review): "us-central1" looks like a region name rather than a zone
# (zones usually carry a suffix such as "-a") — confirm what is intended.
ZONE = "us-central1"# Make sure the zone is set to "us-central1"

In [3]:
from google.cloud import bigquery
# BigQuery client bound to PROJECT_ID; used by the table-load cell at the end of the notebook.
bq = bigquery.Client(project=PROJECT_ID)

In [47]:
# Cloud Storage client shared by all helper functions below. The project is pinned
# explicitly, matching the BigQuery client, instead of relying on the ambient default.
storage_client = storage.Client(project=PROJECT_ID)

In [4]:
# Local directory that the (commented-out) gsutil copy command below writes the PDFs into.
PDF_DIR = "./dataset/pdf"

In [9]:
# Sanity check: number of directory entries currently present in PDF_DIR.
len(os.listdir(PDF_DIR))

404

# Import pdf files from storage

In [10]:
# Copying pdfs into the project bucket:
# !gsutil -m cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/*.pdf  ./dataset/pdf

# Optical Character Recognition

## Vision API

In [1]:
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=1):
    """Start an asynchronous OCR job for a PDF/TIFF file stored on GCS.

    Builds a single ``AsyncAnnotateFileRequest`` using the
    ``DOCUMENT_TEXT_DETECTION`` feature and submits it with
    ``ImageAnnotatorClient.async_batch_annotate_files``.

    Link to documentation (types):
    https://googleapis.dev/python/vision/latest/vision_v1/types.html#google.cloud.vision_v1.types.OutputConfig

    Args:
        gcs_source_uri: ``gs://`` URI of the input document.
        gcs_destination_uri: ``gs://`` URI passed as the output destination.
        mime_type: Input type; supported values are ``'application/pdf'``
            and ``'image/tiff'``.
        batch_size: How many pages should be grouped into each JSON output file.

    Returns:
        Whatever ``async_batch_annotate_files`` returns (the handle of the
        submitted job). The job is only *started* here; callers that need the
        output files must wait on the returned handle before reading them.
    """
    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # Where to read the document from.
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri),
        mime_type=mime_type)

    # Where to write the JSON results to.
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config,
    )

    # Hand the operation back instead of discarding it so the caller can wait
    # for completion or inspect errors.
    return client.async_batch_annotate_files(requests=[async_request])


def rename():
    """Strip the trailing ``output-1-to-1.json`` from blob names in the bucket.

    Every blob in the ``PROJECT_ID`` bucket whose name ends with
    ``output-1-to-1.json`` is renamed to the same name with that suffix removed
    (presumably the suffix is added by the OCR job when it writes its output —
    confirm against the Vision API output naming).

    Only the trailing occurrence is removed; the rest of the name is left
    untouched even if it happens to contain similar text.
    """
    suffix = "output-1-to-1.json"
    bucket = storage_client.get_bucket(PROJECT_ID)
    # Materialise the listing first so renames do not interfere with iteration.
    for blob in list(bucket.list_blobs()):
        if blob.name.endswith(suffix):
            # Plain slicing instead of a regex: no metacharacter surprises and
            # no accidental replacement in the middle of the name.
            bucket.rename_blob(blob, blob.name[:-len(suffix)])
            
def create_json(wait_timeout=600):
    """Run ``async_detect_document`` for every PDF under ``labeled_patents/pdf/``.

    For each blob in the ``PROJECT_ID`` bucket whose name starts with
    ``labeled_patents/pdf/`` and ends with ``.pdf``, an OCR job is started with
    the destination ``gs://{PROJECT_ID}/labeled_patents/text/<basename>.json``.
    The loop index is printed after each submission as a progress indicator.

    Args:
        wait_timeout: Seconds to wait for each submitted job to finish once all
            jobs have been started. Pass ``None`` to return immediately after
            submission. Waiting only happens for jobs whose handle is returned
            by ``async_detect_document``; if that helper returns ``None`` there
            is nothing to wait on.
    """
    bucket = storage_client.get_bucket(PROJECT_ID)
    # Let the server filter by prefix instead of listing the whole bucket.
    pdf_names = [
        blob.name
        for blob in bucket.list_blobs(prefix="labeled_patents/pdf/")
        if blob.name.endswith(".pdf")
    ]

    pending = []
    for idx, blob_name in enumerate(pdf_names):
        full_src_path = f"gs://{PROJECT_ID}/{blob_name}"
        # Swap the trailing "pdf" for "json", keeping the dot.
        new_filename = os.path.basename(blob_name)[:-3] + "json"
        full_dst_path = f"gs://{PROJECT_ID}/labeled_patents/text/{new_filename}"
        operation = async_detect_document(full_src_path, full_dst_path)
        if operation is not None:
            pending.append(operation)
        print(idx)

    # Block until the OCR output exists so follow-up steps that read or rename
    # the result files do not run against half-finished jobs.
    if wait_timeout is not None:
        for operation in pending:
            operation.result(timeout=wait_timeout)

def create_text_file_from_json():
    """Write a ``.txt`` blob next to every OCR ``.json`` blob.

    For each blob in the ``PROJECT_ID`` bucket whose name starts with
    ``labeled_patents/text/`` and ends with ``.json``, the JSON is downloaded,
    the ``text`` of the first response's ``fullTextAnnotation`` is extracted,
    and the result is uploaded under the same name with ``json`` replaced by
    ``txt``.

    If the JSON has no responses or the first response carries no
    ``fullTextAnnotation`` / ``text`` entry, an empty string is uploaded rather
    than aborting the whole run with a ``KeyError``/``IndexError``.
    """
    bucket = storage_client.get_bucket(PROJECT_ID)
    # Materialise the listing up front: new .txt blobs are created under the
    # same prefix while we iterate.
    json_blobs = [
        blob
        for blob in bucket.list_blobs(prefix="labeled_patents/text/")
        if blob.name.endswith(".json")
    ]
    for json_blob in json_blobs:
        response = json.loads(json_blob.download_as_string())
        responses = response.get("responses") or [{}]
        text = responses[0].get("fullTextAnnotation", {}).get("text", "")
        # Drop the trailing "json" (keeping the dot) and append "txt".
        txt_name = json_blob.name[:-4] + "txt"
        bucket.blob(txt_name).upload_from_string(text)

def get_text(gcs_path):
    """Return the contents of the ``.txt`` blob that corresponds to a PDF path.

    The basename of ``gcs_path`` has its last three characters replaced by
    ``txt`` and is looked up under ``labeled_patents/text/`` in the
    ``PROJECT_ID`` bucket.

    Args:
        gcs_path: Path or URI of the source document, e.g. one ending in ``.pdf``.

    Returns:
        The blob contents decoded as UTF-8.

    Raises:
        FileNotFoundError: If no matching text blob exists in the bucket.
    """
    txt_name = os.path.basename(gcs_path)[:-3] + "txt"
    blob_name = f"labeled_patents/text/{txt_name}"
    # Build the bucket handle locally rather than reading a global `bucket`
    # that only exists once a later notebook cell has been executed.
    blob = storage_client.bucket(PROJECT_ID).get_blob(blob_name)
    if blob is None:
        raise FileNotFoundError(f"gs://{PROJECT_ID}/{blob_name}")
    return blob.download_as_string().decode("utf-8")

## Tikaparser

## Manual check: which is better?

# Save text in txt files and upload to storage

In [None]:
# GCS prefixes for the source PDFs and for the OCR output. The helper functions
# defined earlier build these same prefixes inline; they do not read these variables.
gcs_source_uri = f"gs://{PROJECT_ID}/labeled_patents/pdf/"
gcs_destination_uri = f"gs://{PROJECT_ID}/labeled_patents/text/"


In [None]:
# create_json()
# rename()
# create_text_file_from_json()

# Create csv

In [None]:
import pandas as pd
bucket = storage_client.get_bucket(PROJECT_ID)
blob_names = [os.path.basename(blob.name) for blob in list(bucket.list_blobs()) if blob.name.startswith("labeled_patents/pdf/") and blob.name.endswith(".pdf")]

pdfs = [os.path.join(f"gs://{PROJECT_ID}/labeled_patents/pdf/{name}") for name in blob_names]
df_ocr = pd.DataFrame(pdfs, columns=['gcs_path'])
df_ocr["text"] = df_ocr.apply(lambda row: get_text(row.gcs_path), axis=1)
# create csv
df_ocr.to_csv("./dataset/ocr.csv", header=False, index=False)
# save to storage
!gsutil -m cp ./dataset/ocr.csv  gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/ocr.csv

# Create new table

schema:

- `gcs_path`: path to the PDF file
- `text`: text content

In [None]:
# create bigquery table and upload csv
# NOTE(review): table id and source URI are hard-coded to one project rather
# than derived from PROJECT_ID — confirm before running elsewhere.
table_id = "qwiklabs-gcp-00-373ac55d0e0a.labeled_patents_preprocessed.ocr"

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("gcs_path", "STRING"),
        bigquery.SchemaField("text", "STRING")
    ],
    # The CSV is written without a header row, so nothing is skipped.
    skip_leading_rows=0,
    # The source format defaults to CSV, so the line below is optional.
    source_format=bigquery.SourceFormat.CSV,
    # OCR text contains line breaks inside quoted fields.
    allow_quoted_newlines=True,
    # Replace the table contents on each run; the default (append) would
    # duplicate every row whenever this cell is re-executed.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = "gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/ocr.csv"

load_job = bq.load_table_from_uri(
    uri, table_id, job_config=job_config
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = bq.get_table(table_id)  # Make an API request.
print("Loaded {} rows.".format(destination_table.num_rows))