# Text Classification
Input: prepared PDFs in a Cloud Storage bucket
Output: BigQuery tables with the classification results

In [96]:
# Notebook-wide configuration. The first line is an IPython shell escape, so
# this cell only runs inside IPython/Jupyter.
PROJECT = !gcloud config get-value project # returns SList
PROJECT = PROJECT[0] # gets first element in list -> str
REGION = "us-central1"  
MODEL_RESOURCE_NAME = "" # TODO: set before calling run_automl_text_batch (passed to aiplatform.Model)
import os
# Export the settings as environment variables (presumably for use in `!` shell cells).
os.environ["PROJECT"] = PROJECT
os.environ["REGION"] = REGION

In [2]:
from google.cloud import bigquery
# Shared BigQuery client bound to the configured project; used by load_to_bigquery.
bq = bigquery.Client(project=PROJECT)

In [3]:
import logging

# Configure the root logger at INFO level and create the notebook's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
from google.cloud import storage
from google.cloud import vision

In [89]:
import time
from datetime import datetime
import json
import pandas as pd
import tempfile 

'2021-08-16_143228'

## TO-DO List:
1. Receive input: Bucket with PDF files (str) DONE
2. Preprocess/Transform PDF for task ONGOING
    1. OCR --> Text DONE
    2. maybe: save as JSONL or text file, documentation unclear: https://cloud.google.com/vertex-ai/docs/predictions/batch-predictions#text_1
3. Batch Predict ONGOING
4. Save result in storage DONE (csv)
5. Load into BigQuery DONE (csv)


In [None]:
# Reference only: gsutil commands that copy three sample PDFs into the dev
# bucket. They are kept inside a string literal, so running this cell does
# not execute them.
'''
gsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/us_076.pdf gs://2021_08_16_tcn_dev/2021-08-16/us_076.pdf
gsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/med_tech_8.pdf gs://2021_08_16_tcn_dev/2021-08-16/med_tech_8.pdf
gsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/computer_vision_1.pdf gs://2021_08_16_tcn_dev/2021-08-16/computer_vision_1.pdf
'''

In [133]:
# Step 1: main entry point
def predict(bucket_name, folder=None):
    """Run the AutoML text classifier on the PDFs of a GCS bucket/folder.

    The PDFs are OCR'd by ``preprocess`` and then handed to
    ``run_automl_text_batch``, whose output is written below a timestamped
    ``automl-tcn-<time>`` prefix in the same bucket. Saving the results to
    storage and loading them into BigQuery (steps 4 and 5) are still disabled.

    Args:
        bucket_name: Name of the GCS bucket (without the ``gs://`` scheme).
        folder: Optional object-name prefix inside the bucket. When falsy,
            the whole bucket is processed.

    Returns:
        The list of prediction dicts returned by ``run_automl_text_batch``.
    """
    logger.info("Starting text classification.")

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Step 2: Preprocess/Transform PDF for task
    logger.info("preprocessing files")
    text_list = preprocess(bucket, bucket_name, folder)

    # Step 3: predict with automl
    logger.info("predict with AutoML")
    str_time = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    if folder:
        gcs_destination = f"gs://{bucket_name}/{folder}/automl-tcn-{str_time}"
    else:
        gcs_destination = f"gs://{bucket_name}/automl-tcn-{str_time}"

    results = run_automl_text_batch(bucket, bucket_name, folder, text_list, gcs_destination)

    # TODO: re-enable once step 3 produces usable results.
    #     # Step 4: save result in storage
    #     logger.info("save results to storage")
    #     path_to_csv = save_to_storage(bucket, bucket_name, folder, results)
    #
    #     # Step 5: Load storage result in BQ
    #     logger.info("load results into BigQuery")
    #     status = load_to_bigquery(bucket, bucket_name, path_to_csv)
    #
    #     logger.info(f"finished task with status {status}")

    # Hand the predictions back to the caller instead of dropping them.
    return results
    

# Step 2:
def preprocess(bucket, bucket_name, folder=None):
    """OCR every PDF in the bucket/folder and store the extracted text.

    For each ``*.pdf`` object an asynchronous Vision OCR request is started
    via ``async_detect_document``. The function then polls the bucket until
    every expected ``<name>.pdfoutput-1-to-1.json`` object exists, reads the
    text of the first response from each, and uploads it as ``<name>.pdf.txt``
    next to the OCR output.

    Output location: ``<folder>-text/`` when ``folder`` is given, otherwise
    the bucket root.

    NOTE: only the first OCR output file per PDF is read, so text from
    further pages is ignored. Polling has no timeout; a failed OCR request
    makes this function wait forever.

    Args:
        bucket: ``google.cloud.storage`` bucket object to read from/write to.
        bucket_name: Name of that bucket, used to build ``gs://`` URIs.
        folder: Optional object-name prefix selecting the PDFs to process.

    Returns:
        List with the extracted text of each PDF, in listing order. Empty
        when no PDF was found.
    """
    ocr_suffix = "output-1-to-1.json"

    # Let the server filter by prefix instead of listing the whole bucket.
    pdf_blobs = [
        blob
        for blob in bucket.list_blobs(prefix=folder or None)
        if blob.name.endswith(".pdf")
    ]
    if not pdf_blobs:
        logger.info("no pdf files found, nothing to preprocess")
        return []

    ocr_blob_names = []
    for blob in pdf_blobs:
        base_name = os.path.basename(blob.name)
        # The OCR destination always uses the basename, so the expected
        # output name must be derived from the same value.
        dst_name = f"{folder}-text/{base_name}" if folder else base_name

        gcs_source_uri = f"gs://{bucket_name}/{blob.name}"
        gcs_destination_uri = f"gs://{bucket_name}/{dst_name}"
        ocr_blob_names.append(dst_name + ocr_suffix)

        async_detect_document(gcs_source_uri, gcs_destination_uri)
        logger.info(
            "started ocr for %s. Source uri: %s. Destination uri: %s",
            blob.name, gcs_source_uri, gcs_destination_uri,
        )

    # The OCR requests run asynchronously and may finish in any order, so
    # wait for every output file rather than only the last one.
    for ocr_name in ocr_blob_names:
        ocr_blob = bucket.blob(ocr_name)
        while not ocr_blob.exists():
            logger.info("waiting on operation to finish")
            time.sleep(5)

    # Create the .txt files from the OCR responses.
    list_of_text = []
    for ocr_name in ocr_blob_names:
        response = json.loads(bucket.blob(ocr_name).download_as_string())
        text = response['responses'][0]['fullTextAnnotation']['text']
        list_of_text.append(text)

        txt_path = ocr_name.replace(ocr_suffix, ".txt")
        bucket.blob(txt_path).upload_from_string(text)

    return list_of_text

# Step 3: predict with automl
# WARNING: confirm what `content` should contain (raw text vs. GCS URIs).
# TODO: it is currently passed on unchanged as `gcs_source`.
def run_automl_text_batch(bucket, bucket_name, folder, content, gcs_destination):
    """Run a batch prediction job and collect its results from storage.

    Args:
        bucket: ``google.cloud.storage`` bucket object holding the job output.
        bucket_name: Name of that bucket, used to build ``gs://`` URIs.
        folder: Optional folder used for the fallback ``file`` value.
        content: Sequence of inputs; only the first element is submitted as
            ``gcs_source``.
        gcs_destination: ``gs://`` URI prefix the job writes its output to.

    Returns:
        List of ``{'file': ..., 'subject': ..., 'score': ...}`` dicts, one per
        prediction record. Empty when ``content`` is empty.
    """
    if not content:
        logger.warning("no input content, skipping batch prediction")
        return []

    # NOTE(review): only content[0] is submitted and it is passed as
    # `gcs_source`; confirm that callers provide GCS URIs of input files here
    # (``predict`` currently passes OCR text).
    create_batch_prediction_job(
        PROJECT,
        REGION,
        MODEL_RESOURCE_NAME,
        job_display_name="tcn_batch",
        gcs_source=[content[0]],
        gcs_destination=gcs_destination,
        sync=True)

    # list_blobs expects an object-name prefix relative to the bucket, passed
    # by keyword (its first positional parameter is max_results).
    bucket_root = f"gs://{bucket_name}/"
    if gcs_destination.startswith(bucket_root):
        prefix = gcs_destination[len(bucket_root):]
    else:
        prefix = gcs_destination

    file_root = f"gs://{bucket_name}/{folder}" if folder else f"gs://{bucket_name}"

    # read results
    results = []
    for blob in bucket.list_blobs(prefix=prefix):
        fallback_file = f"{file_root}/{os.path.basename(blob.name)}"
        raw = blob.download_as_string().decode("utf-8")

        # NOTE(review): output is assumed to be JSON Lines with one
        # {"instance": ..., "prediction": ...} record per line -- confirm
        # against the files an actual job writes.
        for line in raw.splitlines():
            if not line.strip():
                continue
            response = json.loads(line)

            prediction = response.get("prediction")
            if not prediction:
                logger.warning("skipping record without prediction in %s", blob.name)
                continue

            names = prediction.get("displayNames", prediction.get("displaynames"))
            confidences = prediction.get("confidences")
            if not names or not confidences:
                logger.warning("skipping record without labels in %s", blob.name)
                continue

            # Report the label with the highest confidence; this equals the
            # first entry whenever the service already sorts them.
            best = max(range(len(confidences)), key=confidences.__getitem__)

            results.append({
                'file': response.get("instance", {}).get("content", fallback_file),
                'subject': names[best],
                'score': confidences[best],
            })

    return results


def create_batch_prediction_job(
    project: str,
    location: str,
    model_resource_name: str,
    job_display_name: str = "tcn_batch",
    gcs_source: "str | list[str] | None" = None,
    gcs_destination: "str | None" = None,
    sync: bool = True,
):
    """Start a batch prediction job for a model and wait for it.

    ``gcs_source`` and ``gcs_destination`` only carry ``None`` defaults so
    that ``job_display_name`` can have one without changing parameter order;
    both are still required.

    Args:
        project: Cloud project ID passed to ``aiplatform.init``.
        location: Region passed to ``aiplatform.init``.
        model_resource_name: Resource name passed to ``aiplatform.Model``.
        job_display_name: Display name of the job.
        gcs_source: Input location(s), forwarded to ``batch_predict``.
        gcs_destination: Output prefix, forwarded as
            ``gcs_destination_prefix``.
        sync: Forwarded to ``batch_predict``.

    Returns:
        The batch prediction job object returned by ``batch_predict``.

    Raises:
        TypeError: If ``gcs_source`` or ``gcs_destination`` is missing.
    """
    if gcs_source is None or gcs_destination is None:
        raise TypeError(
            "create_batch_prediction_job() requires both 'gcs_source' and "
            "'gcs_destination'"
        )

    # Imported here because the notebook has no top-level aiplatform import.
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=location)

    my_model = aiplatform.Model(model_resource_name)

    batch_prediction_job = my_model.batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        sync=sync,
    )

    # Also blocks when the job was started with sync=False.
    batch_prediction_job.wait()

    logger.info(batch_prediction_job.display_name)
    logger.info(batch_prediction_job.resource_name)
    logger.info(batch_prediction_job.state)
    return batch_prediction_job


# Step 4: save result in storage
def save_to_storage(bucket, bucket_name, folder, results: list):
    """Save batch predictions to Cloud Storage as a header-less CSV.

    The CSV is built in memory and uploaded directly, so no file is left
    behind in the working directory. Columns are always written in the order
    ``file, subject, score``, independent of the key order of the dicts.

    Args:
        bucket: ``google.cloud.storage`` bucket object to upload to.
        bucket_name: Name of that bucket, used to build the returned URI.
        folder: Unused; the file is always written to the bucket root.
        results: Records of the form::

            [
            {'file': ..., 'subject': ..., 'score': ...},
            {'file': ..., 'subject': ..., 'score': ...},
            ]

    Returns:
        The ``gs://`` URI of the uploaded CSV.
    """
    # NOTE: the misspelled name is kept because existing paths refer to it.
    filename = "predicions.csv"
    dst_path = f"gs://{bucket_name}/{filename}"

    # A header-less CSV is positional, so pin the column order explicitly.
    df = pd.DataFrame.from_records(results, columns=["file", "subject", "score"])
    csv_text = df.to_csv(index=False, header=False)

    bucket.blob(filename).upload_from_string(csv_text, content_type="text/csv")

    return dst_path

# Step 5: Load storage result in BQ ---  great for utils.py
def load_to_bigquery(bucket, bucket_name, gcs_path):
    """Load a header-less predictions CSV from Cloud Storage into BigQuery.

    The target dataset is created when it does not exist yet, then the CSV is
    loaded into its ``docprocessing_tcn`` table with the schema
    ``file STRING, subject STRING, score FLOAT``.

    Args:
        bucket: Unused.
        bucket_name: Unused.
        gcs_path: ``gs://`` URI of the CSV file to load.

    Returns:
        Number of rows in the destination table after the load.
    """
    # NOTE: dataset is hard-coded for now. For a fresh dataset per run use
    # f"{bq.project}.docprocessing_{datetime.now():%Y%m%d_%H%M%S}" instead.
    dataset_id = "qwiklabs-gcp-00-373ac55d0e0a.docprocessing_20210816_153004"
    table_id = f"{dataset_id}.docprocessing_tcn"
    print(dataset_id)

    # Construct a full Dataset object to send to the API.
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "US"

    # exists_ok=True makes creation idempotent, so no separate existence
    # check (and no NotFound handling) is needed.
    dataset = bq.create_dataset(dataset, exists_ok=True, timeout=30)  # Make an API request.
    logger.info("Using dataset %s.%s", bq.project, dataset.dataset_id)

    # create bigquery table and upload csv
    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("file", "STRING"),
            bigquery.SchemaField("subject", "STRING"),
            bigquery.SchemaField("score", "FLOAT"),
        ],
        # save_to_storage writes the CSV without a header row.
        skip_leading_rows=0,
        # The source format defaults to CSV, so the line below is optional.
        source_format=bigquery.SourceFormat.CSV,
        allow_quoted_newlines=True,
    )

    load_job = bq.load_table_from_uri(
        gcs_path, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = bq.get_table(table_id)  # Make an API request.
    print("Loaded {} rows.".format(destination_table.num_rows))
    return destination_table.num_rows

# helper functions ---- great candidates for utils.py
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=1):
    """Start an asynchronous OCR request for a PDF/TIFF file on GCS.

    Link to documentation (types): https://googleapis.dev/python/vision/latest/vision_v1/types.html#google.cloud.vision_v1.types.OutputConfig

    The request is only started; this function does not wait for it.

    Args:
        gcs_source_uri: ``gs://`` URI of the file to OCR.
        gcs_destination_uri: ``gs://`` URI prefix for the JSON output files.
        mime_type: Type of the source file. Supported mime_types are:
            'application/pdf' and 'image/tiff'.
        batch_size: How many pages should be grouped into each json output
            file. ``preprocess`` looks for ``...output-1-to-1.json`` and so
            relies on the default of 1.

    Returns:
        The operation object returned by ``async_batch_annotate_files``, so
        callers can inspect or wait for the request if they want to.
    """
    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # source
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    # destination
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination,
        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config
    )

    # Return the handle instead of discarding it.
    return client.async_batch_annotate_files(requests=[async_request])

In [81]:
# list_of_dst_uri

In [116]:
# predict(bucket_name="2021_08_16_tcn_dev", folder="2021-08-16")

In [136]:
# Scratch setup: bucket handle for running the individual pipeline steps
# (see the commented-out calls below) against the dev bucket.
bucket_name = "2021_08_16_tcn_dev"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
    
# list_of_dst_uri = preprocess(bucket, bucket_name="2021_08_16_tcn_dev", folder="2021-08-16")

# save_to_storage(bucket, bucket_name, None, results=[
#     {'file': 'myfile1', 'subject': 'A', 'score': 1},
#     {'file': 'myfile2', 'subject': 'B', 'score': 2},
#     {'file': 'myfile3', 'subject': 'C', 'score': 3},
#     ])

# load_to_bigquery(bucket, bucket_name, gcs_path='gs://2021_08_16_tcn_dev/predicions.csv')

In [137]:
# list_of_dst_uri

In [138]:
!gsutil ls gs://2021_08_16_tcn_dev/2021-08-16-text

gs://2021_08_16_tcn_dev/2021-08-16-text/computer_vision_1.pdf.txt
gs://2021_08_16_tcn_dev/2021-08-16-text/computer_vision_1.pdfoutput-1-to-1.json
gs://2021_08_16_tcn_dev/2021-08-16-text/med_tech_8.pdf.txt
gs://2021_08_16_tcn_dev/2021-08-16-text/med_tech_8.pdfoutput-1-to-1.json
gs://2021_08_16_tcn_dev/2021-08-16-text/us_076.pdf.txt
gs://2021_08_16_tcn_dev/2021-08-16-text/us_076.pdfoutput-1-to-1.json


In [118]:
# dst_uris