# Text Classification
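Input: a Cloud Storage bucket containing the prepared PDFs.
Output: a newly created BigQuery dataset and table holding the classification results.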

In [147]:
# Notebook configuration: project, region and model used by the functions below.
# The first line uses IPython shell capture, so this cell only runs inside IPython/Jupyter.
PROJECT = !gcloud config get-value project # returns SList
PROJECT = PROJECT[0] # gets first element in list -> str
REGION = "us-central1"  
# Passed to aiplatform.Model(...) in create_batch_prediction_job.
# NOTE(review): looks like a bare model ID rather than a full resource name - confirm.
MODEL_RESOURCE_NAME = "2393478483993952256"
import os
# Export the settings as environment variables for the current process.
os.environ["PROJECT"] = PROJECT
os.environ["REGION"] = REGION

In [2]:
from google.cloud import bigquery
# Module-level BigQuery client bound to PROJECT; load_to_bigquery reads it as a global.
bq = bigquery.Client(project=PROJECT)

In [3]:
import logging

# Configure the root logger at INFO level so the progress messages of the pipeline are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the pipeline functions.
logger = logging.getLogger(__name__)

In [4]:
from google.cloud import storage
from google.cloud import vision
from google.cloud import aiplatform

In [5]:
import time
from datetime import datetime
import json
import pandas as pd
import tempfile 

In [64]:
from google.cloud import aiplatform
from google.cloud.aiplatform.gapic.schema import predict
from google.cloud.aiplatform.v1.schema.predict.instance_v1.types import TextClassificationPredictionInstance
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

## TO-DO List:
1. Receive input: Bucket with PDF files (str) DONE
2. Preprocess/Transform PDF for task ONGOING
    1. OCR --> Text DONE
    2. Possibly save the text as JSONL or plain-text files (the batch-prediction documentation is unclear about the expected input format): https://cloud.google.com/vertex-ai/docs/predictions/batch-predictions#text_1
3. Batch Predict ONGOING
4. Save result in storage DONE (csv)
5. Load into BigQuery DONE (csv)


In [149]:
# Reference only: gsutil commands for copying sample PDFs into the dev bucket.
# They are wrapped in a bare string literal, so running this cell executes nothing.
'''
gsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/us_076.pdf gs://2021_08_16_tcn_dev/2021-08-16/us_076.pdf
gsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/med_tech_8.pdf gs://2021_08_16_tcn_dev/2021-08-16/med_tech_8.pdf
gsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/computer_vision_1.pdf gs://2021_08_16_tcn_dev/2021-08-16/computer_vision_1.pdf
'''

'\ngsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/us_076.pdf gs://2021_08_16_tcn_dev/2021-08-16/us_076.pdf\ngsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/med_tech_8.pdf gs://2021_08_16_tcn_dev/2021-08-16/med_tech_8.pdf\ngsutil cp gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/pdf/computer_vision_1.pdf gs://2021_08_16_tcn_dev/2021-08-16/computer_vision_1.pdf\n'

In [158]:
# Step 1: main entry point, orchestrates steps 2-5
def main_predict(bucket_name):
    """Run the AutoML text classifier on a GCS bucket and push the results to BigQuery.

    The pipeline is: OCR/preprocess the PDFs (``preprocess``), run the batch
    prediction (``run_automl_text_batch``), store the predictions as CSV
    (``save_to_storage``) and load that CSV into BigQuery (``load_to_bigquery``).

    Args:
        bucket_name: Name of the GCS bucket (without ``gs://``) that holds the
            PDFs; all intermediate and result files are written to it as well.

    Returns:
        The value returned by ``load_to_bigquery``, which is also logged as
        the final status.
    """
    logger.info("Starting text classification.")

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Step 2: Preprocess/Transform PDF for task
    logger.info("preprocessing files")
    jsonl_path = preprocess(bucket, bucket_name)

    # Step 3: predict with automl; every run gets its own timestamped output prefix
    logger.info("predict with AutoML")
    str_time = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    gcs_destination = f"gs://{bucket_name}/automl-tcn-{str_time}"
    results = run_automl_text_batch(bucket, bucket_name, jsonl_path, gcs_destination)

    # Step 4: save result in storage
    logger.info("save results to storage")
    path_to_csv = save_to_storage(bucket, bucket_name, results)

    # Step 5: Load storage result in BQ
    logger.info("load results into BigQuery")
    status = load_to_bigquery(bucket, bucket_name, path_to_csv)

    # Use the module logger like the rest of this function (was the root logger).
    logger.info(f"finished task with status {status}")
    return status
    

# Step 2: run ocr, save results as txt files, save paths in jsonl, push jsonl to storage
def preprocess(bucket, bucket_name, poll_interval=5, timeout=1800):
    """Run OCR on every PDF in the bucket and build the JSONL input for batch prediction.

    An asynchronous OCR request is started (``async_detect_document``) for each
    blob whose name ends in ``.pdf``. The function then waits for the
    ``<pdf name>output-1-to-1.json`` result of *every* PDF, uploads the
    recognised text as ``<pdf name>.txt`` and lists all text files in a JSONL
    file created by ``create_jsonl``.

    Only the ``output-1-to-1.json`` result file is read, so with one page per
    output file only the first page of each PDF contributes text.

    Args:
        bucket: Bucket object offering ``list_blobs()`` and ``blob(name)``.
        bucket_name: Name of that bucket, used to build ``gs://`` URIs.
        poll_interval: Seconds to sleep between checks for an OCR result.
        timeout: Maximum total number of seconds to wait for all OCR results;
            ``None`` waits indefinitely.

    Returns:
        The ``gs://`` URI of the JSONL file, as returned by ``create_jsonl``.

    Raises:
        ValueError: If the bucket contains no ``.pdf`` blob.
        TimeoutError: If an OCR result is still missing after ``timeout`` seconds.
    """
    ocr_suffix = "output-1-to-1.json"

    pdf_names = [blob.name for blob in bucket.list_blobs() if blob.name.endswith(".pdf")]
    if not pdf_names:
        raise ValueError(f"no PDF files found in bucket {bucket_name}")

    # Kick off all OCR operations first so they run concurrently.
    for pdf_name in pdf_names:
        gcs_source_uri = f"gs://{bucket_name}/{pdf_name}"
        # Use the full blob name as output prefix so the result is written
        # exactly where it is looked up below, also for PDFs in sub-folders.
        gcs_destination_uri = f"gs://{bucket_name}/{pdf_name}"
        async_detect_document(gcs_source_uri, gcs_destination_uri)
        logger.info(f"started ocr for {pdf_name}. Source uri: {gcs_source_uri}. Destination uri: {gcs_destination_uri}")

    deadline = None if timeout is None else time.monotonic() + timeout
    jsonl_items = []

    for pdf_name in pdf_names:
        ocr_blob = bucket.blob(pdf_name + ocr_suffix)

        # Wait for this PDF's own result; operations may finish in any order.
        while not ocr_blob.exists():
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"OCR result for {pdf_name} not available after {timeout} seconds"
                )
            logger.info("waiting on operation to finish")
            time.sleep(poll_interval)

        ocr_result = json.loads(ocr_blob.download_as_string())
        pages = ocr_result.get("responses") or [{}]
        # Pages without recognised text carry no 'fullTextAnnotation' entry.
        text = pages[0].get("fullTextAnnotation", {}).get("text", "")
        if not text:
            logger.warning(f"no text recognised in {pdf_name}; skipping")
            continue

        txt_path = f"{pdf_name}.txt"
        bucket.blob(txt_path).upload_from_string(text)
        jsonl_items.append({
            "content": f"gs://{bucket_name}/{txt_path}",
            "mimeType": "text/plain"
        })

    # Log the final list once instead of the growing list on every iteration.
    logger.info(f"created jsonl: {json.dumps(jsonl_items)}")

    return create_jsonl(bucket, bucket_name, jsonl_items)

def create_jsonl(bucket, bucket_name, json_list):
    """Serialise ``json_list`` as JSON Lines and upload it to the bucket.

    Each item becomes one JSON document followed by a newline. The file is
    named ``textfiles_<YYYYmmdd_HHMMSS>.jsonl`` using the current local time.

    Args:
        bucket: Bucket object whose ``blob(name)`` result supports
            ``upload_from_string``.
        bucket_name: Name of that bucket, used to build the returned URI.
        json_list: Iterable of JSON-serialisable items.

    Returns:
        The ``gs://`` URI of the uploaded JSONL file.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    jsonl_filename = f"textfiles_{timestamp}.jsonl"
    target_blob = bucket.blob(jsonl_filename)

    # One JSON document per line, every line newline-terminated.
    lines = [json.dumps(entry) + "\n" for entry in json_list]
    target_blob.upload_from_string("".join(lines))

    return f"gs://{bucket_name}/{jsonl_filename}"
        


# Step 3: predict with automl
def run_automl_text_batch(bucket, bucket_name, gcs_source, gcs_destination):
    """Run a batch prediction job and collect the top prediction per document.

    The job is started with ``create_batch_prediction_job`` using the
    notebook-level ``PROJECT``, ``REGION`` and ``MODEL_RESOURCE_NAME``. All
    ``.jsonl`` blobs below ``gcs_destination`` are then read and parsed.

    Args:
        bucket: Bucket object offering ``list_blobs(prefix=...)``.
        bucket_name: Name of that bucket.
        gcs_source: ``gs://`` URI of the JSONL input file.
        gcs_destination: ``gs://`` URI prefix the job writes its output to.

    Returns:
        A list of dicts with the keys ``file`` (instance content URI without a
        trailing ``.txt``), ``subject`` (label with the highest confidence)
        and ``score`` (that confidence).
    """
    create_batch_prediction_job(
        PROJECT,
        REGION,
        model_resource_name=MODEL_RESOURCE_NAME,
        job_display_name="tcn-job",
        gcs_source=gcs_source,
        gcs_destination=gcs_destination,
        sync=True,
    )

    # Turn the gs:// URI into an object-name prefix so only this job's output
    # is listed instead of the whole bucket. Stripping a trailing slash
    # keeps the prefix well defined.
    bucket_root = f"gs://{bucket_name}/"
    output_prefix = gcs_destination.rstrip("/")
    if output_prefix.startswith(bucket_root):
        output_prefix = output_prefix[len(bucket_root):]

    results = []
    for blob in bucket.list_blobs(prefix=output_prefix + "/"):
        if not blob.name.endswith(".jsonl"):
            continue
        blob_str = blob.download_as_string().decode("utf-8")
        # Do not assume a trailing newline: parse every non-empty line.
        for line in blob_str.split("\n"):
            record = _parse_prediction_line(line)
            if record is not None:
                results.append(record)

    return results


def _parse_prediction_line(line):
    """Convert one JSONL output line into a result dict, or None if it holds no prediction."""
    if not line.strip():
        return None

    response = json.loads(line)
    prediction = response.get("prediction")
    if not prediction or not prediction.get("confidences"):
        # Lines without a prediction payload cannot be turned into a result row.
        logger.warning(f"skipping line without prediction: {line[:200]}")
        return None

    confidences = prediction["confidences"]
    # The label lists are not assumed to be sorted: pick the most confident one.
    best = max(range(len(confidences)), key=confidences.__getitem__)

    content = response["instance"]["content"]  # e.g. "gs://bucket/text.txt"
    if content.endswith(".txt"):
        content = content[:-len(".txt")]

    return {
        'file': content,
        'subject': prediction["displayNames"][best],
        'score': confidences[best],
    }


def create_batch_prediction_job(
    project,
    location,
    model_resource_name,
    job_display_name,
    gcs_source,
    gcs_destination,
    sync = True,
):
    """Start a batch prediction for a model and block until the job has finished.

    Args:
        project: Project passed to ``aiplatform.init``.
        location: Location passed to ``aiplatform.init``.
        model_resource_name: Identifier handed to ``aiplatform.Model``.
        job_display_name: Display name of the batch prediction job.
        gcs_source: Input location forwarded to ``batch_predict``.
        gcs_destination: Output prefix forwarded as ``gcs_destination_prefix``.
        sync: Forwarded to ``batch_predict``; the function waits for the job
            in either case.

    Returns:
        The batch prediction job object returned by ``batch_predict``.
    """
    aiplatform.init(project=project, location=location)

    job = aiplatform.Model(model_resource_name).batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        sync=sync,
    )
    job.wait()

    # Log the job's name, resource name and final state, in that order.
    for attribute in ("display_name", "resource_name", "state"):
        logging.info(getattr(job, attribute))

    return job


# Step 4: save result in storage
def save_to_storage(bucket, bucket_name, results: list):
    """Save batch predictions to storage as a header-less CSV and return its URI.

    The CSV is built in memory and uploaded directly, so no local
    ``predictions.csv`` is left behind in the working directory. Columns are
    always written in the order ``file, subject, score`` (the order of the
    BigQuery load schema), independent of the key order of the dicts.

    Args:
        bucket: Bucket object whose ``blob(name)`` result supports
            ``upload_from_string``.
        bucket_name: Name of that bucket, used to build the returned URI.
        results: List of prediction dicts, e.g.
            ``[{'file': ..., 'subject': ..., 'score': ...}, ...]``.
            An empty list produces an empty CSV.

    Returns:
        The ``gs://`` URI of the uploaded ``predictions.csv``.
    """
    filename = "predictions.csv"
    columns = ["file", "subject", "score"]

    df = pd.DataFrame(results, columns=columns)
    csv_content = df.to_csv(index=False, header=False)

    bucket.blob(filename).upload_from_string(csv_content, content_type="text/csv")

    return f"gs://{bucket_name}/{filename}"

# Step 5: Load storage result in BQ ---  great for utils.py
def load_to_bigquery(bucket, bucket_name, gcs_path):
    """Load a CSV from storage into a freshly named BigQuery dataset and table.

    The dataset is named ``docprocessing_<YYYYmmdd_HHMMSS>`` in the project of
    the module-level ``bq`` client and is created in location ``US``; the table
    is ``docprocessing_tcn`` with the columns ``file``, ``subject``, ``score``.

    Args:
        bucket: Unused; kept for a uniform step signature.
        bucket_name: Unused; kept for a uniform step signature.
        gcs_path: ``gs://`` URI of the header-less CSV to load.

    Returns:
        The state of the finished load job (``load_job.state``).
    """
    # create new dataset and table
    str_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    dataset_id = f"{bq.project}.docprocessing_{str_time}"
    table_id = f"{dataset_id}.docprocessing_tcn"
    logger.info(f"dataset_id: {dataset_id}")

    # Create the dataset, tolerating one that already exists. This replaces a
    # get/except-Exception sequence that treated every error (permissions,
    # network, ...) as "dataset not found".
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "US"
    dataset = bq.create_dataset(dataset, exists_ok=True, timeout=30)  # Make an API request.
    logger.info("Dataset {}.{} is ready".format(bq.project, dataset.dataset_id))

    # create bigquery table and upload csv
    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("file", "STRING"),
            bigquery.SchemaField("subject", "STRING"),
            bigquery.SchemaField("score", "FLOAT"),
        ],
        # The CSV is written without a header row.
        skip_leading_rows=0,
        # The source format defaults to CSV, so the line below is optional.
        source_format=bigquery.SourceFormat.CSV,
        allow_quoted_newlines=True,
    )

    load_job = bq.load_table_from_uri(
        gcs_path, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete; raises if the load failed.

    destination_table = bq.get_table(table_id)  # Make an API request.
    print("Loaded {} rows.".format(destination_table.num_rows))

    # Give the caller a real status to report instead of None.
    return load_job.state

# helper functions ---- great candidates for utils.py
def async_detect_document(gcs_source_uri, gcs_destination_uri, mime_type='application/pdf', batch_size=1):
    """Start asynchronous OCR (document text detection) for a file on GCS.

    Link to documentation (types): https://googleapis.dev/python/vision/latest/vision_v1/types.html#google.cloud.vision_v1.types.OutputConfig

    Args:
        gcs_source_uri: ``gs://`` URI of the file to process.
        gcs_destination_uri: ``gs://`` URI prefix for the JSON output files.
        mime_type: Type of the source file. Supported mime_types are:
            'application/pdf' and 'image/tiff'.
        batch_size: How many pages should be grouped into each json output
            file. Callers that look for ``output-1-to-1.json`` rely on the
            default of 1.

    Returns:
        The operation object returned by ``async_batch_annotate_files``. The
        request is only started here, not awaited; the returned handle lets
        callers wait for it instead of discarding it.
    """
    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # source
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    # destination
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination,
        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config
    )

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    return operation
    
def predict_text_classification_single_label_sample(
    project: str,
    endpoint_id: str,
    content: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    """Send one text to a deployed endpoint for online prediction and print the response.

    Args:
        project: Project that owns the endpoint.
        endpoint_id: ID of the endpoint to query.
        content: Text to classify.
        location: Location used to build the endpoint path.
        api_endpoint: Regional API endpoint the client connects to.

    Returns:
        The ``predictions`` field of the prediction response.
    """
    # The AI Platform services require regional API endpoints.
    prediction_client = aiplatform.gapic.PredictionServiceClient(
        client_options={"api_endpoint": api_endpoint}
    )

    text_instance = TextClassificationPredictionInstance(content=content).to_value()
    # No prediction parameters are set; an empty dict is still converted to a Value.
    empty_parameters = json_format.ParseDict({}, Value())
    endpoint_path = prediction_client.endpoint_path(
        project=project, location=location, endpoint=endpoint_id
    )

    response = prediction_client.predict(
        endpoint=endpoint_path,
        instances=[text_instance],
        parameters=empty_parameters,
    )

    print("response")
    print(" deployed_model_id:", response.deployed_model_id)
    # See gs://google-cloud-aiplatform/schema/predict/prediction/text_classification.yaml for the format of the predictions.
    predictions = response.predictions
    for single_prediction in predictions:
        print(" prediction:", dict(single_prediction))

    return predictions

## Testing area

In [151]:
# Bucket used for the test run below; it is passed to main_predict.
bucket_name = "2021_08_16_tcn_dev"

In [152]:
# List the contents of the test bucket (same name as `bucket_name`) before the run.
!gsutil ls gs://2021_08_16_tcn_dev

gs://2021_08_16_tcn_dev/computer_vision_1.pdf
gs://2021_08_16_tcn_dev/med_tech_8.pdf
gs://2021_08_16_tcn_dev/us_076.pdf


In [160]:
# Run the whole pipeline on the test bucket: OCR, batch prediction, CSV export, BigQuery load.
main_predict(bucket_name)