# Prepare data for Text Classification AutoML Model

In [1]:
PROJECT = !gcloud config get-value project # returns SList
PROJECT = PROJECT[0] # gets first element in list -> str
REGION = "us-central1"  
import os
os.environ["PROJECT"] = PROJECT
os.environ["REGION"] = REGION

In [2]:
from google.cloud import bigquery
# BigQuery client bound to the project id resolved in the first cell (PROJECT).
bq = bigquery.Client(project=PROJECT)

In [3]:
# Goal: create a CSV dataset with two columns: gcs_path (URI of the .txt file) and invention_type (the label).
# see also: https://github.com/munnm/professional-services/blob/master/examples/cloudml-document-ai-patents/training_utils.py

In [4]:
# Step 1: load the BigQuery table that holds the invention_type labels.

In [5]:
# Select every column of the invention_types table. The project id is hard-coded
# inside the table reference.
# NOTE(review): PROJECT is already resolved in the first cell — confirm whether the
# table reference should use it instead of the literal project id.
query_string = """
#standardSQL
SELECT
    *
FROM
    `qwiklabs-gcp-00-373ac55d0e0a.labeled_patents.invention_types`
"""

# Run the query and materialize the full result as a DataFrame.
invention_types = bq.query(query_string).to_dataframe()

In [6]:
# Preview the first rows to check the columns returned by the query.
invention_types.head()

Unnamed: 0,gcs_path,invention_type
0,gs://gcs-public-data--labeled-patents/us_076.pdf,other
1,gs://gcs-public-data--labeled-patents/us_081.pdf,other
2,gs://gcs-public-data--labeled-patents/us_014.pdf,other
3,gs://gcs-public-data--labeled-patents/us_046.pdf,other
4,gs://gcs-public-data--labeled-patents/us_036.pdf,other


In [16]:
import random

# Spot-check 50 randomly drawn rows (with replacement) by printing each row's
# path and label; the fixed seed makes the draw repeatable.
random.seed(42)
for i in range(50):
    # randrange excludes its upper bound, so idx stays within 0..n_rows-1.
    # randint(0, n_rows) is inclusive on both ends and could return n_rows,
    # a label that is absent from the default 0..n_rows-1 index and makes
    # .loc raise KeyError.
    idx = random.randrange(invention_types.shape[0])
    print(invention_types.loc[idx, "gcs_path"])
    print(invention_types.loc[idx, "invention_type"])
    print()

gs://gcs-public-data--labeled-patents/us_024.pdf
other

gs://gcs-public-data--labeled-patents/us_019.pdf
other

gs://gcs-public-data--labeled-patents/us_066.pdf
other

gs://gcs-public-data--labeled-patents/us_092.pdf
other

gs://gcs-public-data--labeled-patents/us_083.pdf
other

gs://gcs-public-data--labeled-patents/us_075.pdf
other

gs://gcs-public-data--labeled-patents/us_061.pdf
other

gs://gcs-public-data--labeled-patents/us_016.pdf
other

gs://gcs-public-data--labeled-patents/med_tech_8.pdf
med_tech

gs://gcs-public-data--labeled-patents/us_099.pdf
other

gs://gcs-public-data--labeled-patents/us_055.pdf
other

gs://gcs-public-data--labeled-patents/us_027.pdf
other

gs://gcs-public-data--labeled-patents/us_087.pdf
other

gs://gcs-public-data--labeled-patents/us_015.pdf
other

gs://gcs-public-data--labeled-patents/computer_vision_1.pdf
computer_vision

gs://gcs-public-data--labeled-patents/us_019.pdf
other

gs://gcs-public-data--labeled-patents/us_006.pdf
other

gs://gcs-public-data

In [9]:
# Independent deep copy (the default for DataFrame.copy), so edits to
# dataset_tcn do not touch invention_types.
dataset_tcn = invention_types.copy()

In [None]:
# Change gcs_path so that it points to the .txt files instead of the original PDFs.
# Example of an original gcs_path: gs://gcs-public-data--labeled-patents/us_076.pdf

In [18]:
def change_gcs_path_to_txt_path(original, txt_dir="gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/text"):
    """Re-root a GCS object URI under the directory that holds the text files.

    Only the final path component (the file name) of ``original`` is kept;
    its extension is left unchanged.

    Args:
        original: Object URI, e.g.
            ``gs://gcs-public-data--labeled-patents/us_076.pdf``.
        txt_dir: Destination prefix. Defaults to the location that was
            previously hard-coded, so single-argument callers behave as before.

    Returns:
        ``txt_dir`` and the file name of ``original`` joined with ``/``.
    """
    # GCS URIs always use forward slashes. os.path follows the host OS and
    # would insert a backslash on Windows, so use posixpath explicitly.
    import posixpath

    file_name = posixpath.basename(original)
    return posixpath.join(txt_dir, file_name)

def change_extension(original, new_ext="txt"):
    """Return ``original`` with its file extension replaced by ``new_ext``.

    The existing extension is found with ``posixpath.splitext`` rather than by
    slicing off a fixed three characters, so extensions of any length
    (``.jpeg``, ``.md``) are handled. A path without an extension gets the new
    one appended instead of losing the end of its name.

    Args:
        original: Path or URI, e.g. ``gs://bucket/text/us_076.pdf``.
        new_ext: Replacement extension, with or without a leading dot.
            Defaults to ``"txt"``.

    Returns:
        The path with the new extension.
    """
    import posixpath

    # splitext only inspects the last path component, so dots in bucket or
    # directory names are never mistaken for an extension.
    stem = posixpath.splitext(original)[0]
    return stem + "." + new_ext.lstrip(".")
    

In [19]:
# Rewrite every path element-wise with change_gcs_path_to_txt_path.
dataset_tcn["gcs_path"] = dataset_tcn["gcs_path"].map(change_gcs_path_to_txt_path)

In [20]:
# Rewrite every path element-wise with change_extension.
dataset_tcn["gcs_path"] = dataset_tcn["gcs_path"].map(change_extension)

In [26]:
# Preview the first rows after the path rewrite.
dataset_tcn.head()

Unnamed: 0,gcs_path,invention_type
0,gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_pate...,other
1,gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_pate...,other
2,gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_pate...,other
3,gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_pate...,other
4,gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_pate...,other


In [23]:
# Show one full path, since the head() preview truncates long strings.
dataset_tcn.loc[0, "gcs_path"]

'gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents/text/us_076.txt'

In [24]:
# Save to a local CSV containing only the data rows: neither the DataFrame index
# nor a header row is written.
# NOTE(review): presumably the AutoML import format requires a headerless file — confirm.
dataset_tcn.to_csv("tcn_dataset.csv", index=False, header=False)

In [25]:
# Copy the local CSV into the labeled_patents location of the bucket.
!gsutil -m cp ./tcn_dataset.csv gs://qwiklabs-gcp-00-373ac55d0e0a/labeled_patents

Copying file://./tcn_dataset.csv [Content-Type=text/csv]...
/ [1/1 files][ 10.2 KiB/ 10.2 KiB] 100% Done                                    
Operation completed over 1 objects/10.2 KiB.                                     
