### Create input files for batch prediction
This notebook creates the input files for batch prediction against both the classification and sentiment-analysis models trained by the pipeline.

Author: Dan Waters | danwaters@my.unt.edu

In [10]:
# This pipeline is to be run in Vertex Workbench
# Install Kubeflow Pipelines and GCP AI Platform
!pip3 install kfp --user -q
!pip3 install --upgrade google-cloud-aiplatform --user -q
!pip3 install --upgrade jsonlines --user -q

In [11]:
import json

import jsonlines
import pandas as pd
from google.cloud import storage

In [3]:
# Google Cloud project and the Cloud Storage location of the held-out test data.
PROJECT_ID = "iowa-steam"
BUCKET_NAME = "iowa-steam-source-data"
BUCKET_URI = f"gs://{BUCKET_NAME}"
# Plain literal: there is nothing to interpolate, so no f-prefix is needed.
TEST_FILENAME = "test_held_out.csv"
TEST_URI = f"{BUCKET_URI}/{TEST_FILENAME}"

In [8]:
def process_data(data, content):
    """Clean a text column and store each cleaning stage on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that receives the results. It is modified in place: the columns
        ``Content_Parsed_1`` through ``Content_Parsed_4`` are added (or
        overwritten if they already exist).
    content : pandas.Series
        The string column to process, e.g. ``data['user_review']``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. ``Content_Parsed_4`` holds the final
        processed text; the other three columns hold the intermediate stages.

    Notes
    -----
    Every replacement passes ``regex=False``. The patterns are all literal
    text, and some of them ("?" and ".") are regex metacharacters; the default
    value of ``regex`` differs between pandas versions, so it is set explicitly
    to make the result independent of the installed version.
    """
    # Stage 1: carriage returns, newlines and 4-space runs become one space,
    # then double quotation marks are dropped.
    data['Content_Parsed_1'] = content.str.replace("\r", " ", regex=False)
    for old, new in (("\n", " "), ("    ", " "), ('"', '')):
        data['Content_Parsed_1'] = data['Content_Parsed_1'].str.replace(old, new, regex=False)

    # Stage 2: lower-case everything so that capitalised words (e.g. at the
    # beginning of a sentence) are read the same as lower-case words.
    data['Content_Parsed_2'] = data['Content_Parsed_1'].str.lower()

    # Stage 3: strip punctuation signs, one literal character at a time.
    data['Content_Parsed_3'] = data['Content_Parsed_2']
    for sign in "?:!.,;":
        data['Content_Parsed_3'] = data['Content_Parsed_3'].str.replace(sign, '', regex=False)

    # Stage 4: remove the possessive "'s" suffix.
    data['Content_Parsed_4'] = data['Content_Parsed_3'].str.replace("'s", "", regex=False)

    return data

# Read the held-out test CSV and run the text clean-up over its review column.
test = pd.read_csv(TEST_URI)
test_processed = process_data(data=test, content=test["user_review"])
# Preview the first five rows (the default for head()).
test_processed.head()



Unnamed: 0,review_id,title,year,user_review,user_suggestion,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3,Content_Parsed_4
0,16058,Eternal Card Game,2017.0,Early Access ReviewHonestly feels like a Card ...,1,Early Access ReviewHonestly feels like a Card ...,early access reviewhonestly feels like a card ...,early access reviewhonestly feels like a card ...,early access reviewhonestly feels like a card ...
1,16059,Eternal Card Game,2017.0,Early Access Reviewreally fun card game thats ...,1,Early Access Reviewreally fun card game thats ...,early access reviewreally fun card game thats ...,early access reviewreally fun card game thats ...,early access reviewreally fun card game thats ...
2,16060,Eternal Card Game,2018.0,Early Access ReviewIn my brief (comparatively)...,1,Early Access ReviewIn my brief (comparatively)...,early access reviewin my brief (comparatively)...,early access reviewin my brief (comparatively)...,early access reviewin my brief (comparatively)...
3,16061,Eternal Card Game,2016.0,Early Access ReviewPlays like Magic and plays ...,1,Early Access ReviewPlays like Magic and plays ...,early access reviewplays like magic and plays ...,early access reviewplays like magic and plays ...,early access reviewplays like magic and plays ...
4,16062,Eternal Card Game,2017.0,Early Access ReviewAfter wasting a lot of time...,1,Early Access ReviewAfter wasting a lot of time...,early access reviewafter wasting a lot of time...,early access reviewafter wasting a lot of time...,early access reviewafter wasting a lot of time...


In [12]:
# Report how many processed review rows there are.
print(test_processed.shape[0])

1765


In [14]:
# Upload one text file per review plus a JSON Lines manifest that lists them.
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

destination_prefix = "automl_batch_prediction"
file_subprefix = "data"

input_file_blob_name = f"{destination_prefix}/steam_reviews_batch_predict_test.jsonl"

batch_input_data = []
# Only two columns are needed, so iterate over them directly instead of
# materialising a full row Series per record with iterrows().
for review_id, content in zip(test_processed['review_id'], test_processed['Content_Parsed_4']):
    # Structure each file as required by Vertex AI for batch prediction.
    blob_name = f"{destination_prefix}/{file_subprefix}/{review_id}.txt"
    uri = f"{BUCKET_URI}/{blob_name}"
    instance = {"content": uri, "mimeType": "text/plain"}
    batch_input_data.append(instance)

    # Upload the processed review text to Cloud Storage under that name.
    blob = bucket.blob(blob_name)
    blob.upload_from_string(content)

# Serialise each instance with json.dumps: str(dict) produces a Python repr with
# single quotes, which is not valid JSON and makes the .jsonl file unparseable.
batch_string = '\n'.join(json.dumps(instance) for instance in batch_input_data)
input_file_blob = bucket.blob(input_file_blob_name)
input_file_blob.upload_from_string(batch_string)