# Read/Write Test of Current Hopsworks Implementation (Hudi)

In [None]:
# Install pandas 2.2.0 into the notebook environment (IPython shell escape)
!pip install pandas==2.2.0
# NOTE(review): timeit is part of the Python standard library, so this install is presumably unnecessary — confirm and drop it if so
!pip install timeit

In [None]:
import pyarrow as pa
import pandas as pd
import timeit
import hopsworks
import numpy as np
import os
import importlib
import time
import math
import string
import random
import sys
import requests
import re

## Console 🕹️
**Modify the values in the configuration cell below if** you want to:
- Change the test repetitions
- Define the number of used cores
- Change the file names
- Save subreports, and define how often
- Change the source from where to collect the data
- Change the path where to save the data
- Change the file type

In [None]:
# Number of repetitions for each test, keyed by test name.
# A test whose count is not greater than 0 is skipped by the benchmark loop.
reps = {
    "sup_1": 50,
    "sup_10": 50,
    "sup_100": 50,
    "line_1": 50,
    "line_10": 50,
}

# Number of cores in use; in the code visible here it only ends up in the report file name
number_of_cores = 1

# File name (without directory and extension) of the dataset behind each test name
file_names = {
    "sup_1": 'supplier_tpch_sf1',
    "sup_10": 'supplier_tpch_sf10',
    "sup_100": 'supplier_tpch_sf100',
    "line_1": 'lineitem_tpch_sf1',
    "line_10": 'lineitem_tpch_sf10',
}

# @optional
# Intermediate reports: when save_subreports is True, a sub-report is written every
# save_how_often repetitions. Use a positive value in that case, since save_how_often
# is used as a modulo divisor by the benchmark loop.
save_subreports = False
save_how_often = 0

# Where the datasets and reports are stored locally, and where the datasets are fetched from
LOCAL_PATH = '/home/yarnapp/hopsfs/Resources/'
REMOTE_PATH = 'https://repo.hops.works/dev/gio-hopsworks/'
FILE_TYPE = '.parquet'
FILENAME_PREFIX = f"hudi_benchmark_results_{number_of_cores}core"

In [None]:
def get_local_paths(file_names):
    '''
    Map every test name in file_names to the local path of its dataset file.

    Each path is LOCAL_PATH + <file name> + FILE_TYPE, for example
    "/home/yarnapp/hopsfs/Resources/supplier_tpch_sf1.parquet".
    '''
    return {
        test_name: LOCAL_PATH + file_names[test_name] + FILE_TYPE
        for test_name in file_names
    }


def get_remote_paths(file_names):
    '''
    Map every test name in file_names to the remote URL of its dataset file.

    Each URL is REMOTE_PATH + <file name> + FILE_TYPE, for example
    "https://repo.hops.works/dev/gio-hopsworks/supplier_tpch_sf1.parquet".
    '''
    return {
        test_name: REMOTE_PATH + file_names[test_name] + FILE_TYPE
        for test_name in file_names
    }


def column_renamer(df):
    '''
    Rename all the columns of the given dataframe to lower case, in order to make it possible
    to save the dataframe on Hopsworks via the usage of a Feature Group.

    The dataframe is modified in place and the same object is returned.
    '''

    # Build the complete mapping first and rename once: a single pass over the frame instead of
    # one rename call per column, and nothing is modified if a label has no .lower() method.
    lowered = {name: name.lower() for name in df.columns}
    df.rename(columns=lowered, inplace=True)

    return df


def extract_time(text):
    '''
    Return the first duration found in text, or None when there is none.

    A duration is a number of seconds such as "12s" or "3.25s", optionally preceded
    by whole minutes such as "1m 2.5s".
    '''
    found = re.search(r'(\d+m\s*)?\d+(\.\d+)?s', text)
    return found.group() if found else None

In [None]:
# Build the dictionaries that map each test name to its local file path and to its remote URL
# (nothing is downloaded in this cell)
local_paths  = get_local_paths(file_names)
remote_paths = get_remote_paths(file_names)

In [None]:
# Download all the datasets from the remote_paths to the local_paths.
# The `!` line is an IPython shell escape; the {...} expressions are filled in from the notebook namespace.
# NOTE(review): curl is called without --fail, so an HTTP error response would presumably be saved
# under the dataset's file name — confirm the downloads before trusting the results.
for file in file_names:
    !curl {remote_paths[file]} -o {local_paths[file]}

In [None]:
def get_setup_code_write(dataset):
    '''
    Build the setup code (as a string) that timeit runs before the write test of the given dataset.

    The generated code imports hopsworks and pandas, defines column_renamer, reads the dataset's
    parquet file from local_paths[dataset] into df and lower-cases the column names of df.
    '''
    # repr() produces a correctly quoted and escaped Python string literal, so a path that
    # contains quotes or backslashes cannot break or alter the generated code.
    path_literal = repr(str(local_paths[dataset]))

    setup_lines = [
        'import hopsworks',
        'import pandas as pd',
        '',
        'def column_renamer(df):',
        '    for name in df.columns:',
        '        df.rename(columns={name : name.lower()}, inplace=True)',
        '',
        '    return df',
        '',
        'LOCAL_PATH = ' + path_literal,
        '',
        'df = pd.read_parquet(LOCAL_PATH)',
        'df = column_renamer(df)',
        '',  # keeps the trailing newline at the end of the generated code
    ]

    return '\n'.join(setup_lines)
    

def get_setup_code(mode, dataset):
    '''
    Return the setup code (as a string) that timeit runs before the timed statement.

    mode 'write' delegates to get_setup_code_write(dataset); mode 'read' returns code that logs in
    to Hopsworks and fetches the "hudi_test" feature group. Any other mode returns an empty list.
    '''
    if mode == 'write':
        return get_setup_code_write(dataset)

    if mode == 'read':
        return '''import hopsworks
project = hopsworks.login()
fs = project.get_feature_store()
fg = fs.get_feature_group("hudi_test")
'''

    return []


def get_test_code(mode, dataset):
    '''
    Return the statement (as a string) that timeit measures.

    mode 'read' returns code that reads the feature group fg; mode 'write' returns code that logs in
    to Hopsworks, gets or creates the "hudi_test" feature group and inserts df into it.
    Any other mode returns an empty list. The dataset argument is not used.
    '''
    read_statement = '''
fg.read()
'''
    write_statement = '''
project = hopsworks.login()
fs = project.get_feature_store()
fg = fs.get_or_create_feature_group(
    name="hudi_test",
    version=1,
    primary_key=df.columns,
    description='Upload of Datasets for testing reasons')
fg.insert(df)
'''

    if mode == 'read':
        return read_statement

    return write_statement if mode == 'write' else []

In [None]:
# Optional cleanup: log in to Hopsworks and delete the "hudi_test" feature group.
# NOTE(review): presumably meant for removing a feature group left behind by an interrupted run,
# since the benchmark loop below deletes it itself at the end of every repetition — confirm.
project = hopsworks.login()
fs = project.get_feature_store()
fg = fs.get_feature_group("hudi_test")
fg.delete()

In [None]:
def _save_results_csv(rows, path_csv):
    '''
    Store the collected measurements (one row per repetition) as a .csv file at path_csv.
    '''
    report = pd.DataFrame(rows, columns=["Upload", "Materialize", "Read", "Success_of_materialization"])
    report.to_csv(path_csv)


# Start the test with the above indicated iterations
for dataset, n_reps in reps.items():
    # Tests configured with no repetitions are skipped entirely
    if n_reps <= 0:
        continue

    print('\n\n\n\n\n *** STARTED THE TEST OF ' + str(dataset) + ' FOR ' + str(reps[dataset]) + ' TIME(S) *** \n\n\n\n\n')

    # The generated snippets depend only on the dataset, so they are built once per dataset
    # instead of once per repetition
    WRITE_SETUP_CODE = get_setup_code('write', dataset)
    WRITE_TEST_CODE = get_test_code('write', dataset)
    READ_SETUP_CODE = get_setup_code('read', dataset)
    READ_TEST_CODE = get_test_code('read', dataset)

    # Initialise results list of rows
    results = []

    for rep in range(n_reps):

        ############ CREATE THE FEATURE GROUP, PUBLISH THE INFORMATION ####################
        # Login to the project and insert/upload the new dataset in a new feature group, while keeping track of the time required.
        # The login time and the creation time can be included in the total time, since they are significantly small.
        # Reading the parquet file happens in the setup code, which timeit does not include in the measurement.
        upload_time = timeit.timeit(setup=WRITE_SETUP_CODE, stmt=WRITE_TEST_CODE, number=1)
        print('**Create featuregrop on Hopsworks and publish messages on Kafka, the time taken is {0} seconds'.format(upload_time))

        ############# MATERIALIZE THE DATA ON HOPSWORKS THROUGH SPARK JOBS #######################
        # Check repeatedly the status of the materialization_job. When it is FINISHED,
        # get the current time and calculate the time required by the materialization.
        # The clock starts before the login, so login and feature group lookup are part of materialize_time.
        before_materialize = time.time()

        project = hopsworks.login()
        fs = project.get_feature_store()
        fg = fs.get_feature_group("hudi_test")

        # NOTE(review): this loop only exits on 'FINISHED'; if the job can end in another state
        # (e.g. a failure state) it would presumably poll forever — confirm against the Hopsworks job API.
        while fg.materialization_job.get_state() != 'FINISHED':
            time.sleep(2)

        after_materialize = time.time()

        # Check final state
        final_state = fg.materialization_job.get_final_state()
        if final_state != 'SUCCEEDED':
            print("\nWARNING: The final state is: " + str(final_state))

        materialize_time = after_materialize - before_materialize
        print("\n**Time needed to materialize: " + str(materialize_time))

        ############### READ FROM THE MATERIALIZED FEATURE GROUP ################################
        # Read the data passing the feature group name, after having connected to the Hopsworks' project
        # (the connection is made in the setup code, so only fg.read() is timed)
        read_time = timeit.timeit(setup=READ_SETUP_CODE, stmt=READ_TEST_CODE, number=1)
        print('**Read the dataset from Hopsworks, the time taken is {0} seconds'.format(read_time))

        ################ END OF MEASUREMENTS ##################################################
        # Remove the featuregroup from Hopsworks
        project = hopsworks.login()
        fs = project.get_feature_store()
        fg = fs.get_feature_group("hudi_test")
        fg.delete()

        ## Save the results of the current test
        results.append([upload_time, materialize_time, read_time, final_state])

        # Wait in order to be sure that the featuregroup has been removed
        time.sleep(5)

        # A sub-report is due every save_how_often repetitions, except after the last one (the final
        # report covers it). The save_how_often > 0 check avoids a division by zero when sub-reports
        # are enabled while save_how_often is still 0.
        subreport_due = (
            save_subreports
            and save_how_often > 0
            and (rep + 1) % save_how_often == 0
            and (rep + 1) < n_reps
        )
        if subreport_due:
            ## Save the test's results in a .csv file with specified name
            path_csv = str(LOCAL_PATH) + str(FILENAME_PREFIX) + '_' + 'SUB' + str(rep + 1) + '_' + str(file_names[dataset]) + ".csv"
            _save_results_csv(results, path_csv)

    ## Save the test's results in a .csv file with specified name
    path_csv = str(LOCAL_PATH) + str(FILENAME_PREFIX) + '_' + str(file_names[dataset]) + ".csv"
    _save_results_csv(results, path_csv)

    print('\n\n\n\n\n *** ENDED TESTING OF ' + str(dataset) + ' FOR ' + str(reps[dataset]) + ' TIME(S) *** \n\n\n\n\n')

In [None]:
# Final cleanup: delete the Kafka topics of the project, in order to free up memory
# (By default, Kafka keeps the messages in a topic for 7 days)
# Every topic returned by get_topics() is deleted, whatever its name.
project = hopsworks.login()
kafka_api = project.get_kafka_api()
kafka_topics = kafka_api.get_topics()

for topic in kafka_topics:
    print(f'\n *** Deleting the topic named: {topic.name!s} ***\n')
    topic.delete()

print('\n *** Now the dirty work should be done! ***\n')