# Cleaning Utils

## Initialize Notebook

In [0]:
def init_cleaner(file_type):
    '''
    This function creates the "mode" widget for a cleaning notebook, loads the
    matching Global Temporary View and returns it with duplicate rows removed.

    The view that is read depends on the value of the "mode" widget:
        "Batch"  -> global_temp.gtv_129a67850695_{file_type}
        "Stream" -> global_temp.gtv_129a67850695_stream_{file_type}

    Args:
        file_type (string) : The type of data to be cleaned (pin, geo or user).

    Returns:
        pyspark.sql.DataFrame : A DataFrame of the data with duplicates dropped.

    Raises:
        Exception : If the "mode" widget holds anything other than "Batch" or "Stream".
    '''
    # Register the dropdown; "Batch" is both the default and the only listed choice.
    # NOTE(review): "Stream" is handled below but is not offered by the dropdown -
    # presumably it is supplied from outside the widget UI; confirm with the caller.
    dbutils.widgets.dropdown("mode", "Batch", ["Batch"])

    selected_mode = dbutils.widgets.get("mode")
    print("Running in " + selected_mode + " mode.")

    # Reject unknown modes before touching any view
    if selected_mode not in ("Batch", "Stream"):
        raise Exception("Incorrect input for mode parameter")

    # Pick the source view that belongs to the selected mode
    if selected_mode == "Stream":
        source_view = f"global_temp.gtv_129a67850695_stream_{file_type}"
    else:
        source_view = f"global_temp.gtv_129a67850695_{file_type}"

    # Load the view and drop duplicate rows in one go
    return spark.table(source_view).distinct()

## Copy to Global Temporary Views

In [0]:
def copy_to_gtv(df_in, file_type):
    '''
    This function copies a cleaned DataFrame to a Global Temporary View to be used by other notebooks.

    The name of the view depends on the value of the "mode" widget:
        "Batch"  -> gtv_129a67850695_{file_type}_clean
        "Stream" -> gtv_129a67850695_stream_{file_type}_clean

    The "mode" widget must already exist when this function is called
    (init_cleaner creates it).

    Args:
        df_in (pyspark.sql.DataFrame) : A DataFrame of the cleaned data.
        file_type (string) : The type of data that was cleaned (pin, geo or user).

    Raises:
        ValueError : If the "mode" widget is neither "Batch" nor "Stream".
    '''
    # Read the widget once so the branch taken and the message printed always agree
    mode = dbutils.widgets.get("mode")

    # If "Batch" or "Stream" mode, use appropriate temp view name
    if mode == "Batch":
        view_name = f"gtv_129a67850695_{file_type}_clean"
    elif mode == "Stream":
        view_name = f"gtv_129a67850695_stream_{file_type}_clean"
    else:
        # An unknown mode used to fall through silently and still report success
        # even though no view had been created; fail loudly instead, in line with
        # init_cleaner (ValueError is still caught by `except Exception`).
        raise ValueError(f"Incorrect input for mode parameter: {mode!r}")

    df_in.createOrReplaceGlobalTempView(view_name)
    print("Global Temp View created for " + mode + " mode.")