# Rule Based Recommendation Method - Simple Count
  * A rule based method for partition recommendation
  * Counts the number of times a column has been filtered on for each table.
  * Recommends a column as a new partition if it has been filtered on more often than the current partition of the table

In [0]:
import pyspark.sql.functions as F
import json
from datetime import timedelta, datetime
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    FloatType,
    BooleanType,
    TimestampType,
)

### 1. Inputs
* **toTime**: (yyyy-mm-dd HH:MM:SS) the end of the relevant interval, i.e. the latest point in time to include
* **interval**: (weeks) how many weeks of data to use, counting backwards from toTime

`Note: toTime is the END of the interval`

`Note2: fromTime may be added as an input parameter (instead of, or in addition to, interval) if we find a purpose for it`

In [0]:
%run "./Validators"

In [0]:
# Expected format of the to_time widget value
datetime_format = "%Y-%m-%d %H:%M:%S"

# Notebook widgets: both parameters arrive as strings and are validated below
dbutils.widgets.text(
    "to_time",
    "None",
    'toTime, End of interval, use "None" for time.now(), Format: yyyy-mm-dd HH:MM:SS',
)

dbutils.widgets.text(
    "interval", "4", "Size of interval (weeks) for how much data the methods will use"
)

to_time = dbutils.widgets.get("to_time")
interval = dbutils.widgets.get("interval")

# Because workflows got no way of specifying time.now()
if to_time == "None":
    to_time = datetime.now()
# validate_time_input comes from the ./Validators notebook
elif validate_time_input(to_time, datetime_format):
    to_time = datetime.strptime(to_time, datetime_format)
else:
    raise ValueError("inputted to_time is not on the right format")

# Raise explicitly instead of using assert: assert statements are removed when
# Python runs with -O, which would silently skip the validation.
# validate_positive_number comes from the ./Validators notebook
if not validate_positive_number(interval):
    raise ValueError("interval is not a valid number > 0")
interval = float(interval)

# Start of the interval: `interval` weeks before to_time
from_time = to_time - timedelta(weeks=interval)

#### Tables:
* method_runs
* method_results
* method_recommendations

Are already created in the **SetupTables** notebook

In [0]:
# Schema used when building a row for the method_runs table; every field is nullable
method_runs_schema = (
    StructType()
    .add("runId", IntegerType(), nullable=True)
    .add("methodName", StringType(), nullable=True)
    .add("params", StringType(), nullable=True)
    .add("fromTime", TimestampType(), nullable=True)
    .add("toTime", TimestampType(), nullable=True)
    .add("whenRun", TimestampType(), nullable=True)
)

### 2. Preprocessing
##### Fetch the data from the operations table and perform the following steps:
* Only fetch operations from within the **from_time** and **to_time** input parameters
* Ignore the "Filter" rows, as the columns in them seem to already be stored under the "PushedFilters" rows
* Remove rows with NA values and empty strings (these may have appeared when extracting the table and database names)

In [0]:
# Convert from and to-time to the same format as the table
# Express the interval bounds as integer milliseconds since the epoch,
# which is the format the table uses for timeGenerated
from_time_timestamp = int(from_time.timestamp() * 1000)
to_time_timestamp = int(to_time.timestamp() * 1000)

# Keep only operations generated inside the interval (both bounds inclusive)
inside_interval = F.col("timeGenerated").between(
    from_time_timestamp, to_time_timestamp
)
# Drop the rows whose operation name is "Filter"
# (end up with PartitionFilters and PushedFilters)
not_plain_filter = F.col("operationName") != "Filter"

operations = (
    spark.sql("SELECT * FROM operations")
    .filter(inside_interval)
    .filter(not_plain_filter)
)

# Turn empty strings into nulls so that dropna removes both kinds of missing value
blank_as_null = [
    F.when(operations[name] == "", None).otherwise(operations[name]).alias(name)
    for name in operations.columns
]
operations = operations.select(blank_as_null).dropna(how="any")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2721543019531124>:8[0m
[1;32m      3[0m to_time_timestamp [38;5;241m=[39m [38;5;28mint[39m(to_time[38;5;241m.[39mtimestamp() [38;5;241m*[39m [38;5;241m1000[39m)
[1;32m      5[0m [38;5;66;03m# Filter data not between from and to-date[39;00m
[1;32m      6[0m [38;5;66;03m# Remove rows where operation name isn't Filter (end up with PartitionFilters and PushedFilters)[39;00m
[1;32m      7[0m operations [38;5;241m=[39m (
[0;32m----> 8[0m     spark[38;5;241m.[39msql([38;5;124m"[39m[38;5;124mSELECT * FROM operations[39m[38;5;124m"[39m)
[1;32m      9[0m     [38;5;241m.[39mfilter(F[38;5;241m.[39mcol([38;5;124m"[39m[38;5;124mtimeGenerated[39m[38;5;124m"[39m)[38;5;241m.[39mbetween(from_time_timestamp, to_time_timestamp))
[1;32m     10[0m     [38;5;241m.[

In [0]:
# This code is responsible for creating a dataframe where you can check whether a table is partitioned on a particular column
# Used as a lookup before writing results to tables after having run some of the methods
#
# For every (databaseName, tableName, columnName) the operation with the highest
# timeGenerated decides the flag: True when that operation is "PartitionFilters".
#
# Structs compare field by field, so max(struct(timeGenerated, isPartitioned))
# picks the latest operation in one aggregation (no self-join needed). When
# several operations share the latest timeGenerated, the tie is resolved
# deterministically in favour of True, instead of relying on F.first, whose
# result depends on row order after a shuffle.
partition_flag = F.when(
    F.col("operationName") == "PartitionFilters", True
).otherwise(False)

max_time_df = operations.groupBy("databaseName", "tableName", "columnName").agg(
    F.max(
        F.struct(F.col("timeGenerated"), partition_flag.alias("isPartitioned"))
    ).alias("latest")
)

# Selecting the nested field yields a column that is simply named isPartitioned
is_partitioned = max_time_df.select(
    "databaseName",
    "tableName",
    "columnName",
    "latest.isPartitioned",
)


display(is_partitioned)



### 3. Method - count
For each database, table and column: check how often the column is used for filtering (PartitionFilters, PushedFilters) and count the occurrences
##### parameters:
* **windowStart**: (int - unix_ms) window start of which data to be used
* **windowEnd**: (int - unix_ms) window end of which data to be used
* **windowSize**: (float) number of weeks of which the interval spans


##### metadata of method:
* **whenRun**: (timestamp) when the method was run

In [0]:
# Next run id: one above the highest runId already in method_runs,
# or 1 when the table holds no runId yet
max_id = spark.sql("SELECT MAX(runId) AS max_id FROM method_runs").collect()[0][
    "max_id"
]
runId = 1 if max_id is None else max_id + 1
print(f"runId: {runId}")

# Parameters of this run (interval bounds in unix ms, interval length in weeks)
params = dict(
    windowStart=from_time_timestamp,
    windowEnd=to_time_timestamp,
    windowSize=interval,
)
# Metadata of this run
metadata = dict(whenRun=datetime.now())



In [0]:
# Parameters are stored as a JSON string; an empty params dict is stored as ""
if params:
    serialized_params = json.dumps(params)
else:
    serialized_params = ""

# Single row describing this run, laid out according to method_runs_schema
method_run_info = dict(
    runId=runId,
    methodName="simpleCount",
    params=serialized_params,
    fromTime=from_time,
    toTime=to_time,
    whenRun=metadata["whenRun"],
)

method_run = spark.createDataFrame([method_run_info], schema=method_runs_schema)
display(method_run)



### 3.1 Count the Number of times each column is used for filtering

In [0]:
# Count, per database/table/column, how many operations filtered on the column
count_keys = ["databaseName", "tableName", "columnName"]
occurrence_count = F.count("executionId").alias("occurrences")

method_output = (
    operations.groupBy(*count_keys).agg(occurrence_count).orderBy(*count_keys)
)

method_output.show()



### 3.2 Create Columns Needed for the method_results Table

In [0]:
# add needed columns to method_results table
method_results = (
    method_output.withColumn("methodValue", F.col("occurrences").cast("float"))
).drop("occurrences")

# join on is_partitioned dataframe to check if columns is partitioned or not
# note: this is based on the last occurence of the filter in the operations table
method_results = method_results.join(
    is_partitioned, on=["databaseName", "tableName", "columnName"]
)

method_results = method_results.withColumn("runId", F.lit(runId)).select(
    "databaseName",
    "tableName",
    "columnName",
    "methodValue",
    "isPartitioned",
    "runId",
)

method_results.show()



### 3.3 Find Recommendations Based on the Column with the Highest methodValue per db-table

In [0]:
# For every database/table, recommend the column with the highest methodValue.
# Structs compare field by field, so the max of
# struct(methodValue, columnName, isPartitioned) is the row with the largest
# methodValue (ties fall to the largest columnName).
best_column = F.max(F.struct("methodValue", "columnName", "isPartitioned")).alias(
    "max_methodValue_colName_isPartitioned"
)

# Selecting "struct.field" already yields a column named "field", so no
# renaming is needed afterwards.
method_recommendations = (
    method_results.groupBy("databaseName", "tableName")
    .agg(best_column)
    .select(
        "databaseName",
        "tableName",
        "max_methodValue_colName_isPartitioned.columnName",
        "max_methodValue_colName_isPartitioned.methodValue",
        "max_methodValue_colName_isPartitioned.isPartitioned",
    )
    .withColumn("runId", F.lit(runId))
)

method_recommendations.show(truncate=False)



### 4. Save data to tables

In [0]:
# Save data to tables: append each dataframe to its Delta table, in this order:
# run information, method results, method recommendations
outputs = (
    (method_run, "method_runs"),
    (method_results, "method_results"),
    (method_recommendations, "method_recommendations"),
)
for frame, table_name in outputs:
    frame.write.format("delta").mode("append").saveAsTable(table_name)



In [0]:
%sql
-- Inspect every row of the method_runs table
SELECT * FROM method_runs



In [0]:
%sql
-- Inspect every row of the method_results table
SELECT * FROM method_results



In [0]:
%sql
-- Inspect every row of the method_recommendations table
SELECT * FROM method_recommendations

