# Rule Based recommendation Method -  Weighted Count
  * A rule based method for partition recommendation
  * Multiplies each operation by a weight (based on when it happened) and sums the results for each column
  * The **weight** depends on when during the interval the operation happened and lies between the **min** and **max** weights given as parameters to the notebook

  
`Note: We will also implement this method in a step-based manner`

In [0]:
import pyspark.sql.functions as F
import json
from datetime import timedelta, datetime
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    TimestampType,
)

### 1. Inputs
* **toTime**: (yyyy-mm-dd HH:MM:SS) the end (latest date) of the relevant interval
* **interval**: (weeks) how many weeks of data to use, counted backwards from toTime
* **max_weight**: (float / int) the weight that will be used at the very end of the interval (**newest filters**)
* **min_weight**: (float / int) the weight that will be used at the very start of the interval (**oldest filters**)

`Note: toTime is the END of the interval`

`Note2: may include fromTime (either instead of, or in addition to, interval) as an input parameter if we find a purpose for it`

In [0]:
%run "./Validators"

In [0]:
datetime_format = "%Y-%m-%d %H:%M:%S"

# Notebook widgets. Every widget value is read back as a string and is
# validated and converted further down.
dbutils.widgets.text(
    "to_time",
    "None",
    'toTime, End of interval, use "None" for time.now(), Format: yyyy-mm-dd HH:MM:SS',
)

dbutils.widgets.text(
    "interval", "4", "Size of interval (weeks) for how much data the methods will use"
)

dbutils.widgets.text(
    "max_weight", "2", "max_weight, The weight that will be used for the newest filters"
)

dbutils.widgets.text(
    "min_weight", "0", "min_weight, Weight that will be used for the oldest filters"
)

to_time = dbutils.widgets.get("to_time")
interval = dbutils.widgets.get("interval")
max_weight = dbutils.widgets.get("max_weight")
min_weight = dbutils.widgets.get("min_weight")

# The literal string "None" means "use the current time" (kept because of
# workflows, per the original note).
if to_time == "None":
    to_time = datetime.now()
elif validate_time_input(to_time, datetime_format):  # from ./Validators notebook
    to_time = datetime.strptime(to_time, datetime_format)
else:
    raise ValueError("inputted to_time is not on the right format")

# Raise ValueError rather than assert: assertions are removed when Python runs
# with -O, which would let invalid widget input through unnoticed, and the
# to_time check above already reports bad input with ValueError.
# validate_positive_number comes from the ./Validators notebook.
if not validate_positive_number(interval):
    raise ValueError("interval is not a valid number > 0")
if not validate_positive_number(max_weight):
    raise ValueError("max_weight is not a valid number > 0")
if not validate_positive_number(min_weight, gt_zero=False):
    raise ValueError("min_weight input is not a a valid number >= 0")

interval = float(interval)
max_weight = float(max_weight)
min_weight = float(min_weight)

# Start of the window: `interval` weeks before its end
from_time = to_time - timedelta(weeks=interval)

#### Tables:
* method_runs
* method_results
* method_recommendations

Are already created in the **SetupTables** notebook

In [0]:
# Schema used to insert into the method_runs table
method_runs_schema = StructType(
    [
        StructField("runId", IntegerType(), nullable=True),
        StructField("methodName", StringType(), nullable=True),
        StructField("params", StringType(), nullable=True),
        StructField("fromTime", TimestampType(), nullable=True),
        StructField("toTime", TimestampType(), nullable=True),
        StructField("whenRun", TimestampType(), nullable=True),
    ]
)

### 2. Preprocessing
##### Fetch the data from the operations table and perform the following steps:
* Only fetch operations from within the **from_time** and **to_time** input parameters
* Ignore the "Filter" rows, as the columns in them seem to already be stored under the "PushedFilters" rows
* Remove rows with NA values and empty strings (these may have been introduced when extracting the table and database names)

In [0]:
# Convert from and to-time to the same format as the table
from_time_timestamp = int(from_time.timestamp() * 1000)
to_time_timestamp = int(to_time.timestamp() * 1000)

# Only get data between from and to-date
# Remove rows where operation name isn't Filter (end up with PartitionFilters and PushedFilters)
operations = (
    spark.sql("SELECT * FROM operations")
    .filter(F.col("timeGenerated").between(from_time_timestamp, to_time_timestamp))
    .filter(F.col("operationName") != "Filter")
)

# Remove empty rows
operations = operations.select(
    [
        F.when(operations[col] == "", None).otherwise(operations[col]).alias(col)
        for col in operations.columns
    ]
).dropna(how="any")

In [0]:
# This code is responsible for creating a dataframe where you can check whether a table is partitioned on a particular column
# Used as a lookup before writing results to tables after having run some of the methods

# Group by databaseName, tableName, columnName and find the max timeGenerated
# Lookup table: for every (databaseName, tableName, columnName), was the most
# recent filter operation on that column a PartitionFilters entry?
#
# max() over a struct compares field by field, so the row with the highest
# timeGenerated wins. When several rows share that timestamp, the second field
# breaks the tie and True beats False. This keeps the result deterministic,
# unlike first() on an unordered group, and avoids joining the frame to itself.
latest_operation = F.max(
    F.struct(
        F.col("timeGenerated"),
        (F.col("operationName") == "PartitionFilters").alias("isPartitionFilter"),
    )
)

is_partitioned = (
    operations.groupBy("databaseName", "tableName", "columnName")
    .agg(latest_operation.alias("latest"))
    .select(
        "databaseName",
        "tableName",
        "columnName",
        F.col("latest.isPartitionFilter").alias("isPartitioned"),
    )
)


display(is_partitioned)

### 3. Method - weighted count
For each database, table, and column, check how often the column is used for filtering (PartitionFilters, PushedFilters) and sum the time-based weights of those occurrences
##### parameters:
* **max_weight**: (float) the weight that will be used at the very end of the interval (newest filters)
* **min_weight**: (float) the weight that will be used at the very start of the interval (oldest filters)
* **windowStart**: (int - unix_ms) window start of which data to be used
* **windowEnd**: (int - unix_ms) window end of which data to be used
* **windowSize**: (float) number of weeks of which the interval spans


##### metadata of method:
* **whenRun**: (timestamp) when the method was run

In [0]:
# Pick the next run id: one above the highest runId already stored in
# method_runs, or 1 when the query returns no maximum.
max_id = spark.sql("SELECT MAX(runId) AS max_id FROM method_runs").collect()[0][
    "max_id"
]
runId = 1 if max_id is None else max_id + 1
print(f"runId: {runId}")

# Parameters of this run; they are serialised to JSON in the method_runs row.
params = dict(
    max_weight=max_weight,
    min_weight=min_weight,
    windowStart=from_time_timestamp,
    windowEnd=to_time_timestamp,
    windowSize=interval,
)

# Run metadata: timestamp taken when this cell executes.
metadata = dict(whenRun=datetime.now())

runId: 2


In [0]:
# Assemble the one row describing this run. params is stored as a JSON string,
# or as an empty string when there are no parameters.
if params:
    serialized_params = json.dumps(params)
else:
    serialized_params = ""

method_run_info = dict(
    runId=runId,
    methodName="weightedCount",
    params=serialized_params,
    fromTime=from_time,
    toTime=to_time,
    whenRun=metadata["whenRun"],
)

method_run = spark.createDataFrame(data=[method_run_info], schema=method_runs_schema)
display(method_run)

runId,methodName,params,fromTime,toTime,whenRun
2,weightedCount,"{""max_weight"": 1.0, ""min_weight"": 0.0, ""windowStart"": 1680175669202, ""windowEnd"": 1682594869202, ""windowSize"": 4.0}",2023-03-30T11:27:49.202+0000,2023-04-27T11:27:49.202+0000,2023-04-27T11:27:50.275+0000


### 3.1 Apply weights based on timeGenerated

In [0]:
# Linear interpolation of the weight over the window:
#   at or after the window end    -> max_weight
#   at or before the window start -> min_weight
#   in between                    -> min_weight + fraction * (max_weight - min_weight)
window_length_ms = to_time_timestamp - from_time_timestamp
elapsed_fraction = (
    F.col("timeGenerated").cast("long") - from_time_timestamp
) / window_length_ms

interval_weight = (
    F.when(F.col("timeGenerated") >= to_time_timestamp, max_weight)
    .when(F.col("timeGenerated") <= from_time_timestamp, min_weight)
    .otherwise(elapsed_fraction * (max_weight - min_weight) + min_weight)
)

weighted_operations = operations.withColumn("weight", interval_weight.cast("float"))
weighted_operations.show()

+-----------------+------------+-----------+-------------+---------------+-----------+------------+-------------+-----------+----------+
|       columnName|databaseName|executionId|operationName|physicalPlanKey|eventlogKey|   tableName|timeGenerated|operationId|    weight|
+-----------------+------------+-----------+-------------+---------------+-----------+------------+-------------+-----------+----------+
|   sparkContextID|     default|       1308|PushedFilters|    -1577232127| 1186829784|eventlog_raw|1681305905689|         10|0.46719432|
|     lastModified|     default|       1308|PushedFilters|    -1577232127| 1186829784|eventlog_raw|1681305905689|         11|0.46719432|
|clusterInstanceID|     default|       1308|PushedFilters|    -1577232127| 1186829784|eventlog_raw|1681305905689|         12|0.46719432|
|   sparkContextID|     default|       1308|PushedFilters|    -1577232127| 1186829784|eventlog_raw|1681305905689|         13|0.46719432|
|     lastModified|     default|       13

### 3.2 Aggregate Weighted Operations

In [0]:
# Sum the weights per (databaseName, tableName, columnName) and sort by the
# same three keys.
group_keys = ["databaseName", "tableName", "columnName"]

weight_per_column = weighted_operations.groupBy(*group_keys).agg(
    F.sum(F.col("weight")).alias("weightedSum")
)
method_output = weight_per_column.orderBy(*group_keys)

method_output.show()

+------------+--------------------+-----------------+----------------+-------------------+
|databaseName|           tableName|       columnName|   operationName|        weightedSum|
+------------+--------------------+-----------------+----------------+-------------------+
|     default|        eventlog_raw|        clusterID|   PushedFilters|  0.483463391661644|
|     default|        eventlog_raw|clusterInstanceID|   PushedFilters| 163.97874668240547|
|     default|        eventlog_raw|      eventlogKey|   PushedFilters|   5.79052859544754|
|     default|        eventlog_raw|     lastModified|   PushedFilters|  273.6945593878627|
|     default|        eventlog_raw|   sparkContextID|   PushedFilters| 151.50880932807922|
|     default|monthly_global_oi...|        countryId|   PushedFilters| 0.5894977450370789|
|     default|monthly_global_oi...|      countryName|   PushedFilters|  2.532083049416542|
|     default|monthly_global_oi...|        countryId|   PushedFilters| 0.5504397004842758|

### 3.3 Create Columns Needed for the method_results Table

In [0]:
# Build the rows for the method_results table.
# - methodValue is the weighted sum cast to float.
# - isPartitioned comes from joining on the is_partitioned lookup, which is
#   based on the last occurrence of the filter in the operations table.
# - runId tags every row with the current run.
# The earlier no-op drop of a non-existent "occurrences" column and the
# two-step creation of methodValue are folded into this single select.
method_results = method_output.join(
    is_partitioned, on=["databaseName", "tableName", "columnName"]
).select(
    "databaseName",
    "tableName",
    "columnName",
    F.col("weightedSum").cast("float").alias("methodValue"),
    "isPartitioned",
    F.lit(runId).alias("runId"),
)

method_results.show()

+------------+--------------------+-----------------+-----------+-------------+-----+
|databaseName|           tableName|       columnName|methodValue|isPartitioned|runId|
+------------+--------------------+-----------------+-----------+-------------+-----+
|     default|        eventlog_raw|        clusterID|  0.4834634|        false|    2|
|     default|        eventlog_raw|clusterInstanceID|  163.97874|        false|    2|
|     default|        eventlog_raw|      eventlogKey|   5.790529|        false|    2|
|     default|        eventlog_raw|     lastModified|  273.69455|        false|    2|
|     default|        eventlog_raw|   sparkContextID|   151.5088|        false|    2|
|     default|monthly_global_oi...|        countryId| 0.58949775|        false|    2|
|     default|monthly_global_oi...|      countryName|   2.532083|        false|    2|
|     default|monthly_global_oi...|        countryId|  0.5504397|        false|    2|
|     default|monthly_global_oi...|      countryName| 

### 3.4 Find recommendations Based on the Column with the Highest methodValue per db-table

In [0]:
# One recommendation per (databaseName, tableName): the column with the highest
# methodValue. max() over a struct compares methodValue first, so the struct
# that wins carries the matching columnName and isPartitioned along with it.
# If methodValue is tied, the later struct fields decide.
best_column = F.max(F.struct("methodValue", "columnName", "isPartitioned"))

# The struct fields are pulled out with explicit aliases. Selecting a nested
# field already names the column after the field, so the renames on the
# dotted names that used to follow never matched anything.
method_recommendations = (
    method_results.groupBy("databaseName", "tableName")
    .agg(best_column.alias("max_methodValue_colName_isPartitioned"))
    .select(
        "databaseName",
        "tableName",
        F.col("max_methodValue_colName_isPartitioned.columnName").alias(
            "columnName"
        ),
        F.col("max_methodValue_colName_isPartitioned.methodValue").alias(
            "methodValue"
        ),
        F.col("max_methodValue_colName_isPartitioned.isPartitioned").alias(
            "isPartitioned"
        ),
        F.lit(runId).alias("runId"),
    )
)

method_recommendations.show(truncate=False)

+------------+--------------------------------------------------+---------------+-----------+-------------+-----+
|databaseName|tableName                                         |columnName     |methodValue|isPartitioned|runId|
+------------+--------------------------------------------------+---------------+-----------+-------------+-----+
|default     |eventlog_raw                                      |lastModified   |273.69455  |false        |2    |
|default     |monthly_global_oil_demand_forecast_countries_v0r0 |countryName    |2.532083   |false        |2    |
|default     |monthly_global_oil_demand_forecast_countries_v0r42|countryName    |2.826852   |false        |2    |
|default     |monthly_global_oil_demand_forecast_countries_v0r44|countryId      |0.10836927 |false        |2    |
|default     |operations                                        |eventlogKey    |15.213155  |false        |2    |
|default     |physical_plan_keys                                |physicalPlanKey|25.7280

### 4. Save Data to Tables

In [0]:
# write method_runs information
method_run.write.format("delta").mode("append").saveAsTable("method_runs")

# write method_results information
method_results.write.format("delta").mode("append").saveAsTable("method_results")

# write method_recommendation information
method_recommendations.write.format("delta").mode("append").saveAsTable(
    "method_recommendations"
)

In [0]:
%sql
-- Show every row of method_runs, the table this notebook appends its run information to
select
  *
from
  method_runs

runId,methodName,params,fromTime,toTime,whenRun
2,weightedCount,"{""max_weight"": 1.0, ""min_weight"": 0.0, ""windowStart"": 1680175669202, ""windowEnd"": 1682594869202, ""windowSize"": 4.0}",2023-03-30T11:27:49.202+0000,2023-04-27T11:27:49.202+0000,2023-04-27T11:27:50.275+0000
1,simpleCount,"{""windowStart"": 1680175636634, ""windowEnd"": 1682594836634, ""windowSize"": 4.0}",2023-03-30T11:27:16.634+0000,2023-04-27T11:27:16.634+0000,2023-04-27T11:27:17.713+0000


In [0]:
%sql
-- Show every row of method_results, the table this notebook appends its per-column values to
select
  *
from
  method_results

runId,databaseName,tableName,columnName,methodValue,isPartitioned
2,default,eventlog_raw,clusterID,0.4834634,False
2,default,eventlog_raw,clusterInstanceID,163.97874,False
2,default,eventlog_raw,eventlogKey,5.790529,False
2,default,eventlog_raw,lastModified,273.69455,False
2,default,eventlog_raw,sparkContextID,151.5088,False
2,default,monthly_global_oil_demand_forecast_countries_v0r0,countryId,0.58949775,False
2,default,monthly_global_oil_demand_forecast_countries_v0r0,countryName,2.532083,False
2,default,monthly_global_oil_demand_forecast_countries_v0r42,countryId,0.5504397,False
2,default,monthly_global_oil_demand_forecast_countries_v0r42,countryName,2.826852,False
2,default,monthly_global_oil_demand_forecast_countries_v0r42,etl_year,0.49614263,True


In [0]:
%sql
-- Show every row of method_recommendations, the table this notebook appends its per-table recommendations to
select
  *
from
  method_recommendations

runId,databaseName,tableName,columnName,methodValue,isPartitioned
1,default,eventlog_raw,lastModified,673.0,False
1,default,monthly_global_oil_demand_forecast_countries_v0r0,countryName,12.0,False
1,default,monthly_global_oil_demand_forecast_countries_v0r42,countryName,14.0,False
1,default,monthly_global_oil_demand_forecast_countries_v0r44,countryId,1.0,False
1,default,operations,eventlogKey,27.0,False
1,default,physical_plan_keys,physicalPlanKey,49.0,False
1,default,plant_v0r44,UP_DATE,23.0,True
1,default,queries,physicalPlanKey,20.0,False
2,default,eventlog_raw,lastModified,273.69455,False
2,default,monthly_global_oil_demand_forecast_countries_v0r0,countryName,2.532083,False
