In [0]:
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    FloatType,
    BooleanType,
    TimestampType,
)

### 1. Drop the Existing Tables

In [0]:
# Destructive reset: drops the tables built from the eventlog files so they can
# be rebuilt from scratch. Disabled by default; set the flag to True and run
# this cell manually when a full rebuild is wanted.
DROP_EVENTLOG_TABLES = False

if DROP_EVENTLOG_TABLES:
    for table_name in ("eventlog_raw", "queries", "operations", "physical_plan_keys"):
        # IF EXISTS keeps the reset re-runnable: a table that is already gone
        # no longer aborts the cell halfway through the list.
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    print("Dropped the tables")

Dropped the tables


In [0]:
# Destructive reset: drops the method output tables (runs, recommendations,
# results) so they can be recreated empty by the CREATE TABLE cell below.
# Disabled by default; set the flag to True and run this cell manually.
DROP_METHOD_TABLES = False

if DROP_METHOD_TABLES:
    for table_name in ("method_runs", "method_recommendations", "method_results"):
        # IF EXISTS keeps the reset re-runnable: a table that is already gone
        # no longer aborts the cell halfway through the list.
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    print("Dropped the tables")

Dropped the tables


### 2. Create New Tables
* method_runs - table with metadata for each method\_run
  * **runId**: unique id for the method run
  * **methodName**: name of the method that was used
  * **params**: parameters used for the method
  * **fromTime**: time when the interval "starts"
  * **toTime**: time when the interval "ends"
  * **whenRun**: time when the method was run
* method_recommendations - table with column recommendations for each table in each method\_run
  * **runId**: same id as the method\_runs table
  * **databaseName**: database that the table belongs to
  * **tableName**: the table that the recommendation applies to
  * **columnName**: the column that is recommended to partition on based on the method run
  * **methodValue**: value associated with the specific method
  * **isPartitioned**: true/false whether the recommended column is already partitioned on
* method_results - table with metrics for each column in each table for each method run
  * **runId**: same id as the method\_runs table
  * **databaseName**: database that the relevant table belongs to
  * **tableName**: table name of relevant table
  * **columnName**: column name of relevant column
  * **methodValue**: value associated with the specific method
  * **isPartitioned**: true/false whether the column is already partitioned on



```Note:``` the method_runs.**params** column will be used at the start, but if we notice that we need a column for each parameter (to make it easier to filter on specific parameters at a later stage), we can easily explode the JSON and create columns instead.

```Note2:``` the multiple methods implemented must **NOT** be run in parallel when creating workflows. The reason is that runId might get duplicated, making it no longer unique for each "run".

In [0]:
%sql
-- Metadata for each method run. params stores the parameters as a JSON string.
CREATE TABLE IF NOT EXISTS method_runs (
  runId      INT,
  methodName STRING,
  params     STRING,
  fromTime   TIMESTAMP,
  toTime     TIMESTAMP,
  whenRun    TIMESTAMP
);

-- Column recommendations for each table in each method run.
CREATE TABLE IF NOT EXISTS method_recommendations (
  runId         INT,
  databaseName  STRING,
  tableName     STRING,
  columnName    STRING,
  methodValue   FLOAT,
  isPartitioned BOOLEAN
);

-- Metrics for each column in each table for each method run.
CREATE TABLE IF NOT EXISTS method_results (
  runId         INT,
  databaseName  STRING,
  tableName     STRING,
  columnName    STRING,
  methodValue   FLOAT,
  isPartitioned BOOLEAN
);