# Clean prior run data files

In [0]:
# Location under which this notebook writes its datasets
DELTALAKE_PATH = "/tmp/ch-13/data"

# Clear out files left behind by a prior run of this notebook
dbutils.fs.rm("/tmp/ch-13/", recurse=True)

# Drop the ch_13 database (with its tables), recreate it, and switch to it
for _stmt in (
    "DROP DATABASE IF EXISTS ch_13 CASCADE",
    "CREATE DATABASE ch_13 ",
    "USE ch_13",
):
    spark.sql(_stmt)

# Remove any table data still present at the configured path
dbutils.fs.rm(DELTALAKE_PATH, recurse=True)

# Migration to Delta Operational Simplification
* No need to <b> Refresh Table</b> as Delta always returns the most up-to-date information
* No need to run Table repair commands such as <b> MSCK </b> or babysit partition creation/deletion
* Because listing a large number of files in a directory is often slower than reading the list of files from the transaction log, you can use a generic WHERE clause instead of trying to optimize by loading single partitions explicitly
  * Use spark.read.delta("/data").where("partitionKey = 'value'")
  * as opposed to spark.read.format("parquet").load("/data/partitionKey=value")
* The delta transactional log is the source of truth for the state of data
  * Do not manually manipulate the data/log files as you can invalidate the data
* Use the language, services, connectors, or database of your choice with Delta Lake and Delta Sharing. 
  * https://delta.io/integrations/

# Do you have the buy in for your Data Initiative?
* Is the Business Use Case and value proposition clearly articulated?
* Have business requirements been captured, prioritized and agreed upon? (Functional & Non-Functional)
* Are stakeholders and champions vested with a timeline and budget?

# Developing your data product/service for production
* Have business requirements been mapped to clear technical requirements? (Functional & Non-Functional)
* Access to Data?
  * Identify sensitive data sets
  * Ascertain user privileges
* Are folks available and trained to work on the big data initiative?
  * Identify enablement and training needs
* For a multi-tenancy scenario, have all access considerations been accounted for?
  * Understanding the collaboration and isolation needs of data teams

# Data Migration Plays
* Use vendor tools to automate the bulk of migrating from other data systems such as Hadoop, Oracle, Netezza, Teradata
  * Data storage
  * Metadata storage
  * Code migration around compatible libraries and APIs
  * Data processing and transformations
  * Security
  * Orchestration of jobs and workflows
* Fix workloads that cannot be automated
* Establish performance benchmarks before and after
* Tune the expensive workloads
* Run workloads in parallel before switching off older systems

* Convert a Parquet table to Delta
  * CONVERT TO DELTA `<parquet table>`
* Convert files to Delta format and create a table using that data
  * CONVERT TO DELTA parquet.<`/data-path/`>
  * CREATE TABLE `<delta table>` USING DELTA LOCATION <'/data-path/'>
* Convert a non-Parquet format such as ORC to Parquet and then to Delta
  * Read dataframe as orc and save as parquet
  * CREATE TABLE `<parquet table>` USING PARQUET LOCATION <'/data-path/'>
  * CONVERT TO DELTA `<parquet table>`
* Generate a manifest file that can be read by other processing engines
  * GENERATE symlink_format_manifest FOR TABLE `<delta table>`

## Parquet to Delta
* Conversion in place
* Creates a Delta Lake transaction log that tracks all files in the provided directory and automatically infers the schema
* Collects statistics to improve query performance on the converted Delta table. 
* If table name is provided, the metastore is also updated 
* Run Additional OPTIMIZE/ZORDER for better performance on Delta
* Caution
  * Avoid changes to the data files during the conversion process. 
  * If multiple external tables share the same underlying Parquet directory, all of them should be converted
  * Turning off stats collection using NO STATISTICS will hasten the conversion process
  * If table has partitions, use the PARTITIONED BY clause on appropriate column/s

In [0]:
# Simulate a Parquet data path and a Parquet table registered on top of it
columns = ["State", "Name", "Age"]
data = [
    ("TX", "Jack", 25),
    ("NV", "Jane", 66),
    ("CO", "Bill", 79),
    ("CA", "Tom", 53),
    ("WY", "Shawn", 45),
]
age_df = spark.sparkContext.parallelize(data).toDF(columns)

# Persist the sample rows as Parquet files under the notebook's data path
age_location = f"{DELTALAKE_PATH}/demographic"
age_df.write.save(age_location, format="parquet")

# Register a table over those files, then describe it
s_sql = f"CREATE TABLE IF NOT EXISTS demographic USING parquet LOCATION '{age_location}'"
spark.sql(s_sql)
spark.sql("DESCRIBE EXTENDED demographic")

In [0]:
# Describe the table and print only the rows whose col_name matches 'Provider'
df = spark.sql("DESCRIBE EXTENDED demographic")
df.where(df["col_name"].like("Provider")).show()

In [0]:
# Run CONVERT TO DELTA on the `demographic` table created from Parquet files
spark.sql("CONVERT TO DELTA demographic")

In [0]:
# Describe the table again after the conversion and print its 'Provider' rows
df = spark.sql("DESCRIBE EXTENDED demographic")
df.where(df["col_name"].like("Provider")).show()

## Non-Parquet, Eg. ORC to Delta
* first convert to parquet, then to delta

In [0]:
# Write the same sample rows in ORC format to their own directory
age_location_orc = f"{DELTALAKE_PATH}/demographic_orc"
age_df.write.save(age_location_orc, format="orc")

# Register an ORC table over that directory
s_sql = f"CREATE TABLE IF NOT EXISTS demographic_orc USING ORC LOCATION '{age_location_orc}'"
spark.sql(s_sql)

# Print the 'Provider' rows of the table description
df_orc = spark.sql("DESCRIBE EXTENDED demographic_orc")
df_orc.where(df_orc["col_name"].like("Provider")).show()

In [0]:
# Step 1 of ORC -> Delta: read the ORC table and rewrite its rows as Parquet
orc_df = spark.sql("SELECT * FROM demographic_orc")
age_location_orc_pq = f"{DELTALAKE_PATH}/demographic_orc_pq"
orc_df.write.save(age_location_orc_pq, format="parquet")

# Register a Parquet table over the rewritten files
s_sql = f"CREATE TABLE IF NOT EXISTS demographic_orc_pq USING parquet LOCATION '{age_location_orc_pq}'"
spark.sql(s_sql)

# Print the 'Provider' rows of the table description
df_orc_pq = spark.sql("DESCRIBE EXTENDED demographic_orc_pq")
df_orc_pq.where(df_orc_pq["col_name"].like("Provider")).show()

In [0]:
# Step 2 of ORC -> Delta: convert the Parquet copy in place
spark.sql("CONVERT TO DELTA demographic_orc_pq")

# Describe the converted table and print its 'Provider' rows
df_orc_pq_delta = spark.sql("DESCRIBE EXTENDED demographic_orc_pq")
df_orc_pq_delta.where(df_orc_pq_delta["col_name"].like("Provider")).show()

## Undo Conversion Operation
* VACUUM delta.`<path-to-table>` RETAIN 0 HOURS
* Delete the <path-to-table>/_delta_log directory.

# Capacity Planning
* Understand Consumption patterns
  * Number of total users & concurrent users
  * Benchmark known and representative queries
* Understand Data Volumes & Processing needs 
  * per day/per month/per year
  * Establish a yearly projection
  * Extend to multi-year with buffer for growth of use cases 
* Benchmarking to ascertain Cluster type and sizing
* How many environments ?
  * Dev/Staging/Prod ?
  * Planning for Disaster Recovery

# Data Democratization
* Via Policy & Process
* Identify needs for Delta Data Sharing

# Managing & Monitoring
* Audit Logs
* Cluster Logs
* Spark Metrics
* System Metrics
* Custom Logging

# Establish COE
* Responsible for
  * Infrastructure, on-prem/cloud strategy
  * Centralized Governance and Security
  * Approving architecture blueprints
  * Enablement/Training
  * Reporting on usage and chargeback
  * Automation