In [0]:
%run "/Workspace/DatabricksMasterclass/Tutorial"

# Databricks Masterclass

id,name,marks
1,aa,30
2,bb,20
3,cc,10


### Access Data

- create a service principal (like a service account in gcp)
- in gcp, we assign all the role/access directly from service account page.
- core difference here, role/access is assigned from each app itself (like bigquery). Service principal only creates the account.

### DB Utilities

**dbutils.fs()**

[FileInfo(path='abfss://source@datalakesoong.dfs.core.windows.net/Sales.csv', name='Sales.csv', size=869537, modificationTime=1755708107000)]

**dbutils.widgets**

# Delta Lake

In [0]:
df_sales.write.format('delta')\
  .mode('append')\
  .option('path', 'abfss://destination@datalakesoong.dfs.core.windows.net/sales')\
  .save()

## Managed vs External Delta Tables

**Database**

In [0]:
%sql
CREATE DATABASE salesDB;

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-6054741448858752>, line 1[0m
[0;32m----> 1[0m get_ipython()[38;5;241m.[39mrun_cell_magic([38;5;124m'[39m[38;5;124msql[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mCREATE DATABASE salesDB;[39m[38;5;130;01m\n[39;00m[38;5;124m'[39m)

File [0;32m/databricks/python/lib/python3.12/site-packages/IPython/core/interactiveshell.py:2541[0m, in [0;36mInteractiveShell.run_cell_magic[0;34m(self, magic_name, line, cell)[0m
[1;32m   2539[0m [38;5;28;01mwith[39;00m [38;5;28mself[39m[38;5;241m.[39mbuiltin_trap:
[1;32m   2540[0m     args [38;5;241m=[39m (magic_arg_s, cell)
[0;32m-> 2541[0m     result [38;5;241m=[39m fn([38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)
[1;32m   2543[0m [38;5;66;03m# The code below prevent

**Managed Table**

In [0]:
%sql
CREATE TABLE salesDB.mantable
(
    id INT,
    name STRING,
    marks INT
)
USING DELTA

In [0]:
%sql
INSERT INTO salesDB.mantable
VALUES
(1, 'aa', 10),
(2, 'bb', 20),
(3, 'cc', 30)

num_affected_rows,num_inserted_rows
3,3


In [0]:
%sql
SELECT * FROM salesdb.mantable

id,name,marks
1,aa,10
2,bb,20
3,cc,30


In [0]:
%sql
DROP TABLE salesdb.mantable

**External Table**

In [0]:
%sql
-- for this part to work we need to create an "access connector for azure databricks". 
-- go back to our storage account -> IAM -> storage blob data contributor (where we add the service principal)
-- add the "access connector for azure databricks" that we created as a member also
-- then go to databricks catalog -> create external location -> access connector ID: <resource ID> 

CREATE TABLE salesDB.exttable
(
  id INT,
  name STRING,
  marks INT
)
USING DELTA
LOCATION 'abfss://destination@datalakesoong.dfs.core.windows.net/salesDB/exttable'

In [0]:
%sql
INSERT INTO salesDB.exttable
VALUES
(1, 'aa', 10),
(2, 'bb', 20),
(3, 'cc', 30)

num_affected_rows,num_inserted_rows
3,3


In [0]:
%sql
SELECT * FROM salesDB.exttable

id,name,marks
1,aa,10
2,bb,20
3,cc,30


## Delta Tables Functionalities

**INSERT**

In [0]:
%sql
INSERT INTO salesDB.exttable
VALUES
(4, 'dd', 10),
(5, 'ee', 20),
(6, 'ff', 30),
(7, 'gg', 10),
(8, 'hh', 20)

num_affected_rows,num_inserted_rows
5,5


In [0]:
%sql
SELECT * FROM salesDB.exttable

id,name,marks
4,dd,10
5,ee,20
6,ff,30
7,gg,10
8,hh,20
1,aa,10
2,bb,20
3,cc,30


**DELETE**

In [0]:
%sql
DELETE FROM salesdb.exttable
WHERE id=8

num_affected_rows
1


**DATA VERSIONING**

In [0]:
%sql
DESCRIBE HISTORY salesDB.exttable;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2025-08-21T14:12:34Z,145245422906190,chris1soong96@gmail.com,DELETE,"Map(predicate -> [""(id#2423 = 8)""])",,List(3605023645371243),0820-173904-9xfbz7or,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3260, numDeletionVectorsUpdated -> 0, numDeletedRows -> 1, scanTimeMs -> 1994, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 1256)",,Databricks-Runtime/16.4.x-scala2.12
2,2025-08-21T14:05:35Z,145245422906190,chris1soong96@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(3605023645371243),0820-173904-9xfbz7or,1.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 5, numOutputBytes -> 1146)",,Databricks-Runtime/16.4.x-scala2.12
1,2025-08-21T14:00:21Z,145245422906190,chris1soong96@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(3605023645371243),0820-173904-9xfbz7or,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 1092)",,Databricks-Runtime/16.4.x-scala2.12
0,2025-08-21T14:00:10Z,145245422906190,chris1soong96@gmail.com,CREATE TABLE,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> false, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,List(3605023645371243),0820-173904-9xfbz7or,,WriteSerializable,True,Map(),,Databricks-Runtime/16.4.x-scala2.12


**TIME TRAVEL**

In [0]:
%sql
RESTORE TABLE salesDB.exttable TO VERSION AS OF 2

table_size_after_restore,num_of_files_after_restore,num_removed_files,num_restored_files,removed_files_size,restored_files_size
2238,2,1,1,1146,1146


In [0]:
%sql
SELECT * FROM salesDB.exttable

id,name,marks
4,dd,10
5,ee,20
6,ff,30
7,gg,10
8,hh,20
1,aa,10
2,bb,20
3,cc,30


**VACUUM**

In [0]:
%sql
-- VACUUM is a dangerous command
-- hard delete if files are older than 7 days (default)
VACUUM salesDB.exttable

path
abfss://destination@datalakesoong.dfs.core.windows.net/salesDB/exttable


In [0]:
%sql
SELECT * FROM salesDB.exttable

id,name,marks
4,dd,10
5,ee,20
6,ff,30
7,gg,10
8,hh,20
1,aa,10
2,bb,20
3,cc,30


**VACUUM RETAIN 0 HOURS**

In [0]:
%sql
-- delete everything now
-- VACUUM salesDB.exttable RETAIN 0 HOURS

### **DELTA Table Optimization**

**OPTIMIZE**

In [0]:
%sql
OPTIMIZE salesDB.exttable

path,metrics
abfss://destination@datalakesoong.dfs.core.windows.net/salesDB/exttable,"List(1, 2, List(1179, 1179, 1179.0, 1, 1179), List(1092, 1146, 1119.0, 2, 2238), 0, null, null, 0, 1, 2, 0, true, 0, 0, 1755788703216, 1755788705989, 4, 1, null, List(0, 0), null, 3, 3, 218, 0, null)"


In [0]:
%sql
SELECT * FROM salesDB.exttable

id,name,marks
1,aa,10
2,bb,20
3,cc,30
4,dd,10
5,ee,20
6,ff,30
7,gg,10
8,hh,20


**ZORDER BY**

In [0]:
%sql
OPTIMIZE salesDB.exttable ZORDER BY id

path,metrics
abfss://destination@datalakesoong.dfs.core.windows.net/salesDB/exttable,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 1179), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1755788961883, 1755788963958, 4, 0, null, List(0, 0), null, 3, 3, 0, 0, null)"


In [0]:
%sql
SELECT * FROM salesDB.exttable

id,name,marks
1,aa,10
2,bb,20
3,cc,30
4,dd,10
5,ee,20
6,ff,30
7,gg,10
8,hh,20


### AUTO LOADER

**Streaming Dataframe**

In [0]:
df = spark.readStream.format("cloudFiles")\
  .option('cloudFiles.format', 'parquet')\
  .option('cloudFiles.schemaLocation', 'abfss://aldestination@datalakesoong.dfs.core.windows.net/checkpoint')\
  .load('abfss://alsource@datalakesoong.dfs.core.windows.net')

In [0]:
df.writeStream.format('delta')\
    .option('checkpointLocation', 'abfss://aldestination@datalakesoong.dfs.core.windows.net/checkpoint')\
    .trigger(processingTime='5 seconds')\
    .start('abfss://aldestination@datalakesoong.dfs.core.windows.net/data')

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7f58523d9d30>