In [0]:
# Unity Catalog managed tables
# A managed table is a Delta Lake table whose metadata and data files are both stored in a Unity Catalog managed location,
# with Unity Catalog handling the table's storage lifecycle, governance controls, and built-in performance optimizations.

# A Databricks table is a named metadata object that represents structured data organized into rows and columns, mapping to underlying files in cloud storage and enabling SQL queries, schema enforcement, and management features.

# Location of the raw countries CSV inside a Unity Catalog volume.
path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Explicit schema for the CSV, built from (column name, Spark type, nullable) triples
# so that column types are declared up front instead of being inferred.
schema = StructType(
    [
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in (
            # country_id is the only column declared non-nullable.
            # NOTE(review): file-based readers may relax nullable=False on load -
            # confirm before relying on this as a NOT NULL guarantee.
            ("country_id", IntegerType(), False),
            ("name", StringType(), True),
            ("nationality", StringType(), True),
            ("country_code", StringType(), True),
            ("iso_alpha2", StringType(), True),
            ("capital", StringType(), True),
            ("population", IntegerType(), True),
            ("area_km2", IntegerType(), True),
            ("region_id", IntegerType(), True),
            ("sub_region_id", IntegerType(), True),
        )
    ]
)

# Read the CSV with the declared schema; the first line of the file is a header row.
df = spark.read.schema(schema).option("header", True).csv(path)
display(df)


In [0]:
# Save the DataFrame above as a Unity Catalog managed table.
# The save mode is set explicitly to "overwrite": the default mode raises an error
# when the table already exists, which made this cell fail on every re-run of the notebook.
# Overwriting rebuilds the table from the DataFrame each time the cell runs.
df.write.mode("overwrite").saveAsTable("population_metrics.default.countries_population")

In [0]:
# Load the managed table back into a DataFrame through the Python API and render it.
df = spark.table("population_metrics.default.countries_population")
display(df)

In [0]:
# Append the rows currently held in df to the existing managed table
# (append mode adds rows and keeps what is already stored).
df.write.saveAsTable("population_metrics.default.countries_population", mode="append")

In [0]:
# Re-read the table so df reflects its contents after the append, then render it.
df = spark.table("population_metrics.default.countries_population")
display(df)

In [0]:
# Replace the table's contents with the rows currently held in df.
df.write.saveAsTable("population_metrics.default.countries_population", mode="overwrite")

In [0]:
# Re-read the table so df reflects its contents after the overwrite, then render it.
df = spark.table("population_metrics.default.countries_population")
display(df)

In [0]:
%sql
-- Query the managed table directly with SQL: every column, every row.
SELECT
  *
FROM
  population_metrics.default.countries_population


In [0]:
# The same SQL query issued from Python: spark.sql returns a DataFrame, which is then rendered.
display(
    spark.sql("SELECT * FROM population_metrics.default.countries_population")
)

In [0]:
%sql
-- Create a managed table using SQL (column definitions only, no data yet).
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails once the
-- table exists, whereas this recreates it empty each time the cell runs.
CREATE OR REPLACE TABLE population_metrics.default.countries_population_2
(
  country_id INT,
  name STRING,
  nationality STRING,
  country_code STRING,
  iso_alpha2 STRING,
  capital STRING,
  population INT,
  area_km2 INT,
  region_id INT,
  sub_region_id INT
)

In [0]:
# Render the current contents of the SQL-created table.
display(
    spark.sql("SELECT * FROM population_metrics.default.countries_population_2")
)

In [0]:
# Fetch the rows of the source table into df and render them;
# this cell only reads - nothing is written to any table here.
df = spark.sql("select * from population_metrics.default.countries_population")
display(df)

In [0]:
# Insert the rows held in df into the SQL-created table, keeping any rows already there.
df.write.saveAsTable(
    name="population_metrics.default.countries_population_2",
    mode="append",
)

In [0]:
%sql
-- Create a managed table from a query result (CREATE TABLE ... AS SELECT).
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails once the
-- table exists, whereas this rebuilds it from the current rows of countries_population_2.
CREATE OR REPLACE TABLE population_metrics.default.countries_population_3
AS
SELECT * FROM population_metrics.default.countries_population_2

In [0]:
%sql
-- Show the rows of the table created from the query result.
SELECT
  *
FROM
  population_metrics.default.countries_population_3

In [0]:
# Creating views with SQL
# A view is a read-only object that is the result of a query over one or more tables and views in a Unity Catalog metastore.
# In Azure Databricks, a view is equivalent to a Spark DataFrame persisted as an object in a schema.
# Creating a view doesn't process or write any data; only the query text is registered to the metastore in the associated schema.

# Top 10 countries by population


In [0]:
%sql
-- Ten rows with the largest population values, largest first.
SELECT
  *
FROM
  population_metrics.default.countries_population
ORDER BY
  population DESC
LIMIT 10

In [0]:
%sql
-- Register the top-10-by-population query as a view.
-- OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW fails once the
-- view exists, whereas this re-registers the query text each time.
CREATE OR REPLACE VIEW population_metrics.default.top_10_population AS
SELECT * FROM population_metrics.default.countries_population
ORDER BY population DESC
LIMIT 10

In [0]:
%sql
-- Query the view exactly like a table.
SELECT
  *
FROM
  population_metrics.default.top_10_population

In [0]:
# Query the view from Python with spark.sql and render the resulting DataFrame.
display(
    spark.sql("SELECT * FROM population_metrics.default.top_10_population")
)

In [0]:
# Return the view as a DataFrame: the view is read by name with spark.read.table,
# the same call used for tables earlier in this notebook.
# (A stray "3" after .display() made this cell a syntax error; it has been removed.)
spark.read.table("population_metrics.default.top_10_population").display()

In [0]:
# Create catalogs, schemas and volumes with SQL
# Catalog -> Schemas -> Volumes

In [0]:
%sql

-- Create catalog
-- IF NOT EXISTS keeps this cell re-runnable: a plain CREATE CATALOG fails once
-- the catalog exists, whereas this leaves an existing catalog untouched.

CREATE CATALOG IF NOT EXISTS test_catalog


In [0]:
# Create DataFrames from Python data structures
#
# data_1 holds each row as a positional list of three values; the rows carry
# no column names of their own.
data_1 = [
    list(record)
    for record in (
        ("Alice", 30, "London"),
        ("Bob", 25, "New York"),
        ("Carol", 27, "San Francisco"),
        ("Dave", 35, "Berlin"),
    )
]

In [0]:
# data_2 holds each row as a dict keyed by "name", "age" and "city",
# so every row carries its own column names.
data_2 = [
    dict(zip(("name", "age", "city"), record))
    for record in (
        ("Alice", 30, "London"),
        ("Bob", 25, "New York"),
        ("Carol", 27, "San Francisco"),
        ("Dave", 35, "Berlin"),
    )
]

In [0]:
# Column names and types for the list-based rows, given as a DDL-style string.
schema = "name string, age integer, city string"

# Build a DataFrame from the list rows plus the schema string and render it.
display(spark.createDataFrame(data_1, schema))

In [0]:
# Build a DataFrame from the dict rows without passing a schema, then render it.
display(spark.createDataFrame(data=data_2))