In [0]:
# Unity Catalog managed tables
# A managed table is a Delta Lake table whose metadata and data files are both stored in a
# Unity Catalog managed location, with Unity Catalog handling the table's storage lifecycle,
# governance controls, and built-in performance optimizations.
#
# A Databricks table is a named metadata object that represents structured data organized into
# rows and columns, mapping to underlying files in cloud storage and enabling SQL queries,
# schema enforcement, and management features.

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

# Explicit schema: one StructField per CSV column (name, data type, nullable flag).
# nullable defaults to True, so only the non-nullable key column spells it out.
schema = StructType(
    [
        StructField("country_id", IntegerType(), nullable=False),  # declared as not accepting nulls
        StructField("name", StringType()),
        StructField("nationality", StringType()),
        StructField("country_code", StringType()),
        StructField("iso_alpha2", StringType()),
        StructField("capital", StringType()),
        StructField("population", IntegerType()),
        StructField("area_km2", IntegerType()),
        StructField("region_id", IntegerType()),
        StructField("sub_region_id", IntegerType()),
    ]
)

# Read the CSV with the schema above; the first line of the file is treated as the header.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load(path)
)
df.display()


In [0]:
# Save the DataFrame loaded above as a Unity Catalog managed table.
# mode("overwrite") keeps this cell re-runnable: the default save mode raises an error when the
# table already exists, so a second "Run All" would stop here. Overwriting also resets the
# table to a single copy of the CSV data before the append/overwrite demos that follow.

df.write.mode("overwrite").saveAsTable("population_metrics.default.countries_population")

In [0]:
# Query a table using the Python API: read the managed table back into a DataFrame.

df = spark.read.table(
    "population_metrics.default.countries_population"
)
df.display()

In [0]:
# Append the rows of the DataFrame above to the existing managed table.

(
    df.write
    .mode("append")
    .saveAsTable("population_metrics.default.countries_population")
)

In [0]:
# Re-read the table to inspect its contents after the append.
df = spark.read.table(
    "population_metrics.default.countries_population"
)
df.display()

In [0]:
# Replace the table's contents with the rows of the current DataFrame.
(
    df.write
    .mode("overwrite")
    .saveAsTable("population_metrics.default.countries_population")
)

In [0]:
# Re-read the table to inspect its contents after the overwrite.
df = spark.read.table(
    "population_metrics.default.countries_population"
)
df.display()

In [0]:
%sql
-- Query the managed table directly with SQL
SELECT *
FROM population_metrics.default.countries_population


In [0]:
# Run a SQL query with the Python API and display the result.

query = "SELECT * FROM population_metrics.default.countries_population"
spark.sql(query).display()

In [0]:
%sql
-- Create a managed table using SQL.
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails once the table exists.
-- Replacing (rather than IF NOT EXISTS) also resets the table to empty, so the append
-- performed later in the notebook does not accumulate duplicate rows across runs.

CREATE OR REPLACE TABLE population_metrics.default.countries_population_2
(
  country_id int,
  name string,
  nationality string,
  country_code string,
  iso_alpha2 string,
  capital string,
  population int,
  area_km2 int,
  region_id int,
  sub_region_id int
)

In [0]:
# Query the SQL-created table through the Python API.
query = "SELECT * FROM population_metrics.default.countries_population_2"
spark.sql(query).display()

In [0]:
# Insert the data: first load the source rows from the original table into a DataFrame.
df = spark.sql(
    "select * from population_metrics.default.countries_population"
)
df.display()

In [0]:
# Append the loaded rows into the SQL-created table.
(
    df.write
    .mode("append")
    .saveAsTable("population_metrics.default.countries_population_2")
)

In [0]:
%sql
-- Create a managed table from a query result (CTAS).
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE ... AS fails once the table
-- exists, while replacing rebuilds it from the current contents of countries_population_2.
CREATE OR REPLACE TABLE population_metrics.default.countries_population_3
AS
SELECT * FROM population_metrics.default.countries_population_2

In [0]:
%sql
-- Inspect the table created from the query result
SELECT *
FROM population_metrics.default.countries_population_3

In [0]:
# Creating views with SQL
# A view is a read-only object that is the result of a query over one or more tables and views in a Unity Catalog metastore.
# In Azure Databricks, a view is equivalent to a Spark DataFrame persisted as an object in a schema.
# Creating a view doesn't process or write any data; only the query text is registered to the metastore in the associated schema.

# Top 10 countries by population


In [0]:
%sql
-- Top 10 countries by population, largest first
SELECT *
FROM population_metrics.default.countries_population
ORDER BY population DESC
LIMIT 10

In [0]:
%sql
-- Register the top-10-by-population query as a view.
-- OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW fails once the view exists.
CREATE OR REPLACE VIEW population_metrics.default.top_10_population AS
SELECT * FROM population_metrics.default.countries_population
ORDER BY population DESC
LIMIT 10

In [0]:
%sql
-- Query the view like a table
SELECT *
FROM population_metrics.default.top_10_population

In [0]:
# Query the view with SQL through the Python API.
query = "SELECT * FROM population_metrics.default.top_10_population"
spark.sql(query).display()

In [0]:
# Return the view as a DataFrame and display it.
# (A stray "3" after display() previously made this cell a SyntaxError.)

spark.read.table("population_metrics.default.top_10_population").display()

In [0]:
# Create catalogs, schemas and volumes with SQL
# Catalog -> Schemas -> Volumes

In [0]:
%sql

-- Create catalog.
-- IF NOT EXISTS keeps this cell re-runnable: a plain CREATE CATALOG fails once the catalog exists.

CREATE CATALOG IF NOT EXISTS test_catalog


In [0]:
# Create DataFrames from Python data structures
# data_1: row-oriented input, one [name, age, city] list per person.

data_1 = [
    [person, age, city]
    for person, age, city in (
        ("Alice", 30, "London"),
        ("Bob", 25, "New York"),
        ("Carol", 27, "San Francisco"),
        ("Dave", 35, "Berlin"),
    )
]

In [0]:
# data_2: the same people as a list of dictionaries keyed by column name.
data_2 = [
    dict(name=person, age=age, city=city)
    for person, age, city in (
        ("Alice", 30, "London"),
        ("Bob", 25, "New York"),
        ("Carol", 27, "San Francisco"),
        ("Dave", 35, "Berlin"),
    )
]

In [0]:
# The list-of-lists rows carry no column names, so a schema string supplies names and types.
schema = "name string, age integer, city string"
people_from_lists = spark.createDataFrame(data=data_1, schema=schema)
people_from_lists.display()

In [0]:
# Build a DataFrame from the list of dictionaries; no explicit schema is passed here.
people_from_dicts = spark.createDataFrame(data_2)
people_from_dicts.display()

In [0]:
# Load the countries table used by the column-reference examples below.
df = spark.read.table(
    "population_metrics.default.countries_population"
)
df.display()

In [0]:
# Ways to reference columns: plain column-name strings.
df.select(
    "country_id",
    "name",
    "population",
).display()

In [0]:
# Ways to reference columns: keep the projection in its own DataFrame before displaying it.
df_selected = df.select(
    "country_id",
    "name",
    "population",
)
df_selected.display()

In [0]:
# Display all the columns using the "*" wildcard.
all_columns = df.select("*")
all_columns.display()

In [0]:
# Using bracket notation: df['column'] yields a column object that select() accepts.
df.select(
    df['name'],
    df['country_id'],
    df['population'],
).display()

In [0]:
# Show the Python type returned by bracket access on a DataFrame.
type(df['name'])

In [0]:
# Column objects support additional methods, e.g. alias() to rename a column in the output.
df.select(
    df['name'].alias("country"),
    df['country_id'].alias("id"),
    df['population'],
).display()

In [0]:
# Reference columns via attribute access (df.column_name).
df.select(
    df.country_id.alias("id"),
    df.name,
    df.population,
).display()

In [0]:
# Using the col() function from the PySpark functions module
from pyspark.sql.functions import col

df.select(
    col("country_id").alias("id"),
    col("name"),
    col("population"),
).display()

In [0]:
# Column operations with select() and selectExpr()
# Starting point: the four plain columns the expressions below build on.

df.select(
    "country_id",
    "name",
    "population",
    "area_km2",
).display()

In [0]:
# Column helpers used by the expression examples that follow.
from pyspark.sql.functions import col, upper


In [0]:
# Mix plain columns with derived column expressions in a single select().
df.select(
    "country_id",
    upper("name").alias("country_name"),
    "population",
    "area_km2",
    # column expression: people per square kilometre
    (col("population") / col("area_km2")).alias("population_density"),
).display()

In [0]:
# selectExpr() projects a set of SQL expressions and returns a new DataFrame.
density_expressions = [
    "country_id",
    "upper(name) as country_name",  # upper is a SQL function
    "population",
    "area_km2",
    "population/area_km2 as population_density",  # SQL arithmetic expression
]
df.selectExpr(*density_expressions).display()

In [0]:
# Column operations with withColumn() and withColumns()
# - withColumn() adds or replaces a single column in a DataFrame.
# - withColumns() adds or replaces multiple columns at once in a cleaner way.

# Show the unmodified DataFrame for comparison with the cells that follow.
df.display()


In [0]:
# Replace the existing "name" column with its upper-cased value.
upper_name_df = df.withColumn("name", upper("name"))
upper_name_df.display()

In [0]:
# Chain two withColumn() calls, then keep only the columns of interest.
(
    df
    .withColumn("country_name", upper("name"))
    .withColumn("population_density", col("population") / col("area_km2"))
    .select("country_name", "population", "area_km2", "population_density")
    .display()
)

In [0]:
# The same steps as the chained version, with one named DataFrame per step.
df_1 = df.withColumn("country_name", upper(col("name")))
df_2 = df_1.withColumn(
    "population_density",
    col("population") / col("area_km2"),
)
df_3 = df_2.select(
    "country_name",
    "population",
    "area_km2",
    "population_density",
)

df_3.display()  # four DataFrames involved: df, df_1, df_2, df_3

In [0]:
# Same projection as the step-by-step withColumn() version, expressed as a single select().
# upper() is applied to the name so country_name matches that version's output.
df.select(
    upper(df.name).alias("country_name"),
    df.population,
    df.area_km2,
    (df.population / df.area_km2).alias("population_density")
).display() # only one intermediate-free DataFrame

In [0]:
# withColumns() takes a dictionary of {new column name: column expression}.
derived_columns = {
    "country_name": upper("name"),
    "population_density": col('population') / col('area_km2'),
}
(
    df.withColumns(derived_columns)
    .select("country_name", "population", "area_km2", "population_density")
    .display()
)

In [0]:
# Renaming columns
# Option 1: alias() inside select(), which also restricts the output to the listed columns.

df = spark.read.table("population_metrics.default.countries_population")

df.select(
    df.country_id.alias("id"),
    df.name.alias("country_name"),
    "population",
    "area_km2",
).display()

In [0]:
# Option 2: chain withColumnRenamed(), one call per renamed column.
(
    df
    .withColumnRenamed("country_id", "id")
    .withColumnRenamed("name", "country_name")
    .display()
)

In [0]:
# Option 3: withColumnsRenamed() takes an {existing name: new name} dictionary.
column_renames = {
    "country_id": "id",
    "name": "country_name",
}
df.withColumnsRenamed(column_renames).display()

In [0]:
# Changing data types
# Start by listing the current (column name, data type) pairs.
df.dtypes

In [0]:
# col() builds column expressions; the data type classes used for casting are part of
# pyspark.sql.types, not pyspark.sql.functions.
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, StringType

In [0]:
# Cast the numeric columns to string using DataType objects; other columns pass through.
df_1 = df.select(
    col("country_id").cast(StringType()).alias("country_id"),
    "name",
    "nationality",
    "country_code",
    "iso_alpha2",
    "capital",
    col("population").cast(StringType()),
    col("area_km2").cast(StringType()),
    col("region_id").cast(StringType()),
    col("sub_region_id").cast(StringType()),
)
df_1.dtypes

In [0]:
# Same casts, this time passing the target type as the string "string".
string_cast_columns = {"country_id", "population", "area_km2", "region_id", "sub_region_id"}
df_2 = df.select(
    *[
        col(column_name).cast("string") if column_name in string_cast_columns else col(column_name)
        for column_name in (
            "country_id",
            "name",
            "nationality",
            "country_code",
            "iso_alpha2",
            "capital",
            "population",
            "area_km2",
            "region_id",
            "sub_region_id",
        )
    ]
)
df_2.dtypes

In [0]:
# Same casts via withColumns(): each listed column is replaced in place by its string version,
# and every unlisted column is kept unchanged.
# region_id is included so the result matches the select()-based versions (df_1, df_2).
df_3 = df.withColumns(
{
    "country_id": col("country_id").cast(StringType()),
    "population": col("population").cast(StringType()),
    "area_km2": col("area_km2").cast(StringType()),
    "region_id": col("region_id").cast(StringType()),
    "sub_region_id": col("sub_region_id").cast(StringType())
}
)
df_3.dtypes