In [0]:
# Landing-zone CSV file that the read examples below load.
path = (
    "/Volumes/population_metrics/landing/datasets/countries_dataset"
    "/csv_data/countries_population/countries_population.csv"
)

In [0]:
# Evaluating `spark.read` on its own just shows the reader object; nothing is loaded yet.
spark.read # data frame reader object

In [0]:
# Plain CSV load with no reader options, so Spark's CSV defaults apply
# (in particular the first line is not used as column names).
df = spark.read.format("csv").load(path)

In [0]:
# Show the Python class of the object returned by the reader.
type(df)

In [0]:
# Use the first line as column names. "," is already the default separator,
# so `sep` is spelled out only for illustration.
# Equivalent keyword form: spark.read.csv(path, header=True, sep=",")
df = spark.read.options(header=True, sep=",").csv(path)
df.display()

In [0]:
# Generic loader: the format is named explicitly and the reader option is
# handed to load() as a keyword argument.
df = spark.read.format("csv").load(path, header=True)
df.display()

In [0]:
# more examples
# The load target here is a directory rather than a single CSV file.
df = (
    spark.read
    .format("csv")
    .load("/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population_partitioned")
)

df.display()

In [0]:
# Point `path` back at the single-file CSV used by the schema examples.
path = (
    "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data"
    "/countries_population/countries_population.csv"
)

In [0]:
# Header-aware read; no schema is supplied or inferred here.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load(path)
)

In [0]:
# List (column name, type name) pairs for the DataFrame loaded without a schema.
df.dtypes

In [0]:
# describe() only builds a new DataFrame of summary statistics; on its own the
# cell would show just the object's repr, so render the result explicitly.
df.describe().display()

In [0]:
# Print the column names, types and nullability as a tree.
df.printSchema()

In [0]:
# Render the DataFrame with the Databricks table viewer.
df.display()

In [0]:
# inferSchema asks Spark to determine the column data types from the data;
# it is not efficient when working with large data sets.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(path)
)
df.display()

In [0]:
# Inspect the column types again, this time after reading with inferSchema=True.
df.dtypes

In [0]:

# Define the schema with SQL DDL syntax. DDL (Data Definition Language) is the
# part of SQL that creates, alters, and deletes database objects.
df = (
    spark.read
    .format("csv")
    .schema(
        "country_id int, name string, nationality string, "
        "country_code string, iso_alpha2 string, capital string, "
        "population int, area_km2 int, region_id int, sub_region_id int"
    )
    .options(header=True)
    .load(path)
)

In [0]:
# Same DDL schema as a multi-line string, one column per line for readability.
schema = """
        country_id int, 
        name string, 
        nationality string, 
        country_code string, 
        iso_alpha2 string,
        capital string, 
        population int, 
        area_km2 int, 
        region_id int, 
        sub_region_id int
"""

# Header-aware CSV read using the DDL string above.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load(path)
)

In [0]:
# StructField -> defines a single column in a spark schema
# specifies the name of the field, data type of the field, nullable flag, metadata
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# Programmatic schema: a StructType wrapping one StructField per column.
# The third StructField argument is the nullable flag.
# NOTE(review): file-based readers may relax nullable=False to nullable on
# load -- confirm with df.printSchema() before relying on it.
schema = StructType( # StructType object
    [
        StructField("country_id", IntegerType(), False), # each StructField defines a column; False = cannot accept null values
        StructField("name", StringType(), True),
        StructField("nationality", StringType(), True),
        StructField("country_code", StringType(), True),
        StructField("iso_alpha2", StringType(), True),
        StructField("capital", StringType(), True),
        StructField("population", IntegerType(), True),
        StructField("area_km2", IntegerType(), True),
        StructField("region_id", IntegerType(), True),
        StructField("sub_region_id", IntegerType(), True)
    ]
)

In [0]:
# Read the CSV with the StructType schema instead of a DDL string.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .options(header=True)
    .load(path)
)

In [0]:
# Write Data Frames to CSV
# Target directory for the CSV write examples.
output_path = (
    "/Volumes/population_metrics/landing/datasets/output_dataset"
    "/csv/countries_population"
)

In [0]:
# option 1: format-specific shortcut .csv(); replaces whatever is already at output_path
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)

In [0]:
# option 2: generic format(...).save(...); "append" adds to the existing output
(
    df.write
    .format("csv")
    .mode("append")
    .option("header", True)
    .save(output_path)
)

In [0]:

# change the delimiter
# option 2 again, this time writing "|" as the field separator
(
    df.write
    .format("csv")
    .mode("overwrite")
    .options(header=True, sep="|")
    .save(output_path)
)

In [0]:
# Work with JSON
# Source data for the JSON examples: the CSV file, read header-aware and
# without a schema.
path = (
    "/Volumes/population_metrics/landing/datasets/countries_dataset"
    "/csv_data/countries_population/countries_population.csv"
)

df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load(path)
)

In [0]:
# Shortcut form with the same effect:
# df.write.mode("overwrite").json("/Volumes/population_metrics/landing/datasets/output_dataset/json/countries_population")

# Generic form: name the format, then save to the target directory.
(
    df.write
    .format("json")
    .mode("overwrite")
    .save("/Volumes/population_metrics/landing/datasets/output_dataset/json/countries_population")
)

In [0]:
# StructField -> defines a single column in a spark schema
# specifies the name of the field, data type of the field, nullable flag, metadata
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

# NOTE(review): file-based readers may relax nullable=False to nullable on
# load -- confirm with df.printSchema() before relying on it.
schema = StructType( # StructType object
    [
        StructField("country_id", IntegerType(), False), # each StructField defines a column; False = cannot accept null values
        StructField("name", StringType(), True),
        StructField("nationality", StringType(), True),
        StructField("country_code", StringType(), True),
        StructField("iso_alpha2", StringType(), True),
        StructField("capital", StringType(), True),
        StructField("population", IntegerType(), True),
        StructField("area_km2", IntegerType(), True),
        StructField("region_id", IntegerType(), True),
        StructField("sub_region_id", IntegerType(), True)
    ]
)

# Re-read the CSV with typed columns so later writes carry the declared types.
df = spark.read.format("csv").schema(schema).options(header=True).load(path)

In [0]:
# Shortcut form with the same effect:
# df.write.mode("overwrite").json("/Volumes/population_metrics/landing/datasets/output_dataset/json/countries_population")

# Overwrite the JSON output, now from the DataFrame read with an explicit schema.
(
    df.write
    .mode("overwrite")
    .format("json")
    .save("/Volumes/population_metrics/landing/datasets/output_dataset/json/countries_population")
)

In [0]:
# Read JSON data
# When no schema is supplied, the JSON reader tries to infer the data types.

# Option 1 (shortcut):
# spark.read.json("/Volumes/population_metrics/landing/datasets/output_dataset/json/countries_population").display()

# Option 2 (generic format + load):
(
    spark.read
    .format("json")
    .load("/Volumes/population_metrics/landing/datasets/output_dataset/json/countries_population")
    .display()
)


In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# Directory holding the JSON files written earlier in the notebook.
path = "/Volumes/population_metrics/landing/datasets/output_dataset/json/countries_population"

# Build the schema column by column; the third argument to add() is the
# nullable flag (False = declared as not accepting nulls).
schema = (
    StructType()
    .add("country_id", IntegerType(), False)
    .add("name", StringType(), True)
    .add("nationality", StringType(), True)
    .add("country_code", StringType(), True)
    .add("iso_alpha2", StringType(), True)
    .add("capital", StringType(), True)
    .add("population", IntegerType(), True)
    .add("area_km2", IntegerType(), True)
    .add("region_id", IntegerType(), True)
    .add("sub_region_id", IntegerType(), True)
)

# Read the JSON with the explicit schema instead of relying on type inference.
df = (
    spark.read
    .format("json")
    .schema(schema)
    .load(path)
)
df.display()

In [0]:
# Work with ORC - Optimized Row Columnar
# Free, open-source, column-oriented data storage format designed for
# high-performance, large-scale data processing.

# The schema does not need to be provided when reading ORC; it is stored in the ORC metadata.

from pyspark.sql.types import StructField, StructType, StringType, IntegerType

path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

# One StructField per column; nullable is left at its default (True) except
# for country_id, which is declared as not accepting nulls.
schema = StructType(
    [
        StructField("country_id", IntegerType(), False),
        StructField("name", StringType()),
        StructField("nationality", StringType()),
        StructField("country_code", StringType()),
        StructField("iso_alpha2", StringType()),
        StructField("capital", StringType()),
        StructField("population", IntegerType()),
        StructField("area_km2", IntegerType()),
        StructField("region_id", IntegerType()),
        StructField("sub_region_id", IntegerType()),
    ]
)

# Typed, header-aware read of the source CSV that the ORC write will use.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load(path)
)

In [0]:
# Write ORC
# Option 1 (shortcut):
# df.write.mode("overwrite").orc("/Volumes/population_metrics/landing/datasets/output_dataset/orc/countries_population")

# Option 2 (generic format + save):
(
    df.write
    .format("orc")
    .mode("overwrite")
    .save("/Volumes/population_metrics/landing/datasets/output_dataset/orc/countries_population")
)

In [0]:
# Read the ORC output back with the generic loader; no schema is passed.
(
    spark.read
    .format("orc")
    .load("/Volumes/population_metrics/landing/datasets/output_dataset/orc/countries_population")
    .display()
)

In [0]:
# equivalent: format-specific shortcut instead of format("orc").load(...)
(
    spark.read
    .orc("/Volumes/population_metrics/landing/datasets/output_dataset/orc/countries_population")
    .display()
)

In [0]:
# Work with Parquet
# Parquet is an open-source columnar storage format that writes data in an
# efficient, compressed binary layout optimized for analytical workloads.
# The schema is embedded in the file metadata, so it does not need to be
# supplied in code when reading Parquet files.

from pyspark.sql.types import StructField, StructType, StringType, IntegerType

path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

# Build the schema column by column; the third argument to add() is the
# nullable flag (False = declared as not accepting nulls).
schema = (
    StructType()
    .add("country_id", IntegerType(), False)
    .add("name", StringType(), True)
    .add("nationality", StringType(), True)
    .add("country_code", StringType(), True)
    .add("iso_alpha2", StringType(), True)
    .add("capital", StringType(), True)
    .add("population", IntegerType(), True)
    .add("area_km2", IntegerType(), True)
    .add("region_id", IntegerType(), True)
    .add("sub_region_id", IntegerType(), True)
)

# Typed, header-aware read of the source CSV that the Parquet write will use.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .options(header=True)
    .load(path)
)

In [0]:
# Write Parquet
# Option 1 (shortcut):
# df.write.mode("overwrite").parquet("/Volumes/population_metrics/landing/datasets/output_dataset/parquet/countries_population")

# Option 2 (generic format + save):
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save("/Volumes/population_metrics/landing/datasets/output_dataset/parquet/countries_population")
)

In [0]:
# read parquet data with the generic loader; no schema is passed
(
    spark.read
    .format("parquet")
    .load("/Volumes/population_metrics/landing/datasets/output_dataset/parquet/countries_population")
    .display()
)

In [0]:
# equivalent: format-specific shortcut instead of format("parquet").load(...)
(
    spark.read
    .parquet("/Volumes/population_metrics/landing/datasets/output_dataset/parquet/countries_population")
    .display()
)

In [0]:
# Delta Lake format
# Delta Lake builds on the Parquet file format and adds features such as ACID
# transactions, scalable metadata handling, and schema enforcement to data lakes.
# It enables reliable pipelines with time travel, upserts, and efficient
# streaming and batch workloads.


from pyspark.sql.types import StructField, StructType, StringType, IntegerType

path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

# One StructField per column; nullable is left at its default (True) except
# for country_id, which is declared as not accepting nulls.
schema = StructType(
    [
        StructField("country_id", IntegerType(), False),
        StructField("name", StringType()),
        StructField("nationality", StringType()),
        StructField("country_code", StringType()),
        StructField("iso_alpha2", StringType()),
        StructField("capital", StringType()),
        StructField("population", IntegerType()),
        StructField("area_km2", IntegerType()),
        StructField("region_id", IntegerType()),
        StructField("sub_region_id", IntegerType()),
    ]
)

# Typed, header-aware read of the source CSV that the Delta write will use.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load(path)
)

In [0]:
# Write the DataFrame in Delta format, replacing the current contents of the target.
(
    df.write
    .mode("overwrite")
    .format("delta")
    .save("/Volumes/population_metrics/landing/datasets/output_dataset/delta_lake/countries_population")
)


In [0]:
# Databricks Time Travel, a key feature of Delta Lake, lets you query historical
# versions of your data by accessing past table states using specific timestamps
# or version numbers. This enables data recovery, auditing, reproducing analyses,
# fixing errors, and consistent snapshots for complex temporal queries.
#
# No version or timestamp option is set below, so this reads the current state
# of the table. NOTE(review): a time-travel read would add a reader option such
# as `versionAsOf` or `timestampAsOf` -- confirm against the Delta Lake docs.
(
    spark.read
    .format("delta")
    .load("/Volumes/population_metrics/landing/datasets/output_dataset/delta_lake/countries_population/")
    .display()
)

In [0]:
# Rendering a DataFrame with methods other than display()

from pyspark.sql.types import StructField, StructType, StringType, IntegerType

path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

# Build the schema column by column; the third argument to add() is the
# nullable flag (False = declared as not accepting nulls).
schema = (
    StructType()
    .add("country_id", IntegerType(), False)
    .add("name", StringType(), True)
    .add("nationality", StringType(), True)
    .add("country_code", StringType(), True)
    .add("iso_alpha2", StringType(), True)
    .add("capital", StringType(), True)
    .add("population", IntegerType(), True)
    .add("area_km2", IntegerType(), True)
    .add("region_id", IntegerType(), True)
    .add("sub_region_id", IntegerType(), True)
)

# Typed, header-aware read used by the rendering examples that follow.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .options(header=True)
    .load(path)
)


In [0]:
# Plain-text rendering: print up to 200 rows to the cell output.
df.show(n=200)

In [0]:
# Databricks rendering of the same DataFrame, for comparison with show().
df.display()

Databricks visualization. Run in Databricks to view.

In [0]:
# Data Partition
# This cell has no import of its own: StructType, StructField, IntegerType and
# StringType must already have been imported by an earlier cell.

path = "/Volumes/population_metrics/landing/datasets/countries_dataset/csv_data/countries_population/countries_population.csv"

# One StructField per column; nullable is left at its default (True) except
# for country_id, which is declared as not accepting nulls.
schema = StructType(
    [
        StructField("country_id", IntegerType(), False),
        StructField("name", StringType()),
        StructField("nationality", StringType()),
        StructField("country_code", StringType()),
        StructField("iso_alpha2", StringType()),
        StructField("capital", StringType()),
        StructField("population", IntegerType()),
        StructField("area_km2", IntegerType()),
        StructField("region_id", IntegerType()),
        StructField("sub_region_id", IntegerType()),
    ]
)

# Typed, header-aware read of the data that will be written out partitioned.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load(path)
)
df.display()

In [0]:
# Write the data as CSV, partitioned by region_id and then sub_region_id.
# No header option is set here, unlike the earlier CSV writes.
(
    df.write
    .format("csv")
    .partitionBy("region_id", "sub_region_id")
    .mode("overwrite")
    .save("/Volumes/population_metrics/landing/datasets/output_dataset/csv/countries_population_partitioned")
)

In [0]:
# Show the built-in help for the top-level dbutils object.
dbutils.help()

In [0]:
# Show the built-in help for the dbutils.fs file-system utilities.
dbutils.fs.help()

In [0]:
# Show the help text for the copy command used in the next cells.
help(dbutils.fs.cp)

In [0]:
# copy a file in another place
# The part-file name contains a per-write id, so it is looked up at run time
# instead of hard-coding a name that only existed for one particular run.
csv_output_dir = "/Volumes/population_metrics/landing/datasets/output_dataset/csv/countries_population/"
copy_target_dir = "/Volumes/population_metrics/landing/datasets/output_dataset"

# Keep only the CSV data files; sort so the choice is deterministic.
csv_part_files = sorted(
    (
        entry
        for entry in dbutils.fs.ls(csv_output_dir)
        if entry.name.startswith("part-") and entry.name.endswith(".csv")
    ),
    key=lambda entry: entry.name,
)
if not csv_part_files:
    raise FileNotFoundError(f"No part-*.csv file found in {csv_output_dir}")

# Copy the first part file into the target directory under its own name.
dbutils.fs.cp(
    csv_part_files[0].path,
    f"{copy_target_dir}/{csv_part_files[0].name}",
)

In [0]:
# copy a folder in another place
# recurse=True copies the directory together with everything inside it.
dbutils.fs.cp(
    "/Volumes/population_metrics/landing/datasets/output_dataset/csv/countries_population/",
    "/Volumes/population_metrics/landing/datasets/output_dataset/countries_population_copy",
    recurse=True,
)

In [0]:
# Remove file or directory
# Show the help text for the remove command used in the next cells.
help(dbutils.fs.rm)

In [0]:
# Remove the CSV part file(s) sitting directly in the output_dataset directory.
# The part-file name contains a per-write id, so the files are found by pattern
# instead of hard-coding a name that only existed for one particular run.
copied_files_dir = "/Volumes/population_metrics/landing/datasets/output_dataset"

for copied_entry in dbutils.fs.ls(copied_files_dir):
    # Sub-directories such as csv/ or json/ do not match this pattern.
    if copied_entry.name.startswith("part-") and copied_entry.name.endswith(".csv"):
        dbutils.fs.rm(copied_entry.path)

In [0]:
# Remove the copied folder; recurse=True deletes the directory and its contents.
dbutils.fs.rm(
    "/Volumes/population_metrics/landing/datasets/output_dataset/countries_population_copy/",
    recurse=True,
)

In [0]:
# Show the help text for the list command used in the next cells.
help(dbutils.fs.ls)

In [0]:
# list directories
# The bare call shows the raw list returned by ls for the csv output directory.
dbutils.fs.ls("/Volumes/population_metrics/landing/datasets/output_dataset/csv")

In [0]:
# list directories nicer
# Passing the listing to display() renders it as a table instead of a raw list.
display(dbutils.fs.ls("dbfs:/Volumes/population_metrics/landing/datasets/output_dataset/csv/countries_population/"))

In [0]:
# preview a file
# The part-file name contains a per-write id, so it is looked up at run time
# instead of hard-coding a name that only existed for one particular run.
preview_dir = "dbfs:/Volumes/population_metrics/landing/datasets/output_dataset/csv/countries_population/"

# Keep only the CSV data files; sort so the choice is deterministic.
preview_candidates = sorted(
    entry.path
    for entry in dbutils.fs.ls(preview_dir)
    if entry.name.startswith("part-") and entry.name.endswith(".csv")
)
if not preview_candidates:
    raise FileNotFoundError(f"No part-*.csv file found in {preview_dir}")

# Last expression of the cell, so the preview text is shown as the cell output.
dbutils.fs.head(preview_candidates[0])