In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import substring, avg, sum
from pyspark.sql.functions import col, concat_ws, concat, lit, desc, asc
import pyspark.sql.functions as f

In [2]:
# Configure Spark: build a Hive-enabled session with the Delta Lake SQL
# extension and catalog registered.
warehouse_location = 'hdfs://hdfs-nn:9000/demo/'

# A parenthesised chain replaces the backslash continuations. The original
# ended the last call with a trailing "\" followed by a blank line, so any
# statement added on that line would have been merged into this expression.
builder = (
    SparkSession.builder
    .appName("Python Spark")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (below) is documented to add
    # the delta-core package itself; this explicit pin may be redundant or
    # overridden -- confirm against the installed delta-spark version.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original read "spark = spark = ...").
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Read the silver-layer stadiums data as a Delta table, addressed by its
# HDFS path rather than by a metastore table name.
stadiums = spark.read.load(
    "hdfs://hdfs-nn:9000/demo/silver/FootballStadiums.db/deltalake_table/",
    format="delta",
)

In [4]:
# Preview the silver table by its metastore name: up to 20 rows, with long
# cell values truncated (these are the defaults of show(), spelled out).
spark.read.table("FootballStadiums.deltalake_table").show(n=20, truncate=True)

+--------------------+-----------+--------------------+--------+-------+----------+-------------+
|             Stadium|       City|           Hometeams|Capacity|Country|Population|Confederation|
+--------------------+-----------+--------------------+--------+-------+----------+-------------+
| Stadiumi Besëlidhja|      Lezhë|          Besëlidhja|    7000|Albania|   2876591|         UEFA|
| Stadiumi Flamurtari|      Vlorë|    Flamurtari Vlorë|    8200|Albania|   2876591|         UEFA|
|       Stadiumi Laçi|       Laçi|             KF Laçi|    5000|Albania|   2876591|         UEFA|
|Stadiumi Niko Dovana|     Durrës|               Teuta|   12040|Albania|   2876591|         UEFA|
|Stadiumi Selman S...|     Tirana|KF Tirana, Dinamo...|    9500|Albania|   2876591|         UEFA|
| Stadiumi Skënderbeu|      Korçë|          Skënderbeu|    7500|Albania|   2876591|         UEFA|
|   Alashkert Stadion|    Yerevan|           Alashkert|    6850|Armenia|   2924816|         UEFA|
|     Banants Stadio

In [5]:
# DDL for the gold layer, kept as named statements so that the execution
# order below reads as a short script.

# Gold database, stored under the gold/ area of the warehouse.
create_gold_db_sql = """
    CREATE DATABASE IF NOT EXISTS UC3_gl LOCATION 'hdfs://hdfs-nn:9000/demo/gold/UC3_gl.db/'
    """

# Remove any previous table definition so the CREATE below starts clean.
drop_gold_stadiums_sql = """
    DROP TABLE IF EXISTS UC3_gl.stadiums
    """

# Delta table definition at an explicit HDFS location; Confederation is the
# leading column.
create_gold_stadiums_sql = """
    CREATE EXTERNAL TABLE UC3_gl.stadiums (
        Confederation STRING,
        Stadium STRING,
        City STRING,
        Hometeams STRING,
        Capacity INT,
        Country STRING,
        Population INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/demo/gold/UC3_gl.db/stadiums/'
    """

# Order matters: database first, then drop, then create.
spark.sql(create_gold_db_sql)
spark.sql(drop_gold_stadiums_sql)
# Left as the last expression so the notebook displays its (empty) result.
spark.sql(create_gold_stadiums_sql)

DataFrame[]

In [6]:
# Select the stadium columns in the column order declared for
# UC3_gl.stadiums (Confederation first). This is a plain projection of the
# silver DataFrame; no join takes place.
flat_stadiums_df = stadiums.select(
    "Confederation",
    "Stadium",
    "City",
    "Hometeams",
    "Capacity",
    "Country",
    "Population",
)

# Write to the Delta table: overwrite the data at the same HDFS path that
# UC3_gl.stadiums was created with, so the metastore table sees the new rows.
(
    flat_stadiums_df.write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/demo/gold/UC3_gl.db/stadiums/")
)

# Preview of what was written (taken from the in-memory DataFrame).
flat_stadiums_df.show()

# Check results: row count as read back through the metastore table.
spark.table("UC3_gl.stadiums").count()

+-------------+--------------------+-----------+--------------------+--------+-------+----------+
|Confederation|             Stadium|       City|           Hometeams|Capacity|Country|Population|
+-------------+--------------------+-----------+--------------------+--------+-------+----------+
|         UEFA| Stadiumi Besëlidhja|      Lezhë|          Besëlidhja|    7000|Albania|   2876591|
|         UEFA| Stadiumi Flamurtari|      Vlorë|    Flamurtari Vlorë|    8200|Albania|   2876591|
|         UEFA|       Stadiumi Laçi|       Laçi|             KF Laçi|    5000|Albania|   2876591|
|         UEFA|Stadiumi Niko Dovana|     Durrës|               Teuta|   12040|Albania|   2876591|
|         UEFA|Stadiumi Selman S...|     Tirana|KF Tirana, Dinamo...|    9500|Albania|   2876591|
|         UEFA| Stadiumi Skënderbeu|      Korçë|          Skënderbeu|    7500|Albania|   2876591|
|         UEFA|   Alashkert Stadion|    Yerevan|           Alashkert|    6850|Armenia|   2924816|
|         UEFA|     

2024

In [9]:
# Preview the gold table by its metastore name: up to 20 rows, with long
# cell values truncated (these are the defaults of show(), spelled out).
spark.read.table("UC3_gl.stadiums").show(n=20, truncate=True)

+-------------+--------------------+-----------+--------------------+--------+-------+----------+
|Confederation|             Stadium|       City|           Hometeams|Capacity|Country|Population|
+-------------+--------------------+-----------+--------------------+--------+-------+----------+
|         UEFA| Stadiumi Besëlidhja|      Lezhë|          Besëlidhja|    7000|Albania|   2876591|
|         UEFA| Stadiumi Flamurtari|      Vlorë|    Flamurtari Vlorë|    8200|Albania|   2876591|
|         UEFA|       Stadiumi Laçi|       Laçi|             KF Laçi|    5000|Albania|   2876591|
|         UEFA|Stadiumi Niko Dovana|     Durrës|               Teuta|   12040|Albania|   2876591|
|         UEFA|Stadiumi Selman S...|     Tirana|KF Tirana, Dinamo...|    9500|Albania|   2876591|
|         UEFA| Stadiumi Skënderbeu|      Korçë|          Skënderbeu|    7500|Albania|   2876591|
|         UEFA|   Alashkert Stadion|    Yerevan|           Alashkert|    6850|Armenia|   2924816|
|         UEFA|     

In [7]:
# Ask Delta to generate a symlink-format manifest for the gold stadiums
# table, addressed by its HDFS path.
generate_manifest_sql = """
    GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/demo/gold/UC3_gl.db/stadiums/`
    """

# show() on the result prints an empty frame.
spark.sql(generate_manifest_sql).show()

++
||
++
++



In [8]:
# Recreate the table that reads the gold stadiums data through the
# _symlink_format_manifest directory (presumably for Presto, judging by the
# table name -- confirm).
drop_presto_table_sql = """
DROP TABLE IF EXISTS UC3_gl.stadiums_presto
"""

# Same columns as UC3_gl.stadiums, with a Parquet SerDe and a symlink input
# format, located at the manifest directory rather than the data directory.
create_presto_table_sql = """
    CREATE EXTERNAL TABLE IF NOT EXISTS UC3_gl.stadiums_presto (
        Confederation STRING,
        Stadium STRING,
        City STRING,
        Hometeams STRING,
        Capacity INT,
        Country STRING,
        Population INT
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION 'hdfs://hdfs-nn:9000/demo/gold/UC3_gl.db/stadiums/_symlink_format_manifest/'
    """

# Drop first, then create; each statement's (empty) result is shown.
spark.sql(drop_presto_table_sql).show()
spark.sql(create_presto_table_sql).show()

++
||
++
++

++
||
++
++

