In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, DecimalType

In [2]:
#SPARK CONFIG
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

#Warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/demo/silver'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
spark.sql(
    """
    DROP DATABASE IF EXISTS FootballStadiums CASCADE
    """
)

DataFrame[]

In [5]:
#CREATE DATABASE CALLED FOOTBALLSTADIUMS
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS FootballStadiums LOCATION 'hdfs://hdfs-nn:9000/demo/silver/FootballStadiums.db/'
    """
)

#DROP TABLE
spark.sql(
    """
    DROP TABLE IF EXISTS FootballStadiums.deltalake_table
    """
)

#CREATE DELTALAKE TABLE CALLED FOOTBALLSTADIUMS
spark.sql(
    """
    CREATE EXTERNAL TABLE FootballStadiums.deltalake_table (
        Stadium STRING,
        City STRING,
        Hometeams STRING,
        Capacity INT,
        Country STRING,
        Population INT
    )
    USING DELTA
    PARTITIONED BY (
         Confederation STRING
    )
    LOCATION 'hdfs://hdfs-nn:9000/demo/silver/FootballStadiums.db/deltalake_table/'
    """
)

DataFrame[]

In [6]:
from pyspark.sql.functions import col

#READ THE .CSV FILE IN HDFS AND PUT IT IN A DATAFRAME
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/FootballStadiums.csv"

#DEFINE THE SCHEMA OF THE DATAFRAME
customSchema = StructType([
    StructField("Confederation", StringType(), True),        
    StructField("Stadium", StringType(), True),
    StructField("City", StringType(), True),
    StructField("Hometeams", StringType(), True),
    StructField("Capacity", IntegerType(), True),
    StructField("Country", StringType(), True),
    StructField("Ioc", StringType(), True),
    StructField("Population", IntegerType(), True)
])

st_bd = spark \
    .read \
    .option("delimiter",",") \
    .option("header","true") \
    .schema(customSchema) \
    .csv(hdfs_path) \

st_bd.show()
st_bd.printSchema()

+-------------+--------------------+-------------+--------------------+--------+---------+---+----------+
|Confederation|             Stadium|         City|           Hometeams|Capacity|  Country|Ioc|Population|
+-------------+--------------------+-------------+--------------------+--------+---------+---+----------+
|         UEFA| Stadiumi Besëlidhja|        Lezhë|          Besëlidhja|    7000|  Albania|ALB|   2876591|
|         UEFA| Stadiumi Flamurtari|        Vlorë|    Flamurtari Vlorë|    8200|  Albania|ALB|   2876591|
|         UEFA|       Stadiumi Laçi|         Laçi|             KF Laçi|    5000|  Albania|ALB|   2876591|
|         UEFA|Stadiumi Niko Dovana|       Durrës|               Teuta|   12040|  Albania|ALB|   2876591|
|         UEFA|Stadiumi Selman S...|       Tirana|KF Tirana, Dinamo...|    9500|  Albania|ALB|   2876591|
|         UEFA| Stadiumi Skënderbeu|        Korçë|          Skënderbeu|    7500|  Albania|ALB|   2876591|
|          CAF|Stade 1er Novembr...|    Tizi O

In [7]:
#WRITE THE DATAFRAME TO HIVE DELTALAKE TABLE
st_bd \
    .select("Stadium","City","Hometeams","Capacity","Country","Population","Confederation") \
    .write \
    .mode("overwrite") \
    .partitionBy("Confederation") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/demo/silver/FootballStadiums.db/deltalake_table/")

In [8]:
spark.sql(
    """
    SELECT * FROM FootballStadiums.deltalake_table
    """
).show()

+--------------------+-----------+--------------------+--------+-------+----------+-------------+
|             Stadium|       City|           Hometeams|Capacity|Country|Population|Confederation|
+--------------------+-----------+--------------------+--------+-------+----------+-------------+
| Stadiumi Besëlidhja|      Lezhë|          Besëlidhja|    7000|Albania|   2876591|         UEFA|
| Stadiumi Flamurtari|      Vlorë|    Flamurtari Vlorë|    8200|Albania|   2876591|         UEFA|
|       Stadiumi Laçi|       Laçi|             KF Laçi|    5000|Albania|   2876591|         UEFA|
|Stadiumi Niko Dovana|     Durrës|               Teuta|   12040|Albania|   2876591|         UEFA|
|Stadiumi Selman S...|     Tirana|KF Tirana, Dinamo...|    9500|Albania|   2876591|         UEFA|
| Stadiumi Skënderbeu|      Korçë|          Skënderbeu|    7500|Albania|   2876591|         UEFA|
|   Alashkert Stadion|    Yerevan|           Alashkert|    6850|Armenia|   2924816|         UEFA|
|     Banants Stadio

In [9]:
#TRANSFORMATIONS

In [10]:
#CHANGE THE VALUE SO IT'S EASIER TO ESTABLISH A RELATION TO BALLONDOR DELTALAKE TABLE
spark.sql(
    """
    UPDATE FootballStadiums.deltalake_table
    SET Country = REPLACE(Country, 'Trinidad and Tobago', 'Trinidad/Tobago')
    """
)

DataFrame[]

In [11]:
#CHANGE THE VALUE SO IT'S EASIER TO ESTABLISH A RELATION TO BALLONDOR DELTALAKE TABLE
spark.sql(
    """
    UPDATE FootballStadiums.deltalake_table
    SET Country = REPLACE(Country, 'Bosnia-Herzegovina', 'Bosnia & Herzegovina')
    """
)

DataFrame[]

In [12]:
#DATA QUALITY TREATMENT

In [13]:
#LOAD THE DATAFRAME
st_bd = spark.read.format("delta").load("hdfs://hdfs-nn:9000/demo/silver/FootballStadiums.db/deltalake_table")

In [14]:
#VERIFY IF EXISTS ANY DUPLICATED VALUES
import pyspark.sql.functions as f
st_bd.join(
    st_bd.groupBy(st_bd.columns).agg((f.count("*")>1).cast("int").alias("Duplicate_indicator")),
    on=st_bd.columns,
    how="inner"
).show()

+--------------------+----------------+--------------------+--------+--------------------+----------+-------------+-------------------+
|             Stadium|            City|           Hometeams|Capacity|             Country|Population|Confederation|Duplicate_indicator|
+--------------------+----------------+--------------------+--------+--------------------+----------+-------------+-------------------+
|Stadion Victorii ...|      Skalbmierz|            Victoria|     400|              Poland|  38433600|         UEFA|                  0|
|    Rosvalla Stadion|        Nyköping|       Nyköpings BIS|    6000|              Sweden|  10215250|         UEFA|                  0|
|      Ülker Stadyumu|        Istanbul|          Fenerbahçe|   50509|              Turkey|  80810525|         UEFA|                  0|
|      Bao’an Stadium|        Shenzhen|                   -|   40050|               China|1403500365|          AFC|                  0|
|  Yuexiushan Stadium|       Guangzhou|       Gu

In [15]:
#COUNT THE NUMBER OF LINES THAT ARE ON THE DELTALAKE TABLE
spark.sql(
    """
    Select COUNT(*) as Number_of_lines FROM FootballStadiums.deltalake_table
    """
).show()

+---------------+
|Number_of_lines|
+---------------+
|           2024|
+---------------+



In [16]:
#Identify distinct patterns 
#Column Stadium
st_bd.select('Stadium').distinct().collect()

[Row(Stadium='Paralimni'),
 Row(Stadium='Vonovia Ruhrstadion'),
 Row(Stadium='Tolka Park'),
 Row(Stadium='Stadion OSiR Skałka'),
 Row(Stadium='Estadio Nuevo Los Cármenes'),
 Row(Stadium='Stadion Illichivets'),
 Row(Stadium='Racecourse Ground'),
 Row(Stadium='Ernst-Happel-Stadion'),
 Row(Stadium='County Ground'),
 Row(Stadium='Vicarage Road'),
 Row(Stadium='Skansi Arena'),
 Row(Stadium='Nagyerdei Stadion'),
 Row(Stadium='Sportpark De Westmaat (Rode)'),
 Row(Stadium='Stadion GOSiR w Gdyni'),
 Row(Stadium='Stadion OSiR Czarnków'),
 Row(Stadium='Stadionul Ladislau Bölöni'),
 Row(Stadium='Ullevi Stadion'),
 Row(Stadium='Shenzhen Bay Sports Center'),
 Row(Stadium='AT&T Stadium'),
 Row(Stadium='University of Richmond Stadium'),
 Row(Stadium='Andrův Stadion'),
 Row(Stadium='Gípedo Neápolis'),
 Row(Stadium='Stadion ŁKS-u'),
 Row(Stadium='Stadion Stali Poniatowa'),
 Row(Stadium='Estádio Municipal de Albufeira'),
 Row(Stadium='Stadionul Naţional'),
 Row(Stadium='Stadion Centralnyj'),
 Row(Stadium

In [17]:
#Identify of null values
st_bd.filter(st_bd.Stadium.isNull()).count()

0

In [18]:
#Identify distinct patterns 
#Column City
st_bd.select('City').distinct().collect()

[Row(City='Antwerp'),
 Row(City='Paralimni'),
 Row(City='Hanover'),
 Row(City='Magdeburg'),
 Row(City='Thessaloniki'),
 Row(City='Palermo'),
 Row(City='Olecko'),
 Row(City='Volgograd'),
 Row(City='Geneve'),
 Row(City='Bangalore'),
 Row(City='Morelia'),
 Row(City='Linz'),
 Row(City='Frankfurt nad Menem'),
 Row(City='Jahra'),
 Row(City='Oaxaca de Juárez'),
 Row(City='Tempe'),
 Row(City='Temuco'),
 Row(City='Kaiserslautern'),
 Row(City='Málaga'),
 Row(City='Badajoz'),
 Row(City='Lund'),
 Row(City='Auburn'),
 Row(City='Harrison'),
 Row(City='Cairo'),
 Row(City='Casablanca'),
 Row(City='Abuja'),
 Row(City='Wellington'),
 Row(City='Kirkcaldy'),
 Row(City='Gornji Milanovac'),
 Row(City='Suining'),
 Row(City='Kochi'),
 Row(City='Winnipeg'),
 Row(City='Orašje'),
 Row(City='Peiraiefs'),
 Row(City='Esch-sur-Alzette'),
 Row(City='Konin'),
 Row(City='Târgu-Jiu'),
 Row(City='Barakaldo'),
 Row(City='East Rutherford'),
 Row(City='Gaborone'),
 Row(City='Tamale'),
 Row(City='Agadir'),
 Row(City='Salzbur

In [19]:
#Identify of null values
st_bd.filter(st_bd.City.isNull()).count()

0

In [20]:
#Identify distinct patterns 
#Column Hometeams
st_bd.select('Hometeams').distinct().collect()

[Row(Hometeams='Torpedo-BelAZ'),
 Row(Hometeams='Magdeburg'),
 Row(Hometeams='Twente'),
 Row(Hometeams='Azalea'),
 Row(Hometeams='Roosters, Waratahs, Sydney FC'),
 Row(Hometeams='Bangkok United'),
 Row(Hometeams='Puntarenas FC'),
 Row(Hometeams='Black Knights'),
 Row(Hometeams='América FC, ABC FC'),
 Row(Hometeams='Coritiba FC'),
 Row(Hometeams='Atlético Malabo, Atlético Semu, Deportivo Unidad,'),
 Row(Hometeams='Yeovil Town'),
 Row(Hometeams='Salford City'),
 Row(Hometeams='FC Erzgebirge'),
 Row(Hometeams='Levadeiakós'),
 Row(Hometeams='Mieszko'),
 Row(Hometeams='Resovia'),
 Row(Hometeams='Astra'),
 Row(Hometeams='Edinburgh City'),
 Row(Hometeams='Västerås SK'),
 Row(Hometeams='Shonan Bellmare'),
 Row(Hometeams='Tennessee Titans'),
 Row(Hometeams='Mouloudia'),
 Row(Hometeams='APOEL, Omonia, Olympiakos'),
 Row(Hometeams='FC Chiatura'),
 Row(Hometeams='Kaiserslautern'),
 Row(Hometeams='TSV 1860'),
 Row(Hometeams='Cartusia'),
 Row(Hometeams='FK Krasnodar'),
 Row(Hometeams='Espanyol'),
 R

In [21]:
#Identify of null values
st_bd.filter(st_bd.Hometeams.isNull()).count()

0

In [22]:
#Identify distinct patterns 
#Column Capacity
st_bd.select('Capacity').distinct().collect()

[Row(Capacity=18944),
 Row(Capacity=9900),
 Row(Capacity=18800),
 Row(Capacity=30970),
 Row(Capacity=22346),
 Row(Capacity=32539),
 Row(Capacity=23336),
 Row(Capacity=13261),
 Row(Capacity=39460),
 Row(Capacity=11500),
 Row(Capacity=14465),
 Row(Capacity=16500),
 Row(Capacity=47300),
 Row(Capacity=858),
 Row(Capacity=19200),
 Row(Capacity=1025),
 Row(Capacity=540),
 Row(Capacity=1483),
 Row(Capacity=12006),
 Row(Capacity=15100),
 Row(Capacity=20396),
 Row(Capacity=106572),
 Row(Capacity=18467),
 Row(Capacity=19325),
 Row(Capacity=20268),
 Row(Capacity=29022),
 Row(Capacity=3000),
 Row(Capacity=18221),
 Row(Capacity=20247),
 Row(Capacity=37593),
 Row(Capacity=7066),
 Row(Capacity=2025),
 Row(Capacity=3220),
 Row(Capacity=114000),
 Row(Capacity=4000),
 Row(Capacity=6500),
 Row(Capacity=11352),
 Row(Capacity=14944),
 Row(Capacity=18614),
 Row(Capacity=7152),
 Row(Capacity=1977),
 Row(Capacity=8211),
 Row(Capacity=24890),
 Row(Capacity=76092),
 Row(Capacity=25520),
 Row(Capacity=60540),
 R

In [23]:
#Identify of null values
st_bd.filter(st_bd.Capacity.isNull()).count()

0

In [24]:
#Identify distinct patterns 
#Column Country
st_bd.select('Country').distinct().collect()

[Row(Country='Russia'),
 Row(Country='Paraguay'),
 Row(Country='Senegal'),
 Row(Country='Sweden'),
 Row(Country='Republic of South Africa'),
 Row(Country='Burma'),
 Row(Country='Eritrea'),
 Row(Country='Malaysia'),
 Row(Country='Singapore'),
 Row(Country='Turkey'),
 Row(Country='Malawi'),
 Row(Country='Iraq'),
 Row(Country='Germany'),
 Row(Country='Cambodia'),
 Row(Country='Jordan'),
 Row(Country='Ivory Coast'),
 Row(Country='Rwanda'),
 Row(Country='New Zeland'),
 Row(Country='France'),
 Row(Country='Greece'),
 Row(Country='Taiwan'),
 Row(Country='Butan'),
 Row(Country='Algeria'),
 Row(Country='Equatorial Guinea'),
 Row(Country='Slovakia'),
 Row(Country='Wales'),
 Row(Country='Argentina'),
 Row(Country='Belgium'),
 Row(Country='Angola'),
 Row(Country='Qatar'),
 Row(Country='Ecuador'),
 Row(Country='Lesotho'),
 Row(Country='Albania'),
 Row(Country='Finland'),
 Row(Country='Ghana'),
 Row(Country='Peru'),
 Row(Country='China'),
 Row(Country='India'),
 Row(Country='Belarus'),
 Row(Country=

In [25]:
#Identify of null values
st_bd.filter(st_bd.Country.isNull()).count()

0

In [26]:
#Identify distinct patterns 
#Column Population
st_bd.select('Population').distinct().collect()

[Row(Population=10768477),
 Row(Population=3444006),
 Row(Population=42418235),
 Row(Population=37202572),
 Row(Population=32162184),
 Row(Population=162951560),
 Row(Population=1170125),
 Row(Population=51095),
 Row(Population=5323933),
 Row(Population=68863514),
 Row(Population=4154200),
 Row(Population=417200),
 Row(Population=8857960),
 Row(Population=10171480),
 Row(Population=11420163),
 Row(Population=55619400),
 Row(Population=32979000),
 Row(Population=1324171354),
 Row(Population=4857000),
 Row(Population=5424800),
 Row(Population=57725600),
 Row(Population=7001444),
 Row(Population=23439189),
 Row(Population=210147125),
 Row(Population=5662544),
 Row(Population=10610947),
 Row(Population=10524117),
 Row(Population=17100715),
 Row(Population=5638700),
 Row(Population=16385068),
 Row(Population=7050034),
 Row(Population=4052584),
 Row(Population=11262564),
 Row(Population=17263239),
 Row(Population=81672300),
 Row(Population=55572201),
 Row(Population=17574003),
 Row(Populatio

In [27]:
#Identify of null values
st_bd.filter(st_bd.Population.isNull()).count()

0

In [28]:
#Identify distinct patterns 
#Column Confederation
st_bd.select('Confederation').distinct().collect()

[Row(Confederation='CONCACAF'),
 Row(Confederation='OFC'),
 Row(Confederation='AFC'),
 Row(Confederation='CAF'),
 Row(Confederation='CONMEBOL'),
 Row(Confederation='UEFA')]

In [29]:
#Identify of null values
st_bd.filter(st_bd.Confederation.isNull()).count()

0

In [30]:
st_bd \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/demo/silver/FootballStadiums.db/deltalake_table/")