In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import substring, avg, sum
import pyspark.sql.functions as f

In [2]:
#CONFIGURAR O SPARK
warehouse_location = 'hdfs://hdfs-nn:9000/demo/'

builder = SparkSession \
    .builder \
    .appName("Python Spark") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
stadiums = spark.read.format("delta").load("hdfs://hdfs-nn:9000/demo/silver/FootballStadiums.db/deltalake_table/")
ballondor = spark.read.format("delta").load("hdfs://hdfs-nn:9000/demo/silver/BallonDOr.db/deltalake_table/")

In [4]:
spark.table("FootballStadiums.deltalake_table").show()

+--------------------+-----------+--------------------+--------+-------+----------+-------------+
|             Stadium|       City|           Hometeams|Capacity|Country|Population|Confederation|
+--------------------+-----------+--------------------+--------+-------+----------+-------------+
| Stadiumi Besëlidhja|      Lezhë|          Besëlidhja|    7000|Albania|   2876591|         UEFA|
| Stadiumi Flamurtari|      Vlorë|    Flamurtari Vlorë|    8200|Albania|   2876591|         UEFA|
|       Stadiumi Laçi|       Laçi|             KF Laçi|    5000|Albania|   2876591|         UEFA|
|Stadiumi Niko Dovana|     Durrës|               Teuta|   12040|Albania|   2876591|         UEFA|
|Stadiumi Selman S...|     Tirana|KF Tirana, Dinamo...|    9500|Albania|   2876591|         UEFA|
| Stadiumi Skënderbeu|      Korçë|          Skënderbeu|    7500|Albania|   2876591|         UEFA|
|   Alashkert Stadion|    Yerevan|           Alashkert|    6850|Armenia|   2924816|         UEFA|
|     Banants Stadio

In [5]:
spark.table("BallonDOr.deltalake_table").show()

+----+--------------------+-----------------+----------------+------+---+---+---+---+---+-----+-------+----+
|Rank|              Player|             Club|     Nationality|Points| P1| P2| P3| P4| P5|Votes|Rankpts|Year|
+----+--------------------+-----------------+----------------+------+---+---+---+---+---+-----+-------+----+
|   1|         Omar Sívori|         Juventus|           Italy|    46|  5|  3|  2|  1|  1|   12|   50.0|1961|
|   2|Luis Suárez Miram...|   Internazionale|           Spain|    40|  4|  3|  2|  1|  0|   10|   25.0|1961|
|   3|       Johnny Haynes|           Fulham|         England|    22|  2|  2|  1|  0|  1|    6|   16.7|1961|
|   4|          Lev Yashin|    Dynamo Moscow|          Russia|    21|  3|  0|  2|  0|  0|    5|   12.5|1961|
|   5|       Ferenc Puskás|   Real Madrid CF|         Hungary|    16|  1|  1|  1|  2|  0|    5|   10.0|1961|
|   6|  Alfredo Di Stéfano|   Real Madrid CF|           Spain|    13|  0|  1|  1|  2|  2|    6|    8.3|1961|
|   6|          Uwe

In [6]:
spark.sql(
    """
    DROP DATABASE IF EXISTS UC4_gl CASCADE
    """
)

DataFrame[]

In [7]:
#CREATE GOLD DATABASE
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS UC4_gl LOCATION 'hdfs://hdfs-nn:9000/demo/gold/UC4_gl.db/'
    """
)

#DROP TABLE
spark.sql(
    """
    DROP TABLE IF EXISTS UC4_gl.ballon
    """
)

#CREATE THE TABLE IN GOLD
spark.sql(
    """
    CREATE EXTERNAL TABLE UC4_gl.ballon (
        Year INT,
        Player STRING,
        Club STRING,
        Nationality STRING,
        Population INT,
        Confederation STRING,
        Points INT,
        Points_avg DOUBLE, 
        PointsForN1 INT,
        PointsForN1_avg DOUBLE,
        PointsForN2 INT,
        PointsForN2_avg DOUBLE,
        PointsForN3 INT,
        PointsForN3_avg DOUBLE,
        PointsForN4 INT,
        PointsForN4_avg DOUBLE,
        PointsForN5 INT,
        PointsForN5_avg DOUBLE,
        Votes INT,
        Votes_avg DOUBLE,
        RankPoints DOUBLE,
        RankPoints_avg DOUBLE
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/demo/gold/UC4_gl.db/ballon/'
    """
)

DataFrame[]

In [8]:
from pyspark.sql.functions import substring, avg, sum
from pyspark.sql.functions import col, concat_ws, concat, lit, desc, asc

#JOIN THE TABLES
flat_balst_df = ballondor \
    .join(stadiums, ballondor.Nationality == stadiums.Country) \
    .withColumnRenamed("P1", "PointsForN1") \
    .withColumnRenamed("P2", "PointsForN2") \
    .withColumnRenamed("P3", "PointsForN3") \
    .withColumnRenamed("P4", "PointsForN4") \
    .withColumnRenamed("P5", "PointsForN5") \
    .withColumnRenamed("Rankpts", "RankPoints") \
    .select("Year", "Player", "Club", "Nationality", "Confederation", "Population", "Points", "PointsForN1", "PointsForN2", "PointsForN3", "PointsForN4", "PointsForN5", "Votes", "RankPoints")

#AGGREGATE DATA
balst_per_year_df = flat_balst_df \
    .orderBy(desc("RankPoints")) \
    .groupBy("Year", "Player", "Club", "Nationality", "Confederation", "Population") \
    .agg(
        sum(flat_balst_df.Points).cast('INT').alias("Points"),
        avg(flat_balst_df.Points).alias("Points_avg"),
        sum(flat_balst_df.PointsForN1).cast('INT').alias("PointsForN1"),
        avg(flat_balst_df.PointsForN1).alias("PointsForN1_avg"),
        sum(flat_balst_df.PointsForN2).cast('INT').alias("PointsForN2"),
        avg(flat_balst_df.PointsForN2).alias("PointsForN2_avg"),
        sum(flat_balst_df.PointsForN3).cast('INT').alias("PointsForN3"),
        avg(flat_balst_df.PointsForN3).alias("PointsForN3_avg"),
        sum(flat_balst_df.PointsForN4).cast('INT').alias("PointsForN4"),
        avg(flat_balst_df.PointsForN4).alias("PointsForN4_avg"),
        sum(flat_balst_df.PointsForN5).cast('INT').alias("PointsForN5"),
        avg(flat_balst_df.PointsForN5).alias("PointsForN5_avg"),
        sum(flat_balst_df.Votes).cast('INT').alias("Votes"),
        avg(flat_balst_df.Votes).alias("Votes_avg"),
        sum(flat_balst_df.RankPoints).alias("RankPoints"),
        avg(flat_balst_df.RankPoints).alias("RankPoints_avg"),
    )

#WRITE TO DELTA TABLE
balst_per_year_df \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/demo/gold/UC4_gl.db/ballon/")

balst_per_year_df.show()

#CHECK RESULTS
spark.table("UC4_gl.ballon").count()

+----+------------------+-------------------+-----------+-------------+----------+------+----------+-----------+---------------+-----------+---------------+-----------+---------------+-----------+---------------+-----------+---------------+-----+---------+------------------+------------------+
|Year|            Player|               Club|Nationality|Confederation|Population|Points|Points_avg|PointsForN1|PointsForN1_avg|PointsForN2|PointsForN2_avg|PointsForN3|PointsForN3_avg|PointsForN4|PointsForN4_avg|PointsForN5|PointsForN5_avg|Votes|Votes_avg|        RankPoints|    RankPoints_avg|
+----+------------------+-------------------+-----------+-------------+----------+------+----------+-----------+---------------+-----------+---------------+-----------+---------------+-----------+---------------+-----------+---------------+-----+---------+------------------+------------------+
|1972|       Piet Keizer|               Ajax|Netherlands|         UEFA|  17100715|   468|      13.0|          0|   

1696

In [9]:
balst_per_year_df.toPandas()

Unnamed: 0,Year,Player,Club,Nationality,Confederation,Population,Points,Points_avg,PointsForN1,PointsForN1_avg,...,PointsForN3,PointsForN3_avg,PointsForN4,PointsForN4_avg,PointsForN5,PointsForN5_avg,Votes,Votes_avg,RankPoints,RankPoints_avg
0,1972,Piet Keizer,Ajax,Netherlands,UEFA,17100715,468.0,13.0,0.0,0.0,...,36.0,1.0,0.0,0.0,216.0,6.0,288,8.0,360.0,10.0
1,2014,Philipp Lahm,Bayern Munich,Germany,UEFA,82800000,,,,,...,,,,,,,0,0.0,813.4,8.3
2,1970,Rinus Israël,Feyenoord,Netherlands,UEFA,17100715,252.0,7.0,36.0,1.0,...,0.0,0.0,36.0,1.0,0.0,0.0,72,2.0,180.0,5.0
3,1962,Omar Sívori,Juventus,Argentina,CONMEBOL,43847430,72.0,4.0,0.0,0.0,...,18.0,1.0,0.0,0.0,18.0,1.0,36,2.0,81.0,4.5
4,2015,Zlatan Ibrahimović,Paris Saint-Germain,Sweden,UEFA,10215250,1504.0,47.0,,,...,,,,,,,0,0.0,144.0,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691,1982,Marco Tardelli,Juventus,Italy,UEFA,60483973,52.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,52.0,1.0,52,1.0,171.6,3.3
1692,1994,Krassimir Balakov,Sporting CP,Bulgaria,UEFA,7050034,48.0,3.0,0.0,0.0,...,16.0,1.0,0.0,0.0,0.0,0.0,16,1.0,49.6,3.1
1693,1961,Viktor Ponedelnik,SKA Rostov-on-Don,Russia,UEFA,144526636,136.0,4.0,0.0,0.0,...,34.0,1.0,0.0,0.0,34.0,1.0,68,2.0,98.6,2.9
1694,1978,João Resende Alves,Benfica,Portugal,UEFA,10291027,38.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,38.0,1.0,38,1.0,91.2,2.4


In [10]:
spark.sql(
    """
    GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/demo/gold/UC4_gl.db/ballon/`
    """
).show()

++
||
++
++



In [11]:
spark.sql("""
DROP TABLE IF EXISTS UC4_gl.ballon_presto
""").show()

spark.sql(
    """
    CREATE EXTERNAL TABLE IF NOT EXISTS UC4_gl.ballon_presto (
        Year INT,
        Player STRING,
        Club STRING,
        Nationality STRING,
        Population INT,
        Confederation STRING,
        Points INT,
        Points_avg DOUBLE, 
        PointsForN1 INT,
        PointsForN1_avg DOUBLE,
        PointsForN2 INT,
        PointsForN2_avg DOUBLE,
        PointsForN3 INT,
        PointsForN3_avg DOUBLE,
        PointsForN4 INT,
        PointsForN4_avg DOUBLE,
        PointsForN5 INT,
        PointsForN5_avg DOUBLE,
        Votes INT,
        Votes_avg DOUBLE,
        RankPoints DOUBLE,
        RankPoints_avg DOUBLE
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION 'hdfs://hdfs-nn:9000/demo/gold/UC4_gl.db/ballon/_symlink_format_manifest/'
    """
).show()

++
||
++
++

++
||
++
++

