In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, DecimalType, FloatType, DoubleType

In [2]:
#SPARK CONFIG
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

#Warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/demo/silver'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
spark.sql(
    """
    DROP DATABASE IF EXISTS BallonDOr CASCADE
    """
)

DataFrame[]

In [4]:
#CREATE DATABASE CALLED BALLONDOR
spark.sql(
    """
    CREATE DATABASE BallonDOr LOCATION 'hdfs://hdfs-nn:9000/demo/silver/BallonDOr.db/'
    """
)

#DROP TABLE
spark.sql(
    """
    DROP TABLE IF EXISTS BallonDOr.deltalake_table
    """
)

#CREATE A DELTALAKE TABLE CALLED BALLONDOR
spark.sql(
    """
    CREATE EXTERNAL TABLE BallonDOr.deltalake_table (
        Rank INT,
        Player STRING,
        Club STRING,
        Nationality STRING,
        Points INT,
        P1 INT,
        P2 INT,
        P3 INT,
        P4 INT,
        P5 INT,
        Votes INT,
        Rankpts DOUBLE
    )
    USING DELTA
    PARTITIONED BY (
         Year INT
    )
    LOCATION 'hdfs://hdfs-nn:9000/demo/silver/BallonDOr.db/deltalake_table/'
    """
)

DataFrame[]

In [5]:
from pyspark.sql.functions import col

#READ THE .CSV FILE IN HDFS AND PUT IT IN A DATAFRAME
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/BallonDOr.csv"

#DEFINE THE SCHEMA OF THE DATAFRAME
customSchema = StructType([
    StructField("Year", IntegerType(), True),        
    StructField("Rank", IntegerType(), True),
    StructField("Player", StringType(), True),
    StructField("Club", StringType(), True),
    StructField("Nationality", StringType(), True),
    StructField("Points", IntegerType(), True),
    StructField("P1", IntegerType(), True),
    StructField("P2", IntegerType(), True),
    StructField("P3", IntegerType(), True),
    StructField("P4", IntegerType(), True),
    StructField("P5", IntegerType(), True),
    StructField("Votes", IntegerType(), True),
    StructField("Rankpts", DoubleType(), True)
])

st_bd = spark \
    .read \
    .option("delimiter",",") \
    .option("header","true") \
    .schema(customSchema) \
    .csv(hdfs_path) \

st_bd.show()
st_bd.printSchema()

+----+----+------------------+--------------------+------------+------+---+---+---+---+---+-----+-------+
|Year|Rank|            Player|                Club| Nationality|Points| P1| P2| P3| P4| P5|Votes|Rankpts|
+----+----+------------------+--------------------+------------+------+---+---+---+---+---+-----+-------+
|1956|   1|  Stanley Matthews|           Blackpool|     England|    47|  6|  2|  2|  1|  1|   12|   50.0|
|1956|   2|Alfredo Di Stéfano|      Real Madrid CF|       Spain|    44|  5|  3|  2|  0|  1|   11|   25.0|
|1956|   3|      Raymond Kopa|      Real Madrid CF|      France|    33|  1|  2|  3|  5|  1|   12|   16.7|
|1956|   4|     Ferenc Puskás|              Honvéd|     Hungary|    32|  3|  3|  1|  1|  0|    8|   12.5|
|1956|   5|        Lev Yashin|       Dynamo Moskva|Soviet Union|    19|  0|  2|  2|  1|  3|    8|   10.0|
|1956|   6|     József Bozsik|              Honvéd|     Hungary|    15|  0|  3|  1|  0|  0|    4|    8.3|
|1956|   7|      Ernst Ocwirk|           Sampd

In [6]:
#WRITE THE DATAFRAME TO HIVE DELTALAKE TABLE
st_bd \
    .select("Rank","Player","Club","Nationality","Points","P1","P2","P3","P4","P5","Votes","Rankpts","Year") \
    .write \
    .mode("overwrite") \
    .partitionBy("Year") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/demo/silver/BallonDOr.db/deltalake_table/")

In [7]:
spark.sql(
    """
    SELECT *
    FROM BallonDOr.deltalake_table
    """
).show()

+----+--------------------+--------------------+------------+------+---+---+---+---+---+-----+-------+----+
|Rank|              Player|                Club| Nationality|Points| P1| P2| P3| P4| P5|Votes|Rankpts|Year|
+----+--------------------+--------------------+------------+------+---+---+---+---+---+-----+-------+----+
|   1|      Michel Platini|            Juventus|      France|   127| 23|  3|  0|  0|  0|   26|   50.0|1985|
|   2|Preben Elkjar-Larsen|              Verona|     Denmark|    71|  1| 13|  4|  1|  0|   19|   25.0|1985|
|   3|      Bernd Schuster|        FC Barcelona|West Germany|    46|  2|  3|  5|  4|  1|   15|   16.7|1985|
|   4|     Michael Laudrup|            Juventus|     Denmark|    14|  0|  3|  0|  0|  2|    5|   12.5|1985|
|   5|Karl-Heinz Rummen...|      Internazionale|West Germany|    13|  0|  1|  1|  3|  0|    5|   10.0|1985|
|   6|     Zbigniew Boniek|                Roma|      Poland|    12|  0|  0|  1|  4|  1|    6|    8.3|1985|
|   7|      Oleg Protassov|D

In [8]:
#TRANSFORMATIONS

In [9]:
spark.sql(
    """
    UPDATE BallonDOr.deltalake_table
    SET Nationality = REPLACE(Nationality, 'West Germany', 'Germany')
    """
)

DataFrame[]

In [10]:
spark.sql(
    """
    UPDATE BallonDOr.deltalake_table
    SET Nationality = REPLACE(Nationality, 'Germany (East)', 'Germany')
    """
)

DataFrame[]

In [11]:
spark.sql(
    """
    UPDATE BallonDOr.deltalake_table
    SET Nationality = REPLACE(Nationality, 'Soviet Union', 'Russia')
    """
)

DataFrame[]

In [13]:
spark.sql(
    """
    SELECT *
    FROM BallonDOr.deltalake_table
    """
).show()

+----+--------------------+-----------------+----------------+------+---+---+---+---+---+-----+-------+----+
|Rank|              Player|             Club|     Nationality|Points| P1| P2| P3| P4| P5|Votes|Rankpts|Year|
+----+--------------------+-----------------+----------------+------+---+---+---+---+---+-----+-------+----+
|   1|         Omar Sívori|         Juventus|           Italy|    46|  5|  3|  2|  1|  1|   12|   50.0|1961|
|   2|Luis Suárez Miram...|   Internazionale|           Spain|    40|  4|  3|  2|  1|  0|   10|   25.0|1961|
|   3|       Johnny Haynes|           Fulham|         England|    22|  2|  2|  1|  0|  1|    6|   16.7|1961|
|   4|          Lev Yashin|    Dynamo Moscow|          Russia|    21|  3|  0|  2|  0|  0|    5|   12.5|1961|
|   5|       Ferenc Puskás|   Real Madrid CF|         Hungary|    16|  1|  1|  1|  2|  0|    5|   10.0|1961|
|   6|  Alfredo Di Stéfano|   Real Madrid CF|           Spain|    13|  0|  1|  1|  2|  2|    6|    8.3|1961|
|   6|          Uwe

In [14]:
#DATA QUALITY TREATMENT

In [15]:
#LOAD THE DATAFRAME
st_bd = spark.read.format("delta").load("hdfs://hdfs-nn:9000/demo/silver/BallonDOr.db/deltalake_table")

In [16]:
#VERIFY IF EXISTS ANY DUPLICATED VALUES
import pyspark.sql.functions as f
st_bd.join(
    st_bd.groupBy(st_bd.columns).agg((f.count("*")>1).cast("int").alias("Duplicate_indicator")),
    on=st_bd.columns,
    how="inner"
).show()

+----+--------------------+------------------+--------------+------+---+---+---+---+---+-----+-------+----+-------------------+
|Rank|              Player|              Club|   Nationality|Points| P1| P2| P3| P4| P5|Votes|Rankpts|Year|Duplicate_indicator|
+----+--------------------+------------------+--------------+------+---+---+---+---+---+-----+-------+----+-------------------+
|   4|          Lev Yashin|     Dynamo Moscow|        Russia|    21|  3|  0|  2|  0|  0|    5|   12.5|1961|                  0|
|  27|        José Augusto|           Benfica|      Portugal|     1|  0|  0|  0|  0|  1|    1|    1.9|1967|                  0|
|   8|  Alexsandr Chivadze|    Dinamo Tbilisi|        Russia|     9|  0|  1|  1|  1|  0|    3|    6.3|1981|                  0|
|  24|       Tommy Gemmell|            Celtic|      Scotland|     1|  0|  0|  0|  0|  1|    1|    2.1|1968|                  0|
|   6|    Jürgen Klinsmann| Tottenham Hotspur|       Germany|    43|  1|  1|  5|  5|  9|   21|    8.3|19

In [17]:
#COUNT THE NUMBER OF LINES THAT ARE ON THE DELTALAKE TABLE
spark.sql(
    """
    Select COUNT(*) as Number_of_lines FROM BallonDOr.deltalake_table
    """
).show()

+---------------+
|Number_of_lines|
+---------------+
|           1780|
+---------------+



In [18]:
#Identify distinct patterns 
#Column Rank
st_bd.select('Rank').distinct().collect()

[Row(Rank=28),
 Row(Rank=26),
 Row(Rank=27),
 Row(Rank=12),
 Row(Rank=22),
 Row(Rank=1),
 Row(Rank=13),
 Row(Rank=6),
 Row(Rank=16),
 Row(Rank=3),
 Row(Rank=20),
 Row(Rank=5),
 Row(Rank=19),
 Row(Rank=15),
 Row(Rank=9),
 Row(Rank=17),
 Row(Rank=35),
 Row(Rank=4),
 Row(Rank=8),
 Row(Rank=23),
 Row(Rank=7),
 Row(Rank=10),
 Row(Rank=50),
 Row(Rank=25),
 Row(Rank=24),
 Row(Rank=29),
 Row(Rank=21),
 Row(Rank=32),
 Row(Rank=11),
 Row(Rank=33),
 Row(Rank=14),
 Row(Rank=2),
 Row(Rank=30),
 Row(Rank=18)]

In [19]:
#Identify of null values
st_bd.filter(st_bd.Rank.isNull()).count()

0

In [20]:
#Identify distinct patterns 
#Column Player
st_bd.select('Player').distinct().collect()

[Row(Player='Michel Platini'),
 Row(Player='Gianni Rivera'),
 Row(Player='Fabio Cannavaro'),
 Row(Player='Hakan Şükür'),
 Row(Player='Henrik Larsson'),
 Row(Player='Vujadin Boškov'),
 Row(Player='Arturo Vidal'),
 Row(Player='Hans-Peter Briegel'),
 Row(Player='Horst Hrubesch'),
 Row(Player='Vladislav Bogićević'),
 Row(Player='Dida'),
 Row(Player='Ove Kindvall'),
 Row(Player='Edinson Cavani'),
 Row(Player='Nemanja Vidić'),
 Row(Player='Costa Pereira'),
 Row(Player='John Greig'),
 Row(Player='Paul McGrath'),
 Row(Player='Dimitri Payet'),
 Row(Player='Mohamed Salah'),
 Row(Player='Sami Hyypia'),
 Row(Player='Gernot Fraydl'),
 Row(Player='Jürgen Köhler'),
 Row(Player='Mario Jardel'),
 Row(Player='Tibor Nyilasi'),
 Row(Player='Emmanuel Adebayor'),
 Row(Player='Sigfried Held'),
 Row(Player='Robert Gadocha'),
 Row(Player='Ferenc Puskás'),
 Row(Player='Włodzimierz Lubański'),
 Row(Player='Heinz Hermann'),
 Row(Player='Giorgos Sideris'),
 Row(Player='Koke'),
 Row(Player='Jari Litmanen'),
 Row(Pl

In [21]:
#Identify of null values
st_bd.filter(st_bd.Player.isNull()).count()

0

In [22]:
#Identify distinct patterns 
#Column Club
st_bd.select('Club').distinct().collect()

[Row(Club='Magdeburg'),
 Row(Club='Slovan Bratislava'),
 Row(Club='Kaiserslautern'),
 Row(Club='Karlsruher SC'),
 Row(Club='Schalke 04'),
 Row(Club='Austria Salzburg'),
 Row(Club='Santos'),
 Row(Club='Rapid București'),
 Row(Club='Lokomotiv Sofia'),
 Row(Club='Vitesse Arnhem'),
 Row(Club="Queen's Park Rangers"),
 Row(Club='Košice'),
 Row(Club='Sunderland'),
 Row(Club='Chemie Halle'),
 Row(Club='Górnik Zabrze'),
 Row(Club='Wiener SC'),
 Row(Club='Honvéd'),
 Row(Club='Beveren'),
 Row(Club='Zenit St. Petersburg'),
 Row(Club='Dynamo Moscow'),
 Row(Club='Iraklis'),
 Row(Club='Dynamo Dresden'),
 Row(Club='Austria Wien'),
 Row(Club='MTK Budapest'),
 Row(Club='Manchester United'),
 Row(Club='Legia Warsaw'),
 Row(Club='Sparta Prague'),
 Row(Club='Cardiff City'),
 Row(Club='Wismut Aue'),
 Row(Club='Angers'),
 Row(Club='First Vienna'),
 Row(Club='Arsenal'),
 Row(Club='Bologna'),
 Row(Club='Young Fellows Zürich'),
 Row(Club='Red Star Belgrade'),
 Row(Club='Ajax'),
 Row(Club='Panathinaikos'),
 Row(

In [23]:
#Identify of null values
st_bd.filter(st_bd.Club.isNull()).count()

0

In [24]:
#Identify distinct patterns 
#Column Nationality
st_bd.select('Nationality').distinct().collect()

[Row(Nationality='Russia'),
 Row(Nationality='Senegal'),
 Row(Nationality='Sweden'),
 Row(Nationality='Turkey'),
 Row(Nationality='Iraq'),
 Row(Nationality='Germany'),
 Row(Nationality='Ivory Coast'),
 Row(Nationality='France'),
 Row(Nationality='Greece'),
 Row(Nationality='Togo'),
 Row(Nationality='Algeria'),
 Row(Nationality='Wales'),
 Row(Nationality='Argentina'),
 Row(Nationality='Belgium'),
 Row(Nationality='Albania'),
 Row(Nationality='Finland'),
 Row(Nationality='Ghana'),
 Row(Nationality='Chile'),
 Row(Nationality='Czechoslovakia'),
 Row(Nationality='Croatia'),
 Row(Nationality='Nigeria'),
 Row(Nationality='Gabon'),
 Row(Nationality='Italy'),
 Row(Nationality='Norway'),
 Row(Nationality='Spain'),
 Row(Nationality='Denmark'),
 Row(Nationality='Trinidad/Tobago'),
 Row(Nationality='Ireland'),
 Row(Nationality='Ukraine'),
 Row(Nationality='Uruguay'),
 Row(Nationality='Mexico'),
 Row(Nationality='Liberia'),
 Row(Nationality='Yugoslavia'),
 Row(Nationality='Switzerland'),
 Row(Nation

In [25]:
#Identify of null values
st_bd.filter(st_bd.Nationality.isNull()).count()

0

In [26]:
#Identify distinct patterns 
#Column Points
st_bd.select('Points').distinct().collect()

[Row(Points=148),
 Row(Points=31),
 Row(Points=85),
 Row(Points=137),
 Row(Points=65),
 Row(Points=255),
 Row(Points=53),
 Row(Points=133),
 Row(Points=78),
 Row(Points=108),
 Row(Points=34),
 Row(Points=115),
 Row(Points=81),
 Row(Points=28),
 Row(Points=210),
 Row(Points=27),
 Row(Points=26),
 Row(Points=44),
 Row(Points=12),
 Row(Points=336),
 Row(Points=91),
 Row(Points=222),
 Row(Points=22),
 Row(Points=128),
 Row(Points=122),
 Row(Points=233),
 Row(Points=190),
 Row(Points=225),
 Row(Points=476),
 Row(Points=47),
 Row(Points=140),
 Row(Points=444),
 Row(Points=473),
 Row(Points=None),
 Row(Points=1),
 Row(Points=52),
 Row(Points=13),
 Row(Points=280),
 Row(Points=16),
 Row(Points=6),
 Row(Points=86),
 Row(Points=3),
 Row(Points=142),
 Row(Points=40),
 Row(Points=20),
 Row(Points=169),
 Row(Points=139),
 Row(Points=57),
 Row(Points=54),
 Row(Points=96),
 Row(Points=48),
 Row(Points=5),
 Row(Points=19),
 Row(Points=64),
 Row(Points=41),
 Row(Points=154),
 Row(Points=347),
 Row(Poin

In [27]:
#Identify of null values
st_bd.filter(st_bd.Points.isNull()).count()

115

In [28]:
#Identify distinct patterns 
#Column P1
st_bd.select('P1').distinct().collect()

[Row(P1=31),
 Row(P1=78),
 Row(P1=28),
 Row(P1=26),
 Row(P1=27),
 Row(P1=12),
 Row(P1=22),
 Row(P1=None),
 Row(P1=1),
 Row(P1=13),
 Row(P1=6),
 Row(P1=16),
 Row(P1=3),
 Row(P1=20),
 Row(P1=5),
 Row(P1=19),
 Row(P1=15),
 Row(P1=17),
 Row(P1=9),
 Row(P1=4),
 Row(P1=8),
 Row(P1=23),
 Row(P1=7),
 Row(P1=10),
 Row(P1=77),
 Row(P1=45),
 Row(P1=38),
 Row(P1=25),
 Row(P1=24),
 Row(P1=29),
 Row(P1=21),
 Row(P1=90),
 Row(P1=11),
 Row(P1=33),
 Row(P1=14),
 Row(P1=2),
 Row(P1=79),
 Row(P1=0),
 Row(P1=18)]

In [29]:
#Identify of null values
st_bd.filter(st_bd.P1.isNull()).count()

198

In [30]:
#Identify distinct patterns 
#Column P2
st_bd.select('P2').distinct().collect()

[Row(P2=28),
 Row(P2=27),
 Row(P2=12),
 Row(P2=None),
 Row(P2=1),
 Row(P2=13),
 Row(P2=6),
 Row(P2=16),
 Row(P2=3),
 Row(P2=5),
 Row(P2=41),
 Row(P2=15),
 Row(P2=9),
 Row(P2=17),
 Row(P2=4),
 Row(P2=8),
 Row(P2=23),
 Row(P2=7),
 Row(P2=10),
 Row(P2=45),
 Row(P2=21),
 Row(P2=32),
 Row(P2=33),
 Row(P2=11),
 Row(P2=14),
 Row(P2=2),
 Row(P2=0),
 Row(P2=18)]

In [31]:
#Identify of null values
st_bd.filter(st_bd.P2.isNull()).count()

198

In [32]:
#Identify distinct patterns 
#Column P3
st_bd.select('P3').distinct().collect()

[Row(P3=34),
 Row(P3=26),
 Row(P3=27),
 Row(P3=12),
 Row(P3=22),
 Row(P3=None),
 Row(P3=1),
 Row(P3=16),
 Row(P3=6),
 Row(P3=3),
 Row(P3=20),
 Row(P3=5),
 Row(P3=19),
 Row(P3=15),
 Row(P3=9),
 Row(P3=4),
 Row(P3=8),
 Row(P3=23),
 Row(P3=7),
 Row(P3=10),
 Row(P3=25),
 Row(P3=24),
 Row(P3=11),
 Row(P3=14),
 Row(P3=2),
 Row(P3=0),
 Row(P3=18)]

In [33]:
#Identify of null values
st_bd.filter(st_bd.P3.isNull()).count()

198

In [34]:
#Identify distinct patterns 
#Column P4
st_bd.select('P4').distinct().collect()

[Row(P4=22),
 Row(P4=None),
 Row(P4=1),
 Row(P4=13),
 Row(P4=6),
 Row(P4=3),
 Row(P4=20),
 Row(P4=5),
 Row(P4=19),
 Row(P4=15),
 Row(P4=9),
 Row(P4=17),
 Row(P4=4),
 Row(P4=8),
 Row(P4=7),
 Row(P4=10),
 Row(P4=33),
 Row(P4=11),
 Row(P4=14),
 Row(P4=2),
 Row(P4=30),
 Row(P4=0),
 Row(P4=18)]

In [35]:
#Identify of null values
st_bd.filter(st_bd.P4.isNull()).count()

198

In [36]:
#Identify distinct patterns 
#Column P5
st_bd.select('P5').distinct().collect()

[Row(P5=27),
 Row(P5=12),
 Row(P5=None),
 Row(P5=1),
 Row(P5=6),
 Row(P5=3),
 Row(P5=20),
 Row(P5=5),
 Row(P5=9),
 Row(P5=35),
 Row(P5=4),
 Row(P5=8),
 Row(P5=7),
 Row(P5=10),
 Row(P5=29),
 Row(P5=11),
 Row(P5=2),
 Row(P5=0),
 Row(P5=18)]

In [37]:
#Identify of null values
st_bd.filter(st_bd.P5.isNull()).count()

198

In [38]:
#Identify distinct patterns 
#Column Votes
st_bd.select('Votes').distinct().collect()

[Row(Votes=34),
 Row(Votes=28),
 Row(Votes=76),
 Row(Votes=26),
 Row(Votes=27),
 Row(Votes=44),
 Row(Votes=12),
 Row(Votes=22),
 Row(Votes=47),
 Row(Votes=1),
 Row(Votes=13),
 Row(Votes=6),
 Row(Votes=16),
 Row(Votes=3),
 Row(Votes=20),
 Row(Votes=40),
 Row(Votes=57),
 Row(Votes=96),
 Row(Votes=48),
 Row(Votes=5),
 Row(Votes=163),
 Row(Votes=19),
 Row(Votes=41),
 Row(Votes=15),
 Row(Votes=43),
 Row(Votes=37),
 Row(Votes=107),
 Row(Votes=9),
 Row(Votes=17),
 Row(Votes=35),
 Row(Votes=4),
 Row(Votes=59),
 Row(Votes=8),
 Row(Votes=23),
 Row(Votes=39),
 Row(Votes=7),
 Row(Votes=87),
 Row(Votes=63),
 Row(Votes=10),
 Row(Votes=50),
 Row(Votes=45),
 Row(Votes=38),
 Row(Votes=82),
 Row(Votes=25),
 Row(Votes=113),
 Row(Votes=24),
 Row(Votes=95),
 Row(Votes=29),
 Row(Votes=21),
 Row(Votes=32),
 Row(Votes=90),
 Row(Votes=145),
 Row(Votes=11),
 Row(Votes=33),
 Row(Votes=14),
 Row(Votes=42),
 Row(Votes=2),
 Row(Votes=79),
 Row(Votes=46),
 Row(Votes=0),
 Row(Votes=18),
 Row(Votes=36)]

In [39]:
#Identify of null values
st_bd.filter(st_bd.Votes.isNull()).count()

0

In [40]:
#Identify distinct patterns 
#Column Rankpts
st_bd.select('Rankpts').distinct().collect()

[Row(Rankpts=2.4),
 Row(Rankpts=2.9),
 Row(Rankpts=12.5),
 Row(Rankpts=4.5),
 Row(Rankpts=1.4),
 Row(Rankpts=1.7),
 Row(Rankpts=2.3),
 Row(Rankpts=16.7),
 Row(Rankpts=2.5),
 Row(Rankpts=1.0),
 Row(Rankpts=3.1),
 Row(Rankpts=25.0),
 Row(Rankpts=2.2),
 Row(Rankpts=2.8),
 Row(Rankpts=1.9),
 Row(Rankpts=50.0),
 Row(Rankpts=3.8),
 Row(Rankpts=4.2),
 Row(Rankpts=1.6),
 Row(Rankpts=2.0),
 Row(Rankpts=1.8),
 Row(Rankpts=1.5),
 Row(Rankpts=6.3),
 Row(Rankpts=2.6),
 Row(Rankpts=10.0),
 Row(Rankpts=3.6),
 Row(Rankpts=5.0),
 Row(Rankpts=2.1),
 Row(Rankpts=8.3),
 Row(Rankpts=3.3),
 Row(Rankpts=7.1),
 Row(Rankpts=5.6)]

In [41]:
#Identify of null values
st_bd.filter(st_bd.Rankpts.isNull()).count()

0

In [42]:
#Identify distinct patterns 
#Column Year
st_bd.select('Year').distinct().collect()

[Row(Year=1959),
 Row(Year=1990),
 Row(Year=1975),
 Row(Year=1977),
 Row(Year=2003),
 Row(Year=2007),
 Row(Year=2018),
 Row(Year=1974),
 Row(Year=2015),
 Row(Year=2006),
 Row(Year=1978),
 Row(Year=1961),
 Row(Year=2013),
 Row(Year=1956),
 Row(Year=1997),
 Row(Year=1988),
 Row(Year=1994),
 Row(Year=1968),
 Row(Year=2014),
 Row(Year=1973),
 Row(Year=1979),
 Row(Year=1971),
 Row(Year=1966),
 Row(Year=2004),
 Row(Year=1991),
 Row(Year=1967),
 Row(Year=1969),
 Row(Year=1982),
 Row(Year=1957),
 Row(Year=1996),
 Row(Year=1989),
 Row(Year=1998),
 Row(Year=1963),
 Row(Year=1965),
 Row(Year=1985),
 Row(Year=1960),
 Row(Year=1970),
 Row(Year=2012),
 Row(Year=1987),
 Row(Year=2009),
 Row(Year=2016),
 Row(Year=1995),
 Row(Year=1980),
 Row(Year=2001),
 Row(Year=1972),
 Row(Year=1958),
 Row(Year=1992),
 Row(Year=1983),
 Row(Year=2005),
 Row(Year=1984),
 Row(Year=2000),
 Row(Year=1964),
 Row(Year=1981),
 Row(Year=2010),
 Row(Year=1986),
 Row(Year=2011),
 Row(Year=1976),
 Row(Year=2008),
 Row(Year=2017

In [43]:
#Identify of null values
st_bd.filter(st_bd.Year.isNull()).count()

0

In [44]:
st_bd \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/demo/silver/BallonDOr.db/deltalake_table/")