In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
import pyspark.sql.functions as f

In [2]:
# CONFIGURAR O SPARK
warehouse_location = 'hdfs://hdfs-nn:9000/demo/silver/projeto'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
#DROP DO DB
spark.sql(
    """
    DROP DATABASE IF EXISTS transfers CASCADE
    """
)
# CRIAR A DATABASE - transfers.db
spark.sql(
    """
    CREATE DATABASE transfers LOCATION 'hdfs://hdfs-nn:9000/demo/silver/projeto/transfers.db/'
    """
)
#DROP DO DELTALAKE TABLE
spark.sql(
    """
    DROP TABLE IF EXISTS transfers.deltalake_table 
    """
)
# CRIAR O DELTALAKE TABLE
spark.sql(
    """
    CREATE EXTERNAL TABLE transfers.deltalake_table (
        Name CHAR(100),
        Position CHAR(100),
        Age INT,       
        Team_from CHAR(100),
        League_from CHAR(100),
        Team_to CHAR(100),
        League_to CHAR(100),
        Transfer_fee INT
        
    )
    USING DELTA
    PARTITIONED BY (Season CHAR(100)
    )
    LOCATION 'hdfs://hdfs-nn:9000/demo/silver/projeto/transfers.db/deltalake_table/'
    """
)

DataFrame[]

In [4]:
spark.sql(
    """
    SHOW DATABASES
    """
).show()

+--------------+
|     namespace|
+--------------+
|       default|
|          demo|
|gold_transfers|
|    mls_salary|
|      products|
|         sales|
|     transfers|
+--------------+



In [5]:
spark.sql(
    """
    SHOW TABLES FROM transfers
    """
).show()

+---------+---------------+-----------+
| database|      tableName|isTemporary|
+---------+---------------+-----------+
|transfers|deltalake_table|      false|
+---------+---------------+-----------+



In [6]:
# VER O DELTALAKE TABLE
spark.sql(
    """
    SELECT *
    FROM transfers.deltalake_table
    """
).show()

+----+--------+---+---------+-----------+-------+---------+------------+------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Transfer_fee|Season|
+----+--------+---+---------+-----------+-------+---------+------------+------+
+----+--------+---+---------+-----------+-------+---------+------------+------+



In [7]:
spark.sql(
    """
    DESCRIBE FORMATTED transfers.deltalake_table
    """
).toPandas()

Unnamed: 0,col_name,data_type,comment
0,Name,string,
1,Position,string,
2,Age,int,
3,Team_from,string,
4,League_from,string,
5,Team_to,string,
6,League_to,string,
7,Transfer_fee,int,
8,Season,string,
9,,,


In [8]:
# LER O FICHEIRO CSV NO HDFS E COLOCAR NUM DATAFRAME
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/projeto/top250-00-19.csv"
# DEFINIR OS ESQUEMA DO DATAFRAME

customSchema = StructType([
    StructField("Name", StringType(), True),        
    StructField("Position", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Team_from", StringType(), True),
    StructField("League_from", StringType(), True),
    StructField("Team_to", StringType(), True),
    StructField("League_to", StringType(), True),
    StructField("Season", StringType(), True),
    StructField("Transfer_fee", IntegerType(), True),  
])

transfers_df = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)
transfers_df.show()
transfers_df.printSchema()

+--------------------+------------------+---+---------------+--------------+-------------+--------------+---------+------------+
|                Name|          Position|Age|      Team_from|   League_from|      Team_to|     League_to|   Season|Transfer_fee|
+--------------------+------------------+---+---------------+--------------+-------------+--------------+---------+------------+
|           Luís Figo|      Right Winger| 27|   FC Barcelona|        LaLiga|  Real Madrid|        LaLiga|2000-2001|        null|
|       Hernán Crespo|    Centre-Forward| 25|          Parma|       Serie A|        Lazio|       Serie A|2000-2001|        null|
|       Marc Overmars|       Left Winger| 27|        Arsenal|Premier League| FC Barcelona|        LaLiga|2000-2001|        null|
|   Gabriel Batistuta|    Centre-Forward| 31|     Fiorentina|       Serie A|      AS Roma|       Serie A|2000-2001|        null|
|      Nicolas Anelka|    Centre-Forward| 21|    Real Madrid|        LaLiga|     Paris SG|       

In [9]:
#VIEW
transfers_df.createOrReplaceTempView ("TransfersTemp")

In [10]:
spark.sql(
    """
    SELECT *
    FROM TransfersTemp
    """
).show()

+--------------------+------------------+---+---------------+--------------+-------------+--------------+---------+------------+
|                Name|          Position|Age|      Team_from|   League_from|      Team_to|     League_to|   Season|Transfer_fee|
+--------------------+------------------+---+---------------+--------------+-------------+--------------+---------+------------+
|           Luís Figo|      Right Winger| 27|   FC Barcelona|        LaLiga|  Real Madrid|        LaLiga|2000-2001|        null|
|       Hernán Crespo|    Centre-Forward| 25|          Parma|       Serie A|        Lazio|       Serie A|2000-2001|        null|
|       Marc Overmars|       Left Winger| 27|        Arsenal|Premier League| FC Barcelona|        LaLiga|2000-2001|        null|
|   Gabriel Batistuta|    Centre-Forward| 31|     Fiorentina|       Serie A|      AS Roma|       Serie A|2000-2001|        null|
|      Nicolas Anelka|    Centre-Forward| 21|    Real Madrid|        LaLiga|     Paris SG|       

In [11]:
# ESCREVER A TABELA ANTERIOR NO DELTALAKE
transfers_df
transfers_df \
    .select("Name","Position","Age","Team_from","League_from", "Team_to","League_to", "Transfer_fee", "Season") \
    .write \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .partitionBy("Season") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/demo/silver/projeto/transfers.db/deltalake_table/")

In [12]:
spark.sql(
    """
    SELECT * FROM transfers.deltalake_table
    """
).show()

+--------------------+------------------+---+---------------+--------------+--------------+--------------+------------+---------+
|                Name|          Position|Age|      Team_from|   League_from|       Team_to|     League_to|Transfer_fee|   Season|
+--------------------+------------------+---+---------------+--------------+--------------+--------------+------------+---------+
|      Michael Essien|Defensive Midfield| 22| Olympique Lyon|       Ligue 1|       Chelsea|Premier League|    45000000|2005-2006|
|Shaun Wright-Phil...|      Right Winger| 23|       Man City|Premier League|       Chelsea|Premier League|     9500000|2005-2006|
|        Sergio Ramos|       Centre-Back| 19|     Sevilla FC|        LaLiga|   Real Madrid|        LaLiga|    27000000|2005-2006|
|        Michael Owen|    Centre-Forward| 25|    Real Madrid|        LaLiga|     Newcastle|Premier League|    22000000|2005-2006|
|   Alberto Gilardino|    Centre-Forward| 23|          Parma|       Serie A|      AC Milan

In [13]:
# FAZER VERIFICAÇÕES
spark.sql(
    """
    Select COUNT(*) FROM transfers.deltalake_table
    """
).show()

+--------+
|count(1)|
+--------+
|    4700|
+--------+



In [14]:
#ELIMINAR AS LINHAS ONDE O VALOR DA TRANSFERÊNCIA NÃO ESTÁ DEFINIDO
#Isto só está a eliminar no dataframe - não pode estar no deltalake
spark.sql(
"""
DELETE FROM transfers.deltalake_table
WHERE Transfer_fee is null
"""
).show()

++
||
++
++



In [15]:
# VERIFICAR QUANTAS LINHAS FORAM ELIMINADAS
spark.sql(
    """
    Select COUNT(*) FROM transfers.deltalake_table
    """
).show()

+--------+
|count(1)|
+--------+
|    3440|
+--------+



In [16]:
# IDENTIFICAR AS LINHAS DUPLICADAS
import pyspark.sql.functions as f
transfers_df.join(
  transfers_df.groupBy(transfers_df.columns).agg((f.count("*")>1).cast("int").alias("Duplicate_indicator")),
    on=transfers_df.columns,
    how="inner"
).show()

+------------------+------------------+---+---------------+------------------+--------------+--------------+---------+------------+-------------------+
|              Name|          Position|Age|      Team_from|       League_from|       Team_to|     League_to|   Season|Transfer_fee|Duplicate_indicator|
+------------------+------------------+---+---------------+------------------+--------------+--------------+---------+------------+-------------------+
|Cristian Silvestri|       Centre-Back| 29|          Lecce|           Serie A|       Catania|       Serie B|2004-2005|     1250000|                  0|
|      Sergey Semak|  Central Midfield| 29|       Paris SG|           Ligue 1|     FK Moskau|  Premier Liga|2005-2006|     2500000|                  0|
|    Ricardo Osorio|        Right-Back| 26|   CD Cruz Azul|  Liga MX Clausura| VfB Stuttgart|  1.Bundesliga|2006-2007|     3000000|                  0|
|   Nenad Kovacevic|Defensive Midfield| 25|       Red Star|         SuperLiga|          

In [17]:
#VER TODOS OS VALORES DISTINCTOS DE CADA COLUNA 
transfers_df.select('Name').distinct().collect()

[Row(Name='Robert Pirès'),
 Row(Name="Patrick M'Boma"),
 Row(Name='John Curtis'),
 Row(Name='Claudio Pizarro'),
 Row(Name='Robert Kovac'),
 Row(Name='Matthew Upson'),
 Row(Name='Marco Storari'),
 Row(Name='Bosingwa'),
 Row(Name='Damian Gorawski'),
 Row(Name='John Welsh'),
 Row(Name='Welliton'),
 Row(Name='Heurelho Gomes'),
 Row(Name='Mauro Formica'),
 Row(Name='Anwar El Ghazi'),
 Row(Name='Lucas Castro'),
 Row(Name='Fábio Rochemback'),
 Row(Name='Fabio Cannavaro'),
 Row(Name='Christian Timm'),
 Row(Name='David Dunn'),
 Row(Name='Sören Larsen'),
 Row(Name='David Odonkor'),
 Row(Name='Arturo Vidal'),
 Row(Name='Ismail Aissati'),
 Row(Name='Andrea Rispoli'),
 Row(Name='Antonio Candreva'),
 Row(Name='Raúl Jiménez'),
 Row(Name='Isaac Brizuela'),
 Row(Name='Yuhao Zhao'),
 Row(Name='Lorenzo Pellegrini'),
 Row(Name='Raúl Ruidíaz'),
 Row(Name='Craig Hignett'),
 Row(Name='Dean Richards'),
 Row(Name='Ianis Zicu'),
 Row(Name='Denis Kolodin'),
 Row(Name='Luton Shelton'),
 Row(Name='Wellington'),
 R

In [18]:
transfers_df.select('Position').distinct().collect()

[Row(Position='Centre-Back'),
 Row(Position='Right-Back'),
 Row(Position='Sweeper'),
 Row(Position='Central Midfield'),
 Row(Position='Defender'),
 Row(Position='Attacking Midfield'),
 Row(Position='Defensive Midfield'),
 Row(Position='Right Winger'),
 Row(Position='Left Midfield'),
 Row(Position='Centre-Forward'),
 Row(Position='Right Midfield'),
 Row(Position='Second Striker'),
 Row(Position='Goalkeeper'),
 Row(Position='Forward'),
 Row(Position='Left Winger'),
 Row(Position='Left-Back'),
 Row(Position='Midfielder')]

In [19]:
transfers_df.select('Age').distinct().collect()

[Row(Age=31),
 Row(Age=34),
 Row(Age=28),
 Row(Age=27),
 Row(Age=26),
 Row(Age=22),
 Row(Age=16),
 Row(Age=20),
 Row(Age=19),
 Row(Age=15),
 Row(Age=17),
 Row(Age=35),
 Row(Age=23),
 Row(Age=25),
 Row(Age=24),
 Row(Age=29),
 Row(Age=21),
 Row(Age=32),
 Row(Age=33),
 Row(Age=30),
 Row(Age=0),
 Row(Age=18)]

In [20]:
transfers_df.select('Team_from').distinct().collect()

[Row(Team_from='Morelia'),
 Row(Team_from='Coritiba FC'),
 Row(Team_from='Beerschot AC'),
 Row(Team_from='Espanyol'),
 Row(Team_from='Karlsruher SC'),
 Row(Team_from='FC Basel'),
 Row(Team_from='Bröndby IF'),
 Row(Team_from='Inter U19'),
 Row(Team_from='Tigres UANL'),
 Row(Team_from='SC Freiburg'),
 Row(Team_from='Asante Kotoko'),
 Row(Team_from='Samsunspor'),
 Row(Team_from='Brighton'),
 Row(Team_from='QD Jonoon'),
 Row(Team_from='SSV Reutlingen'),
 Row(Team_from='Bolton'),
 Row(Team_from='Nordsjaelland'),
 Row(Team_from='Toronto FC'),
 Row(Team_from='Metalurh Z.'),
 Row(Team_from='FC St. Gallen'),
 Row(Team_from='Independiente'),
 Row(Team_from='Inter Bratislava'),
 Row(Team_from='Racing Club'),
 Row(Team_from='BJ Sinobo Guoan'),
 Row(Team_from='Liaoning FC'),
 Row(Team_from='Pol. Warsaw'),
 Row(Team_from='Hellas Verona'),
 Row(Team_from='Avaí FC'),
 Row(Team_from='Besiktas'),
 Row(Team_from='Revolution'),
 Row(Team_from='Lada Togliatti'),
 Row(Team_from='Chicago Fire'),
 Row(Team_fr

In [21]:
transfers_df.select('League_from').distinct().collect()

[Row(League_from='Serie A'),
 Row(League_from=' Uruguay'),
 Row(League_from='Jupiler Pro League'),
 Row(League_from=' Tunisia'),
 Row(League_from=' Moldova'),
 Row(League_from='Primavera B'),
 Row(League_from='Liga MX Clausura'),
 Row(League_from=' Colombia'),
 Row(League_from='Liga Águila II'),
 Row(League_from='A Grupa - Championship gr.'),
 Row(League_from='NB I.'),
 Row(League_from='Challenge League'),
 Row(League_from='U19 Eredivisie'),
 Row(League_from='Serie C - A'),
 Row(League_from=' Iran'),
 Row(League_from='Premier League'),
 Row(League_from='Primera B Nacional'),
 Row(League_from='Premier Liga'),
 Row(League_from=' Croatia'),
 Row(League_from='K League 1'),
 Row(League_from=' Australia'),
 Row(League_from='1.Liga gr. 1'),
 Row(League_from='Ligue I Pro'),
 Row(League_from=' Ghana'),
 Row(League_from='Eredivisie'),
 Row(League_from='Rel. Ligue 1'),
 Row(League_from='Liga MX Apertura'),
 Row(League_from='Primera División'),
 Row(League_from='LaLiga'),
 Row(League_from='J1 - 2n

In [22]:
transfers_df.select('Team_to').distinct().collect()

[Row(Team_to='Espanyol'),
 Row(Team_to='FC Basel'),
 Row(Team_to='Tigres UANL'),
 Row(Team_to='Bröndby IF'),
 Row(Team_to='SC Freiburg'),
 Row(Team_to='Brighton'),
 Row(Team_to='Bolton'),
 Row(Team_to='Toronto FC'),
 Row(Team_to='Independiente'),
 Row(Team_to='Racing Club'),
 Row(Team_to='BJ Sinobo Guoan'),
 Row(Team_to='Liaoning FC'),
 Row(Team_to='Hellas Verona'),
 Row(Team_to='Besiktas'),
 Row(Team_to='Al-Wahda FC'),
 Row(Team_to='Benfica B'),
 Row(Team_to='G. Bordeaux'),
 Row(Team_to='Sunderland'),
 Row(Team_to='Al-Wasl'),
 Row(Team_to='Botafogo'),
 Row(Team_to='Ternana'),
 Row(Team_to='Al Duhail'),
 Row(Team_to='Braga'),
 Row(Team_to='Atlanta United'),
 Row(Team_to='Nottm Forest'),
 Row(Team_to='SH Shenhua'),
 Row(Team_to='Pohang Steelers'),
 Row(Team_to='FC Barcelona B'),
 Row(Team_to='Montpellier'),
 Row(Team_to='Flamengo'),
 Row(Team_to='Ulsan'),
 Row(Team_to='Dep. La Coruña'),
 Row(Team_to='Anderlecht U19'),
 Row(Team_to='Iraklis'),
 Row(Team_to='Reggina'),
 Row(Team_to='Atlét

In [23]:
transfers_df.select('League_to').distinct().collect()

[Row(League_to='Serie A'),
 Row(League_to='Jupiler Pro League'),
 Row(League_to=' Uruguay'),
 Row(League_to='Primavera B'),
 Row(League_to='Liga MX Clausura'),
 Row(League_to='Premier League'),
 Row(League_to='Premier Liga'),
 Row(League_to=' Croatia'),
 Row(League_to=' Israel'),
 Row(League_to='Eredivisie'),
 Row(League_to='Liga MX Apertura'),
 Row(League_to='Primera División'),
 Row(League_to='LaLiga'),
 Row(League_to='J1 - 2nd Stage'),
 Row(League_to=' Denmark'),
 Row(League_to='Ligue 2'),
 Row(League_to=' United Arab Emirates'),
 Row(League_to='Bundesliga'),
 Row(League_to=' Qatar'),
 Row(League_to='Championship'),
 Row(League_to='Série A'),
 Row(League_to='J2 League'),
 Row(League_to='LaLiga2'),
 Row(League_to='League One'),
 Row(League_to=' Belgium'),
 Row(League_to='Serie C - B'),
 Row(League_to='Super League'),
 Row(League_to=' Russia'),
 Row(League_to=' Romania'),
 Row(League_to='Stars League'),
 Row(League_to=' Libya'),
 Row(League_to='SuperLiga'),
 Row(League_to='Professiona

In [24]:
transfers_df.select('Transfer_fee').distinct().collect()

[Row(Transfer_fee=450000),
 Row(Transfer_fee=70000000),
 Row(Transfer_fee=8200000),
 Row(Transfer_fee=950000),
 Row(Transfer_fee=4000000),
 Row(Transfer_fee=120000000),
 Row(Transfer_fee=500000),
 Row(Transfer_fee=16700000),
 Row(Transfer_fee=5750000),
 Row(Transfer_fee=850000),
 Row(Transfer_fee=12000000),
 Row(Transfer_fee=7350000),
 Row(Transfer_fee=24500000),
 Row(Transfer_fee=25000000),
 Row(Transfer_fee=18500000),
 Row(Transfer_fee=8250000),
 Row(Transfer_fee=7000000),
 Row(Transfer_fee=125000),
 Row(Transfer_fee=840000),
 Row(Transfer_fee=1750000),
 Row(Transfer_fee=5800000),
 Row(Transfer_fee=4450000),
 Row(Transfer_fee=22800000),
 Row(Transfer_fee=4900000),
 Row(Transfer_fee=2900000),
 Row(Transfer_fee=9000000),
 Row(Transfer_fee=18000000),
 Row(Transfer_fee=20000000),
 Row(Transfer_fee=100000),
 Row(Transfer_fee=26500000),
 Row(Transfer_fee=1900000),
 Row(Transfer_fee=13500000),
 Row(Transfer_fee=None),
 Row(Transfer_fee=4700000),
 Row(Transfer_fee=1600000),
 Row(Transfer_fee

In [25]:
transfers_df.select('Season').distinct().collect()

[Row(Season='2000-2001'),
 Row(Season='2003-2004'),
 Row(Season='2011-2012'),
 Row(Season='2012-2013'),
 Row(Season='2004-2005'),
 Row(Season='2013-2014'),
 Row(Season='2010-2011'),
 Row(Season='2016-2017'),
 Row(Season='2007-2008'),
 Row(Season='2014-2015'),
 Row(Season='2006-2007'),
 Row(Season='2017-2018'),
 Row(Season='2002-2003'),
 Row(Season='2018-2019'),
 Row(Season='2005-2006'),
 Row(Season='2009-2010'),
 Row(Season='2008-2009'),
 Row(Season='2015-2016'),
 Row(Season='2001-2002')]

In [26]:
#MOSTRA OS VALORES NULOS DE CADA COLUNA
transfers_df.filter(transfers_df.Age.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [27]:
transfers_df.filter(transfers_df.Transfer_fee.isNull()).show()

+--------------------+------------------+---+---------------+--------------+-------------+--------------+---------+------------+
|                Name|          Position|Age|      Team_from|   League_from|      Team_to|     League_to|   Season|Transfer_fee|
+--------------------+------------------+---+---------------+--------------+-------------+--------------+---------+------------+
|           Luís Figo|      Right Winger| 27|   FC Barcelona|        LaLiga|  Real Madrid|        LaLiga|2000-2001|        null|
|       Hernán Crespo|    Centre-Forward| 25|          Parma|       Serie A|        Lazio|       Serie A|2000-2001|        null|
|       Marc Overmars|       Left Winger| 27|        Arsenal|Premier League| FC Barcelona|        LaLiga|2000-2001|        null|
|   Gabriel Batistuta|    Centre-Forward| 31|     Fiorentina|       Serie A|      AS Roma|       Serie A|2000-2001|        null|
|      Nicolas Anelka|    Centre-Forward| 21|    Real Madrid|        LaLiga|     Paris SG|       

In [28]:
transfers_df.filter(transfers_df.Name.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [29]:
transfers_df.filter(transfers_df.Position.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [30]:
transfers_df.filter(transfers_df.Team_from.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [31]:
transfers_df.filter(transfers_df.Team_to.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [32]:
transfers_df.filter(transfers_df.League_from.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [33]:
transfers_df.filter(transfers_df.League_to.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [34]:
transfers_df.filter(transfers_df.Season.isNull()).show()

+----+--------+---+---------+-----------+-------+---------+------+------------+
|Name|Position|Age|Team_from|League_from|Team_to|League_to|Season|Transfer_fee|
+----+--------+---+---------+-----------+-------+---------+------+------------+
+----+--------+---+---------+-----------+-------+---------+------+------------+



In [35]:
transfers_df \
    .select("Name","Position","Age","Team_from","League_from", "Team_to","League_to", "Transfer_fee", "Season") \
    .write \
    .mode("overwrite") \
    .partitionBy("Season") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/demo/silver/projeto/transfers.db/deltalake_table/")

In [36]:
spark.stop()