In [3]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import *
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType

In [4]:
# Spark configuration: a Hive-enabled session with the Delta Lake SQL
# extension and catalog, using the "silver" layer on HDFS as warehouse dir.
warehouse_location = 'hdfs://hdfs-nn:9000/demo/silver'

# The chained builder calls are wrapped in parentheses rather than joined with
# backslashes, so the expression no longer ends in a dangling "\" that only
# works because the following line happens to be empty.
builder = (
    SparkSession.builder
    .appName("Python Spark")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original assigned `spark` twice in one statement).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
# Drop the league_stats database if it exists (CASCADE is part of the statement)
drop_database_sql = """
    DROP DATABASE IF EXISTS league_stats CASCADE
    """
spark.sql(drop_database_sql)

DataFrame[]

In [6]:
# Create the league_stats database at its HDFS location
create_database_sql = """
    CREATE DATABASE league_stats LOCATION 'hdfs://hdfs-nn:9000/demo/silver/league_stats.db/'
    """
spark.sql(create_database_sql)

# Drop the Delta Lake table if it is already registered
drop_table_sql = """
    DROP TABLE IF EXISTS league_stats.deltalake_table
    """
spark.sql(drop_table_sql)

# Create the Delta Lake table (partitioned by Div) at its HDFS location
create_table_sql = """
    CREATE EXTERNAL TABLE league_stats.deltalake_table (
        HomeTeam STRING,
        AwayTeam STRING,
        FTR STRING,
        Date STRING,
        FTHG INT,
        FTAG INT,
        HS INT,
        AS INT,
        HST INT,
        AST INT,
        HC INT,
        AC INT
    )
    USING DELTA
    PARTITIONED BY (
         Div STRING
    )
    LOCATION 'hdfs://hdfs-nn:9000/demo/silver/league_stats.db/deltalake_table/'
    """
spark.sql(create_table_sql)

DataFrame[]

In [8]:
# Read the .csv file from HDFS into a DataFrame
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/league_stats.csv"

# DataFrame schema: (column name, type) pairs in file order.
# Every field is declared nullable.
_str, _int, _float = StringType(), IntegerType(), FloatType()
_csv_columns = [
    ("Div", _str), ("Date", _str), ("HomeTeam", _str), ("AwayTeam", _str),
    ("FTHG", _int), ("FTAG", _int), ("FTR", _str),
    ("HTHG", _int), ("HTAG", _int), ("HTR", _str),
    ("Referee", _str),
    ("HS", _int), ("AS", _int), ("HST", _int), ("AST", _int),
    ("HF", _int), ("AF", _int), ("HC", _int), ("AC", _int),
    ("HY", _int), ("AY", _int), ("HR", _int), ("AR", _int),
    ("B365H", _float), ("B365D", _float), ("B365A", _float),
    ("BWH", _float), ("BWD", _float), ("BWA", _float),
    ("IWH", _float), ("IWD", _float), ("IWA", _float),
    ("PSH", _float), ("PSD", _float), ("PSA", _float),
    ("WHH", _float), ("WHD", _float), ("WHA", _float),
    ("VCH", _float), ("VCD", _float), ("VCA", _float),
    ("Bb1X2", _int),
    ("BbMxH", _float), ("BbAvH", _float),
    ("BbMxD", _float), ("BbAvD", _float),
    ("BbMxA", _float), ("BbAvA", _float),
    ("BbOU", _int),
    ("BbMx>2.5", _float), ("BbAv>2.5", _float),
    ("BbMx<2.5", _float), ("BbAv<2.5", _float),
    ("BbAH", _int), ("BbAHh", _float),
    ("BbMxAHH", _float), ("BbAvAHH", _float),
    ("BbMxAHA", _float), ("BbAvAHA", _float),
    ("PSCH", _float), ("PSCD", _float), ("PSCA", _float),
]
customSchema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _csv_columns]
)

# Comma-delimited file with a header row, parsed with the explicit schema;
# duplicated rows are dropped.
csv_reader = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(customSchema)
)
sales_df = csv_reader.csv(hdfs_path).drop_duplicates()

sales_df.show()
sales_df.printSchema()

+---+--------+--------------+-----------+----+----+---+----+----+---+----------+---+---+---+---+---+---+---+---+---+---+---+----+-----+-----+-----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+-----+-----+-----+-----+-----+-----+----+--------+--------+--------+--------+----+-----+-------+-------+-------+-------+----+-----+-----+
|Div|    Date|      HomeTeam|   AwayTeam|FTHG|FTAG|FTR|HTHG|HTAG|HTR|   Referee| HS| AS|HST|AST| HF| AF| HC| AC| HY| AY| HR|  AR|B365H|B365D|B365A| BWH| BWD| BWA| IWH| IWD| IWA| PSH| PSD| PSA| WHH| WHD|  WHA| VCH| VCD| VCA|Bb1X2|BbMxH|BbAvH|BbMxD|BbAvD|BbMxA|BbAvA|BbOU|BbMx>2.5|BbAv>2.5|BbMx<2.5|BbAv<2.5|BbAH|BbAHh|BbMxAHH|BbAvAHH|BbMxAHA|BbAvAHA|PSCH| PSCD| PSCA|
+---+--------+--------------+-----------+----+----+---+----+----+---+----------+---+---+---+---+---+---+---+---+---+---+---+----+-----+-----+-----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-

In [9]:
# Write the selected columns of the DataFrame to the Delta Lake table path,
# overwriting what is there and partitioning by Div
delta_columns = ["HomeTeam", "AwayTeam", "FTR", "Date", "HS", "AS",
                 "FTHG", "FTAG", "HST", "AST", "HC", "AC", "Div"]
(
    sales_df
    .select(*delta_columns)
    .write
    .mode("overwrite")
    .partitionBy("Div")
    .format("delta")
    .save("hdfs://hdfs-nn:9000/demo/silver/league_stats.db/deltalake_table/")
)

In [10]:
# Show the Delta Lake table
select_all_sql = """
    SELECT *
    FROM league_stats.deltalake_table
    """
spark.sql(select_all_sql).show()

+--------------+-----------+---+--------+----+----+---+---+---+---+---+---+---+
|      HomeTeam|   AwayTeam|FTR|    Date|FTHG|FTAG| HS| AS|HST|AST| HC| AC|Div|
+--------------+-----------+---+--------+----+----+---+---+---+---+---+---+---+
|     Liverpool|    Arsenal|  D|21/12/14|   2|   2| 27|  7| 10|  3| 10|  6| E0|
|       Swansea|Aston Villa|  H|26/12/14|   1|   0|  7| 11|  2|  3|  4|  0| E0|
|     Liverpool|  Leicester|  D|01/01/15|   2|   2| 18| 16|  6|  3|  4|  5| E0|
|      West Ham|  Newcastle|  H|14/09/15|   2|   0| 17| 14|  3|  4|  6|  7| E0|
|      Man City|  Newcastle|  H|03/10/15|   6|   1| 23|  6| 11|  4|  6|  3| E0|
|    Sunderland|Southampton|  A|07/11/15|   0|   1|  9| 11|  3|  4| 10| 11| E0|
|   Aston Villa|    Everton|  A|01/03/16|   1|   3| 14| 10|  5|  7|  9|  8| E0|
|    Man United|    Watford|  H|02/03/16|   1|   0| 14| 13|  3|  3|  9|  7| E0|
|       Norwich|   Man City|  D|12/03/16|   0|   0|  5| 15|  0|  3|  3|  8| E0|
|      Man City|    Watford|  H|14/12/16

In [8]:
#TRANSFORMATIONS

In [11]:
# Show the Delta Lake table
table_preview = spark.sql(
    """
    SELECT *
    FROM league_stats.deltalake_table
    """
)
table_preview.show()

+--------------+-----------+---+--------+----+----+---+---+---+---+---+---+---+
|      HomeTeam|   AwayTeam|FTR|    Date|FTHG|FTAG| HS| AS|HST|AST| HC| AC|Div|
+--------------+-----------+---+--------+----+----+---+---+---+---+---+---+---+
|     Liverpool|    Arsenal|  D|21/12/14|   2|   2| 27|  7| 10|  3| 10|  6| E0|
|       Swansea|Aston Villa|  H|26/12/14|   1|   0|  7| 11|  2|  3|  4|  0| E0|
|     Liverpool|  Leicester|  D|01/01/15|   2|   2| 18| 16|  6|  3|  4|  5| E0|
|      West Ham|  Newcastle|  H|14/09/15|   2|   0| 17| 14|  3|  4|  6|  7| E0|
|      Man City|  Newcastle|  H|03/10/15|   6|   1| 23|  6| 11|  4|  6|  3| E0|
|    Sunderland|Southampton|  A|07/11/15|   0|   1|  9| 11|  3|  4| 10| 11| E0|
|   Aston Villa|    Everton|  A|01/03/16|   1|   3| 14| 10|  5|  7|  9|  8| E0|
|    Man United|    Watford|  H|02/03/16|   1|   0| 14| 13|  3|  3|  9|  7| E0|
|       Norwich|   Man City|  D|12/03/16|   0|   0|  5| 15|  0|  3|  3|  8| E0|
|      Man City|    Watford|  H|14/12/16

In [10]:
# Rewrite the Div (division) column from league codes to country names:
#   D1 -> Germany, I1 -> Italy, E0 -> England
#
# One UPDATE with an exact-match CASE is used instead of three separate
# REPLACE-based UPDATEs without a WHERE clause:
#   * the table is updated in a single statement rather than three;
#   * only rows that still carry a league code are touched (WHERE clause);
#   * codes are matched as whole values, so a value that merely contains
#     'D1', 'I1' or 'E0' as a substring is left alone.
spark.sql(
    """
    UPDATE league_stats.deltalake_table
    SET Div = CASE Div
                  WHEN 'D1' THEN 'Germany'
                  WHEN 'I1' THEN 'Italy'
                  WHEN 'E0' THEN 'England'
                  ELSE Div
              END
    WHERE Div IN ('D1', 'I1', 'E0')
    """
)

DataFrame[]

In [11]:
# Rewrite the FTR column from one-letter codes to words:
#   H -> Home, A -> Away, D -> Draw
#
# The codes are matched as whole values in a single UPDATE. The previous
# substring REPLACE calls were not safe to re-run: REPLACE(FTR, 'H', 'Home')
# applied to an already converted 'Home' yields 'Homeome'. With the exact
# match and the WHERE clause, running this cell again is a no-op.
spark.sql(
    """
    UPDATE league_stats.deltalake_table
    SET FTR = CASE FTR
                  WHEN 'H' THEN 'Home'
                  WHEN 'A' THEN 'Away'
                  WHEN 'D' THEN 'Draw'
                  ELSE FTR
              END
    WHERE FTR IN ('H', 'A', 'D')
    """
)

DataFrame[]

In [12]:
# Show the Delta Lake table (now with the changes applied)
updated_table_sql = """
    SELECT *
    FROM league_stats.deltalake_table
    """
spark.sql(updated_table_sql).show()

+--------------+-----------+---+--------+----+----+---+---+---+---+---+---+---+
|      HomeTeam|   AwayTeam|FTR|    Date|FTHG|FTAG| HS| AS|HST|AST| HC| AC|Div|
+--------------+-----------+---+--------+----+----+---+---+---+---+---+---+---+
|     Liverpool|    Arsenal|  D|21/12/14|   2|   2| 27|  7| 10|  3| 10|  6| E0|
|       Swansea|Aston Villa|  H|26/12/14|   1|   0|  7| 11|  2|  3|  4|  0| E0|
|     Liverpool|  Leicester|  D|01/01/15|   2|   2| 18| 16|  6|  3|  4|  5| E0|
|      West Ham|  Newcastle|  H|14/09/15|   2|   0| 17| 14|  3|  4|  6|  7| E0|
|      Man City|  Newcastle|  H|03/10/15|   6|   1| 23|  6| 11|  4|  6|  3| E0|
|    Sunderland|Southampton|  A|07/11/15|   0|   1|  9| 11|  3|  4| 10| 11| E0|
|   Aston Villa|    Everton|  A|01/03/16|   1|   3| 14| 10|  5|  7|  9|  8| E0|
|    Man United|    Watford|  H|02/03/16|   1|   0| 14| 13|  3|  3|  9|  7| E0|
|       Norwich|   Man City|  D|12/03/16|   0|   0|  5| 15|  0|  3|  3|  8| E0|
|      Man City|    Watford|  H|14/12/16

In [13]:
# Checks: number of rows in the Delta Lake table
row_count_sql = """
    Select COUNT(*) FROM league_stats.deltalake_table
    """
spark.sql(row_count_sql).show()

+--------+
|count(1)|
+--------+
|    5330|
+--------+



In [14]:
# Show the home/away team pairs stored in the Delta Lake table
teams_sql = """
    SELECT HomeTeam, AwayTeam
    FROM league_stats.deltalake_table
    """
spark.sql(teams_sql).show()

+--------------+-----------+
|      HomeTeam|   AwayTeam|
+--------------+-----------+
|     Liverpool|    Arsenal|
|       Swansea|Aston Villa|
|     Liverpool|  Leicester|
|      West Ham|  Newcastle|
|      Man City|  Newcastle|
|    Sunderland|Southampton|
|   Aston Villa|    Everton|
|    Man United|    Watford|
|       Norwich|   Man City|
|      Man City|    Watford|
|Crystal Palace|    Chelsea|
|         Stoke|    Watford|
|     Tottenham|    Burnley|
|     West Brom|      Stoke|
|Crystal Palace|Southampton|
|     West Brom|   West Ham|
|   Bournemouth|   West Ham|
|     Liverpool|  Leicester|
|     Tottenham|    Everton|
|       Swansea|        QPR|
+--------------+-----------+
only showing top 20 rows



In [15]:
# Data quality treatment

# Load the Delta Lake table from its HDFS path into the DataFrame
delta_table_path = "hdfs://hdfs-nn:9000/demo/silver/league_stats.db/deltalake_table/"
delta_reader = spark.read.format("delta")
sales_df = delta_reader.load(delta_table_path)

In [None]:
# Identify duplicated rows in the Delta Lake data.
#
# Every row gets a Duplicate_indicator column: 1 when an identical row (same
# value in every column) occurs more than once, 0 otherwise.
#
# A window partitioned by all columns is used instead of joining the frame
# with its own groupBy result. That join compared columns with plain equality,
# which is never true for NULL, so any row containing a NULL silently
# disappeared from the output. Window partitioning puts NULLs in the same
# group, so every row is kept.
import pyspark.sql.functions as f
from pyspark.sql import Window

all_columns_window = Window.partitionBy(*sales_df.columns)
sales_df.withColumn(
    "Duplicate_indicator",
    (f.count("*").over(all_columns_window) > 1).cast("int"),
).show()

In [21]:
# Identify distinct values
# (for example, in the HomeTeam column)

distinct_home_teams = sales_df.select('HomeTeam').distinct()
distinct_home_teams.collect()

[Row(HomeTeam='Palermo'),
 Row(HomeTeam='Mainz'),
 Row(HomeTeam='Carpi'),
 Row(HomeTeam='Tottenham'),
 Row(HomeTeam='Brighton'),
 Row(HomeTeam='Schalke 04'),
 Row(HomeTeam='Nurnberg'),
 Row(HomeTeam='Sunderland'),
 Row(HomeTeam='Hull'),
 Row(HomeTeam='Bologna'),
 Row(HomeTeam='Man City'),
 Row(HomeTeam='Arsenal'),
 Row(HomeTeam='Cardiff'),
 Row(HomeTeam='Hoffenheim'),
 Row(HomeTeam='Lazio'),
 Row(HomeTeam='Darmstadt'),
 Row(HomeTeam='RB Leipzig'),
 Row(HomeTeam='Norwich'),
 Row(HomeTeam='Spal'),
 Row(HomeTeam='Werder Bremen'),
 Row(HomeTeam='Verona'),
 Row(HomeTeam='Hertha'),
 Row(HomeTeam='Crystal Palace'),
 Row(HomeTeam='Burnley'),
 Row(HomeTeam='Leicester'),
 Row(HomeTeam='Aston Villa'),
 Row(HomeTeam='Leverkusen'),
 Row(HomeTeam='FC Koln'),
 Row(HomeTeam='Freiburg'),
 Row(HomeTeam='Bournemouth'),
 Row(HomeTeam='QPR'),
 Row(HomeTeam='Ein Frankfurt'),
 Row(HomeTeam='Hamburg'),
 Row(HomeTeam='Hannover'),
 Row(HomeTeam='Genoa'),
 Row(HomeTeam='Sassuolo'),
 Row(HomeTeam='Roma'),
 Row(Ho

In [22]:
# Identify distinct values in the Div column
distinct_divisions = sales_df.select('Div').distinct()
distinct_divisions.collect()

[Row(Div='Germany'), Row(Div='Italy'), Row(Div='England')]

In [23]:
# Identify null values
# (this cell checks the HS column; the earlier comment named HST instead)

hs_is_missing = sales_df["HS"].isNull()
sales_df.filter(hs_is_missing).show()

+--------------+-----------+----+--------+----+----+----+---+---+---+---+---+-------+
|      HomeTeam|   AwayTeam| FTR|    Date|FTHG|FTAG|  HS| AS|HST|AST| HC| AC|    Div|
+--------------+-----------+----+--------+----+----+----+---+---+---+---+---+-------+
|    Sunderland|  Liverpool|Away|10/01/15|   0|   1|null|  5| 21|  1| 11|  6|England|
|Crystal Palace| Sunderland|Away|23/11/15|   0|   1|null| 19| 12|  7| 11| 11|England|
|     West Brom|Aston Villa|Draw|23/01/16|   0|   0|null|  4| 15|  0| 13|  2|England|
|       Chelsea|  Leicester|Home|15/10/16|   3|   0|null| 16|  5|  6| 12|  8|England|
|       Burnley|   Man City|Away|26/11/16|   1|   2|null| 10| 21|  5| 15|  5|England|
|   Southampton|   West Ham|Away|04/02/17|   1|   3|null| 21|  6|  7| 15|  3|England|
|     Leicester| Sunderland|Home|04/04/17|   2|   0|null| 19| 15|  8| 13|  5|England|
|     Tottenham| Man United|Home|14/05/17|   2|   1|null| 14|  8|  7| 15|  8|England|
|       Swansea|    Arsenal|Home|30/01/18|   3|   1|nu

In [24]:
# Write the DataFrame back to the Delta Lake table path in overwrite mode
(
    sales_df
    .write
    .mode("overwrite")
    .format("delta")
    .save("hdfs://hdfs-nn:9000/demo/silver/league_stats.db/deltalake_table/")
)