In [1]:
from os import PathLike

from delta import *
from hdfs import InsecureClient
import pyspark.sql.functions as f
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

In [2]:
# Configure Spark.
# Warehouse directory passed to spark.sql.warehouse.dir below.
warehouse_location = 'hdfs://hdfs-nn:9000/demo/silver/projeto'

# Parenthesised chain instead of backslash continuations: the original ended
# with a dangling "\" after enableHiveSupport() that only parsed because a
# blank line happened to follow it.
builder = (
    SparkSession.builder
    .appName("Python Spark")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip presumably sets
    # spark.jars.packages itself; confirm whether this entry is still needed.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original read "spark = spark = ...").
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Drop the mls_salary database if it exists; CASCADE also drops its contents.
drop_database_sql = """
DROP DATABASE IF EXISTS mls_salary CASCADE
"""
spark.sql(drop_database_sql).show()

++
||
++
++



In [4]:
# List the databases known to the catalog (run here right after the drop).
show_databases_sql = """
    SHOW DATABASES
    """
spark.sql(show_databases_sql).show()

+-----------------+
|        namespace|
+-----------------+
|          default|
|             demo|
|     g_mls_salary|
|  gold_mls_salary|
|  gold_sales_demo|
|silver_sales_demo|
|        transfers|
+-----------------+



In [5]:
# Create the mls_salary database at an explicit HDFS location.
create_database_sql = """
    CREATE DATABASE mls_salary LOCATION 'hdfs://hdfs-nn:9000/demo/silver/projeto/mls_salary.db/'
    """
spark.sql(create_database_sql)

DataFrame[]

In [6]:
# List the databases again (run here right after creating mls_salary).
show_databases_sql = """
    SHOW DATABASES
    """
spark.sql(show_databases_sql).show()

+-----------------+
|        namespace|
+-----------------+
|          default|
|             demo|
|     g_mls_salary|
|  gold_mls_salary|
|  gold_sales_demo|
|       mls_salary|
|silver_sales_demo|
|        transfers|
+-----------------+



In [7]:
# Drop the Delta table definition if it already exists.
drop_table_sql = """
    DROP TABLE IF EXISTS mls_salary.deltalake_table
    """
spark.sql(drop_table_sql)

DataFrame[]

In [8]:
# Create the external Delta table: three CHAR(100) columns, partitioned by
# an INT Season column, stored under the mls_salary.db directory on HDFS.
create_table_sql = """
    CREATE EXTERNAL TABLE mls_salary.deltalake_table (
        Club CHAR(100),
        Position CHAR(100),
        Base_Salary CHAR(100)
    )
    USING DELTA
    PARTITIONED BY (
         Season INT
    )
    LOCATION 'hdfs://hdfs-nn:9000/demo/silver/projeto/mls_salary.db/deltalake_table/'
    """
spark.sql(create_table_sql)

DataFrame[]

In [9]:
# Display the contents of the Delta table.
select_all_sql = """
    SELECT *
    FROM mls_salary.deltalake_table
    """
spark.sql(select_all_sql).show()

+----+--------+-----------+------+
|Club|Position|Base_Salary|Season|
+----+--------+-----------+------+
+----+--------+-----------+------+



In [10]:
# Read the CSV file from HDFS into a DataFrame.
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/projeto/MLS_Salary.csv"

# Explicit schema: Season is an integer, every other column is a string.
# All fields are nullable.
string_column_names = [
    "Club(grouped)",
    "Club",
    "First Name",
    "Last Name",
    "Position",
    "Total Compensation",
    "Base_Salary",
]
customSchema = StructType(
    [StructField("Season", IntegerType(), True)]
    + [StructField(column_name, StringType(), True) for column_name in string_column_names]
)

# The file is ';'-delimited and has a header row.
csv_reader = spark.read.options(delimiter=";", header="true").schema(customSchema)
salary_df = csv_reader.csv(hdfs_path)

salary_df.show()
salary_df.printSchema()

+------+-------------+-----+----------+--------------+--------+------------------+-----------+
|Season|Club(grouped)| Club|First Name|     Last Name|Position|Total Compensation|Base_Salary|
+------+-------------+-----+----------+--------------+--------+------------------+-----------+
|  2018|          TFC|  TOR| Sebastian|      Giovinco|       F|       $7,115,556 |$5,600,000 |
|  2018|          TFC|  TOR|   Michael|       Bradley|       M|       $6,500,000 |$6,000,000 |
|  2018|         LAFC| LAFC|    Carlos|          Vela|       F|       $6,292,500 |$4,500,000 |
|  2018|          CHI|  CHI|   Bastian|Schweinsteiger|       M|       $6,100,000 |$6,100,000 |
|  2018|          LAG|   LA|   Giovani|    dos Santos|       F|       $6,000,000 |$4,250,000 |
|  2018|          NYC|NYCFC|     David|         Villa|       F|       $5,610,000 |$5,610,000 |
|  2018|          TFC|  TOR|      Jozy|      Altidore|       F|       $5,000,000 |$5,000,000 |
|  2018|          MTL|  MTL|   Ignacio|        Pia

In [11]:
# Check for fully duplicated rows in the DataFrame read from the CSV.
# Duplicate_indicator is 1 when at least one other row has identical values
# in every column, and 0 otherwise.
#
# A window count is used instead of groupBy + inner join on all columns:
# an equi-join never matches NULL = NULL, so rows with a NULL in any column
# would be silently dropped from the report. Window partitioning groups
# NULLs together, so every input row is kept.
all_columns = [salary_df[column_name] for column_name in salary_df.columns]
identical_row_count = f.count(f.lit(1)).over(Window.partitionBy(*all_columns))

salary_df.withColumn(
    "Duplicate_indicator",
    (identical_row_count > 1).cast("int"),
).show()

+------+-------------+-----+----------+---------+--------+------------------+-----------+-------------------+
|Season|Club(grouped)| Club|First Name|Last Name|Position|Total Compensation|Base_Salary|Duplicate_indicator|
+------+-------------+-----+----------+---------+--------+------------------+-----------+-------------------+
|  2018|          HOU|  HOU|    Romell|   Quioto|       F|         $252,500 |  $240,000 |                  0|
|  2017|          RSL|  RSL|      Luis|    Silva|     M-F|         $208,671 |  $200,004 |                  0|
|  2017|          NYC|NYCFC|     Khiry|  Shelton|       F|         $110,450 |   $92,950 |                  0|
|  2017|          PHI|  PHI|   Giliano|Wijnaldum|       D|          $78,337 |   $65,004 |                  0|
|  2017|          FCD|  DAL|     Bryan| Reynolds|       F|          $55,000 |   $53,000 |                  0|
|  2017|          MTL|  MTL|   Michael|  Salazar|       F|          $54,075 |   $54,075 |                  0|
|  2016|  

In [12]:
# Write the selected columns to the Delta table location, replacing any
# existing data and partitioning the files by Season.
delta_table_path = "hdfs://hdfs-nn:9000/demo/silver/projeto/mls_salary.db/deltalake_table/"
selected_columns_df = salary_df.select("Club", "Position", "Base_Salary", "Season")
(
    selected_columns_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Season")
    .save(delta_table_path)
)

In [13]:
# Sanity check: count the rows now in the Delta table.
count_rows_sql = """
    Select COUNT(*) FROM mls_salary.deltalake_table
    """
spark.sql(count_rows_sql).show()

+--------+
|count(1)|
+--------+
|    6219|
+--------+



In [14]:
# Sanity check: preview the Season and Club columns of the Delta table.
season_club_sql = """
    SELECT Season, Club
    FROM mls_salary.deltalake_table
    """
spark.sql(season_club_sql).show()

+------+-----+
|Season| Club|
+------+-----+
|  2017|  ORL|
|  2017|  TOR|
|  2017|  TOR|
|  2017|NYCFC|
|  2017|NYCFC|
|  2017|   LA|
|  2017|  CHI|
|  2017|  TOR|
|  2017|  SEA|
|  2017|  POR|
|  2017|  COL|
|  2017|  ATL|
|  2017|   LA|
|  2017|NYCFC|
|  2017|   LA|
|  2017|  RSL|
|  2017|  CHI|
|  2017|  VAN|
|  2017|  SEA|
|  2017|  POR|
+------+-----+
only showing top 20 rows



In [15]:
# Clean the Base_Salary column, step 1: remove '$' characters.
strip_dollar_sql = """    
    UPDATE mls_salary.deltalake_table
    SET Base_Salary = REPLACE(Base_Salary, '$', '')
    """
spark.sql(strip_dollar_sql).show()

++
||
++
++



In [16]:
# Clean the Base_Salary column: remove ',' characters.
strip_comma_sql = """    
    UPDATE mls_salary.deltalake_table
    SET Base_Salary = REPLACE(Base_Salary, ',', '')
    """
spark.sql(strip_comma_sql).show()

++
||
++
++



In [17]:
# Clean the Base_Salary column: remove space characters.
strip_space_sql = """    
    UPDATE mls_salary.deltalake_table
    SET Base_Salary = REPLACE(Base_Salary, ' ', '')
    """
spark.sql(strip_space_sql).show()

++
||
++
++



In [18]:
# Rebind salary_df to the contents of the Delta table on HDFS; from here on
# the name no longer refers to the DataFrame read from the CSV.
delta_table_path = "hdfs://hdfs-nn:9000/demo/silver/projeto/mls_salary.db/deltalake_table/"
delta_reader = spark.read.format("delta")
salary_df = delta_reader.load(delta_table_path)

In [19]:
# Check for fully duplicated rows in the DataFrame loaded from the Delta table.
# Duplicate_indicator is 1 when at least one other row has identical values
# in every column, and 0 otherwise.
#
# A window count is used instead of groupBy + inner join on all columns:
# an equi-join never matches NULL = NULL, so rows with a NULL in any column
# (e.g. a NULL Position) would be silently dropped from the report. Window
# partitioning groups NULLs together, so every input row is kept.
all_columns = [salary_df[column_name] for column_name in salary_df.columns]
identical_row_count = f.count(f.lit(1)).over(Window.partitionBy(*all_columns))

salary_df.withColumn(
    "Duplicate_indicator",
    (identical_row_count > 1).cast("int"),
).show()

+-----+--------+-----------+------+-------------------+
| Club|Position|Base_Salary|Season|Duplicate_indicator|
+-----+--------+-----------+------+-------------------+
|   SJ|       M|     200000|  2018|                  0|
|MNUFC|       F|     135000|  2018|                  0|
|   SJ|       M|     107993|  2018|                  0|
|   KC|       F|      80004|  2018|                  0|
|  CHI|       F|     500000|  2017|                  0|
|  MTL|       M|     450000|  2017|                  0|
|  PHI|       F|     300000|  2017|                  0|
|  DAL|       D|     205000|  2017|                  0|
|  HOU|       D|     110004|  2017|                  0|
|  TOR|       M|      65004|  2017|                  1|
|  TOR|       M|      65004|  2017|                  1|
|   NE|      GK|      54075|  2017|                  0|
|NYCFC|       F|     207276|  2016|                  0|
|  HOU|       F|      85000|  2016|                  0|
|  VAN|       D|     196900|  2013|             

In [20]:
# Inspect the distinct values of a column: Club.
distinct_clubs = salary_df.select('Club').distinct()
distinct_clubs.collect()

[Row(Club='LA'),
 Row(Club='DC'),
 Row(Club='SJ'),
 Row(Club='VAN'),
 Row(Club='MTL'),
 Row(Club='TFC'),
 Row(Club='CLB'),
 Row(Club='NE'),
 Row(Club='TOR'),
 Row(Club='CHV'),
 Row(Club='POR'),
 Row(Club='KC'),
 Row(Club='SEA'),
 Row(Club='NYRB'),
 Row(Club='DAL'),
 Row(Club='NY'),
 Row(Club='COL'),
 Row(Club='ATL'),
 Row(Club='PHI'),
 Row(Club='LAFC'),
 Row(Club='MLS'),
 Row(Club='RSL'),
 Row(Club='HOU'),
 Row(Club='MNUFC'),
 Row(Club='NYCFC'),
 Row(Club='CHI'),
 Row(Club='ORL')]

In [21]:
# Inspect the distinct values of the Position column.
distinct_positions = salary_df.select('Position').distinct()
distinct_positions.collect()

[Row(Position='F-D'),
 Row(Position='M-D'),
 Row(Position='F'),
 Row(Position='D-F'),
 Row(Position='M/D'),
 Row(Position=None),
 Row(Position='F-M'),
 Row(Position='M/F'),
 Row(Position='M-F'),
 Row(Position='M'),
 Row(Position='GK'),
 Row(Position='D'),
 Row(Position='D/M'),
 Row(Position='D-M'),
 Row(Position='D/F'),
 Row(Position='MF'),
 Row(Position='F/M')]

In [22]:
# Inspect the distinct values of the Base_Salary column.
distinct_base_salaries = salary_df.select('Base_Salary').distinct()
distinct_base_salaries.collect()

[Row(Base_Salary='825000'),
 Row(Base_Salary='175000'),
 Row(Base_Salary='155004'),
 Row(Base_Salary='615000'),
 Row(Base_Salary='761250'),
 Row(Base_Salary='66000'),
 Row(Base_Salary='57500'),
 Row(Base_Salary='108576'),
 Row(Base_Salary='175008'),
 Row(Base_Salary='189667'),
 Row(Base_Salary='78529'),
 Row(Base_Salary='76320'),
 Row(Base_Salary='44500'),
 Row(Base_Salary='495000'),
 Row(Base_Salary='53000'),
 Row(Base_Salary='68355'),
 Row(Base_Salary='240000'),
 Row(Base_Salary='150000'),
 Row(Base_Salary='142000'),
 Row(Base_Salary='119620'),
 Row(Base_Salary='31260'),
 Row(Base_Salary='242550'),
 Row(Base_Salary='1650000'),
 Row(Base_Salary='440000'),
 Row(Base_Salary='68254'),
 Row(Base_Salary='256500'),
 Row(Base_Salary='222500'),
 Row(Base_Salary='232750'),
 Row(Base_Salary='137812'),
 Row(Base_Salary='111711'),
 Row(Base_Salary='44825'),
 Row(Base_Salary='195300'),
 Row(Base_Salary='92500'),
 Row(Base_Salary='132624'),
 Row(Base_Salary='87120'),
 Row(Base_Salary='81000'),
 Row

In [23]:
# Inspect the distinct values of the Season column.
distinct_seasons = salary_df.select('Season').distinct()
distinct_seasons.collect()

[Row(Season=2007),
 Row(Season=2018),
 Row(Season=2015),
 Row(Season=2013),
 Row(Season=2014),
 Row(Season=2012),
 Row(Season=2009),
 Row(Season=2016),
 Row(Season=2010),
 Row(Season=2011),
 Row(Season=2008),
 Row(Season=2017)]

In [24]:
# Show the rows whose Club is NULL.
club_is_missing = salary_df["Club"].isNull()
salary_df.where(club_is_missing).show()

+----+--------+-----------+------+
|Club|Position|Base_Salary|Season|
+----+--------+-----------+------+
+----+--------+-----------+------+



In [25]:
# Show the rows whose Position is NULL.
position_is_missing = salary_df["Position"].isNull()
salary_df.where(position_is_missing).show()

+----+--------+-----------+------+
|Club|Position|Base_Salary|Season|
+----+--------+-----------+------+
|  SJ|    null|      50000|  2015|
+----+--------+-----------+------+



In [26]:
# Same NULL-Position check, this time through SQL against the catalog table.
null_position_sql = """
    SElect *  FROM mls_salary.deltalake_table WHERE Position is null
    """
spark.sql(null_position_sql).show()

+----+--------+-----------+------+
|Club|Position|Base_Salary|Season|
+----+--------+-----------+------+
|  SJ|    null|      50000|  2015|
+----+--------+-----------+------+



In [27]:
# Show the rows whose Base_Salary is NULL.
base_salary_is_missing = salary_df["Base_Salary"].isNull()
salary_df.where(base_salary_is_missing).show()

+----+--------+-----------+------+
|Club|Position|Base_Salary|Season|
+----+--------+-----------+------+
+----+--------+-----------+------+



In [28]:
# Show the rows whose Season is NULL.
season_is_missing = salary_df["Season"].isNull()
salary_df.where(season_is_missing).show()

+----+--------+-----------+------+
|Club|Position|Base_Salary|Season|
+----+--------+-----------+------+
+----+--------+-----------+------+



In [29]:
# Write the four columns back to the Delta table location, replacing the
# existing data and partitioning by Season.
# NOTE(review): salary_df was loaded from this same path and no visible cell
# transforms it afterwards, so this appears to rewrite the same rows
# (duplicates and the NULL Position included) - confirm this is intended.
delta_table_path = "hdfs://hdfs-nn:9000/demo/silver/projeto/mls_salary.db/deltalake_table/"
selected_columns_df = salary_df.select("Club", "Position", "Base_Salary", "Season")
(
    selected_columns_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Season")
    .save(delta_table_path)
)

In [30]:
# Stop the Spark session created at the top of the notebook.
spark.stop()