In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
import pyspark.sql.functions as f

In [2]:
# Configure Spark.
# The warehouse lives under /demo because this notebook works with both the
# silver and the gold layers stored below it.
warehouse_location = 'hdfs://hdfs-nn:9000/demo/'

# Build the session inside parentheses so the chain does not rely on trailing
# backslashes (a dangling one after the last call would silently swallow
# whatever line follows it).
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Enable Delta Lake SQL commands and the Delta-aware session catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip appears to set
    # spark.jars.packages itself from the installed delta-spark version --
    # confirm whether this explicit pin is still needed.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Load the silver-layer transfers Delta table; it is the input for the gold layer.
silver_transfers_path = "hdfs://hdfs-nn:9000/demo/silver/projeto/transfers.db/deltalake_table/"
transfers_df = spark.read.load(silver_transfers_path, format="delta")

In [5]:
# Gold-layer DDL, run in order:
#   1. make sure the gold_transfers database exists at its HDFS location,
#   2. drop any previous definition of transfers_per_season,
#   3. recreate it as an external Delta table at a fixed path.
create_database_sql = """
    CREATE DATABASE IF NOT EXISTS gold_transfers LOCATION 'hdfs://hdfs-nn:9000/demo/gold/projeto/gold_transfers.db/'
    """

drop_table_sql = """
    DROP TABLE IF EXISTS gold_transfers.transfers_per_season
    """

create_table_sql = """
    CREATE EXTERNAL TABLE gold_transfers.transfers_per_season (
        Season CHAR(10),
        Position CHAR(100),
        Team_from CHAR(100),
        Team_to CHAR(100),
        League_from CHAR(100),
        League_to CHAR(100),
        Average_Transfer_fee INT
    
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/demo/gold/projeto/gold_transfers.db/transfers_per_season/'
    """

for ddl in (create_database_sql, drop_table_sql):
    spark.sql(ddl)

# Kept as the cell's final expression so the notebook still echoes its result.
spark.sql(create_table_sql)

DataFrame[]

In [6]:
# Build the gold "transfers per season" aggregate and persist it as Delta.
#
# Spark SQL functions are reached through the `f` alias imported at the top of
# the notebook. Importing `sum` (and friends) by name here would shadow
# Python's builtin `sum` for every later cell, and `substring`/`sum` were not
# used anyway.

# Project the silver columns of interest; the preview shows the per-transfer
# rows (including Name and Age) before they are aggregated away.
flat_transfers_df = transfers_df.select(
    "Season", "Name", "Position", "Age",
    "Team_from", "League_from", "Team_to", "League_to", "Transfer_fee",
)
flat_transfers_df.show()

# Average fee per (season, position, origin/destination team and league).
# The cast to INT matches the Average_Transfer_fee INT column declared for
# gold_transfers.transfers_per_season.
tranfers_per_season_df = (
    flat_transfers_df
    .groupBy("Season", "Position", "Team_from", "Team_to", "League_from", "League_to")
    .agg(
        f.avg(flat_transfers_df.Transfer_fee).cast('INT').alias("Average_Transfer_fee"),
    )
)

# Overwrite the Delta files at the location the gold table points to.
(
    tranfers_per_season_df
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/demo/gold/projeto/gold_transfers.db/transfers_per_season/")
)

tranfers_per_season_df.orderBy(f.desc("Season")).show()


+---------+--------------------+------------------+---+---------------+--------------+--------------+--------------+------------+
|   Season|                Name|          Position|Age|      Team_from|   League_from|       Team_to|     League_to|Transfer_fee|
+---------+--------------------+------------------+---+---------------+--------------+--------------+--------------+------------+
|2005-2006|      Michael Essien|Defensive Midfield| 22| Olympique Lyon|       Ligue 1|       Chelsea|Premier League|    45000000|
|2005-2006|Shaun Wright-Phil...|      Right Winger| 23|       Man City|Premier League|       Chelsea|Premier League|     9500000|
|2005-2006|        Sergio Ramos|       Centre-Back| 19|     Sevilla FC|        LaLiga|   Real Madrid|        LaLiga|    27000000|
|2005-2006|        Michael Owen|    Centre-Forward| 25|    Real Madrid|        LaLiga|     Newcastle|Premier League|    22000000|
|2005-2006|   Alberto Gilardino|    Centre-Forward| 23|          Parma|       Serie A|    

In [7]:
# Ask Delta to generate the symlink-format manifest for the gold table path;
# the manifest directory is what the Presto-facing table in the next cell reads.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta. `hdfs://hdfs-nn:9000/demo/gold/projeto/gold_transfers.db/transfers_per_season/`
"""
spark.sql(generate_manifest_sql).show()

++
||
++
++



In [8]:
# Recreate the Presto-facing external table. It shares the column list of
# gold_transfers.transfers_per_season but is defined over the
# _symlink_format_manifest directory using the Parquet SerDe and the
# symlink text input format.
drop_presto_table_sql = """
DROP TABLE IF EXISTS gold_transfers.transfers_per_season_presto
"""

create_presto_table_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS gold_transfers.transfers_per_season_presto (
        Season CHAR(10),
        Position CHAR(100),
        Team_from CHAR(100),
        Team_to CHAR(100),
        League_from CHAR(100),
        League_to CHAR(100),
        Average_Transfer_fee INT
        )
        
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/demo/gold/projeto/gold_transfers.db/transfers_per_season/_symlink_format_manifest/'
"""

spark.sql(drop_presto_table_sql).show()

spark.sql(create_presto_table_sql).show()

++
||
++
++

++
||
++
++



In [None]:
# Stop the Spark session now that the notebook's work is finished.
spark.stop()