In [16]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import substring, avg, sum
import pyspark.sql.functions as f

# GOLD layer: aggregate the silver MLS salary table into gold tables

In [17]:
# Configure Spark: Hive metastore + Delta Lake extensions, warehouse on HDFS.
warehouse_location = 'hdfs://hdfs-nn:9000/demo/'

# A parenthesised chain is used instead of backslash continuations so that no
# dangling "\" can silently join the builder with whatever follows it.
builder = (
    SparkSession.builder
    .appName("Python Spark")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip presumably sets
    # spark.jars.packages itself — confirm this explicit pin is still needed.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original bound `spark` twice in one statement).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [18]:
# Load the silver-layer MLS salary Delta table that feeds the gold aggregates.
salary_df = (
    spark.read
    .format("delta")
    .load("hdfs://hdfs-nn:9000/demo/silver/projeto/mls_salary.db/deltalake_table/")
)

In [19]:
# Gold-layer DDL, kept as named statements so each step reads on its own.

# 1. Database for the gold layer, rooted at its own HDFS directory.
create_gold_db_sql = """
    CREATE DATABASE IF NOT EXISTS gd_mls_salary LOCATION 'hdfs://hdfs-nn:9000/demo/gold/projeto/gd_mls_salary.db/'
    """

# 2. Drop any previous definition so the CREATE below always starts clean.
drop_salary_table_sql = """
    DROP TABLE IF EXISTS gd_mls_salary.mls_salary_per_season
    """

# 3. External Delta table registered over the gold storage path.
# NOTE(review): Club/Position are declared CHAR(n) while the aggregate is
# written to this same path through the DataFrame writer — confirm that CHAR
# padding semantics do not affect lookups on these columns.
create_salary_table_sql = """
    CREATE EXTERNAL TABLE gd_mls_salary.mls_salary_per_season (
        Season INT,
        Club CHAR(100),
        Position CHAR(5),
        Total_Salary INT,
        Average_Salary INT
        
    )
    USING DELTA
    
    LOCATION 'hdfs://hdfs-nn:9000/demo/gold/projeto/gd_mls_salary.db/mls_salary_per_season/'
    """

spark.sql(create_gold_db_sql)

spark.sql(drop_salary_table_sql)

# Left as the last bare expression so the cell still echoes its result.
spark.sql(create_salary_table_sql)

DataFrame[]

In [20]:
from pyspark.sql.functions import substring, avg, sum
from pyspark.sql.functions import desc

# Keep only the columns needed for the per-season aggregation.
flat_salary_df = salary_df.select("Season", "Club", "Position", "Base_Salary")
flat_salary_df.show()

# Total and average base salary per (Season, Club, Position).
# Both aggregates are cast to INT to match the gold table's column types.
# `f.sum` / `f.avg` are used through the module alias so the Python builtin
# `sum` is not relied on in its shadowed form.
salary_per_season_df = (
    flat_salary_df
    .groupBy("Season", "Club", "Position")
    .agg(
        f.sum(flat_salary_df.Base_Salary).cast('INT').alias("Total_Salary"),
        f.avg(flat_salary_df.Base_Salary).cast('INT').alias("Average_Salary"),
    )
)

# Overwrite the Delta files backing gd_mls_salary.mls_salary_per_season.
(
    salary_per_season_df.write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/demo/gold/projeto/gd_mls_salary.db/mls_salary_per_season/")
)

salary_per_season_df.orderBy(f.desc("Season")).show()

# Read the result back through the metastore table to confirm it is visible.
# The original query ended in a bare WHERE with no predicate, which is not
# valid SQL; the clause is removed so the verification query can run.
spark.sql(
    """
    SELECT * FROM gd_mls_salary.mls_salary_per_season
    """
).show()

+------+-----+--------+-----------+
|Season| Club|Position|Base_Salary|
+------+-----+--------+-----------+
|  2018|  TOR|       F|    5600000|
|  2018|  TOR|       M|    6000000|
|  2018| LAFC|       F|    4500000|
|  2018|  CHI|       M|    6100000|
|  2018|   LA|       F|    4250000|
|  2018|NYCFC|       F|    5610000|
|  2018|  TOR|       F|    5000000|
|  2018|  MTL|       M|     500000|
|  2018|  COL|      GK|    2000000|
|  2018|  POR|       M|    2320000|
|  2018|  SEA|       M|    1800000|
|  2018|  ATL|       M|    1912500|
|  2018|  MLS|       F|    1850000|
|  2018|NYCFC|       M|    2000000|
|  2018|   LA|       M|    2000000|
|  2018|  POR|       F|    1275000|
|  2018|  CHI|       F|    1700000|
|  2018|   LA|       M|    1539996|
|  2018|  PHI|       M|    1714286|
|  2018|  COL|       F|    1575000|
+------+-----+--------+-----------+
only showing top 20 rows

+------+-----+--------+------------+--------------+
|Season| Club|Position|Total_Salary|Average_Salary|
+-----

In [21]:
# Generate the symlink-format manifest for the gold Delta table path; the
# manifest directory is what the *_presto external table below points at.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/demo/gold/projeto/gd_mls_salary.db/mls_salary_per_season/`
"""
spark.sql(generate_manifest_sql).show()

++
||
++
++



In [22]:
# Hive external table over the symlink manifest of the gold Delta table.

# Remove any stale definition first.
drop_presto_table_sql = """
DROP TABLE IF EXISTS gd_mls_salary.mls_salary_per_season_presto
"""

# Same column list as the Delta table; Parquet SerDe + SymlinkTextInputFormat
# with the location set to the _symlink_format_manifest directory.
create_presto_table_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS gd_mls_salary.mls_salary_per_season_presto (
        Season Int,
        Club CHAR(100),
        Position CHAR(5),
        Total_Salary INT,
        Average_Salary INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/demo/gold/projeto/gd_mls_salary.db/mls_salary_per_season/_symlink_format_manifest/'
"""

spark.sql(drop_presto_table_sql).show()

spark.sql(create_presto_table_sql).show()



++
||
++
++

++
||
++
++

