In [None]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import *
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

In [None]:
# Default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse/project'

# Session builder: Hive metastore as the catalog backend, plus the Delta Lake
# SQL extension and catalog so Delta tables can be read and written through SQL.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip is expected to set
    # spark.jars.packages itself from the installed delta package version, which
    # would make this explicit pin redundant - confirm which value wins.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original bound `spark` twice on the same line).
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# List the tables registered in the silver_salaries database.
silver_tables = spark.sql("""
    SHOW TABLES FROM silver_salaries
    """)
silver_tables.show()

In [None]:
# Preview the current contents of the target Delta table.
silver_preview = spark.sql("""
    SELECT * FROM silver_salaries.deltalake_table
    """)
silver_preview.show()

In [None]:
# Location of the raw (bronze) CSV file on HDFS.
hdfs_path = "hdfs://hdfs-nn:9000/project/bronze/SoccerSalaries.csv"

# Columns in file order. The ones listed in string_columns are read as strings,
# every other column as an integer; all fields are nullable.
csv_columns = [
    "Player", "Weekly_Salary", "Position", "Age", "Status", "Length",
    "18_19_Salary", "19_20_Salary", "20_21_Salary", "21_22_Salary",
    "22_23_Salary", "23_24_Salary", "24_25_Salary",
    "Club", "League",
]
string_columns = {"Player", "Position", "Status", "Club", "League"}

customSchema = StructType([
    StructField(name, StringType() if name in string_columns else IntegerType(), True)
    for name in csv_columns
])

# Read the comma-delimited file with a header row, applying the explicit schema.
csv_reader = spark.read.options(delimiter=",", header="true").schema(customSchema)
salaries_df = csv_reader.csv(hdfs_path)

salaries_df.show()
salaries_df.printSchema()

In [None]:
# Split the Player string on single spaces once and reuse the expression for
# every derived column (it was previously defined but never used, and the same
# split was rebuilt three times).
split_col = split(col("Player"), " ")

# NOTE(review): only the first three tokens are kept, so a Player value with
# more than three space-separated tokens loses the remainder.
salaries_df = (
    salaries_df
    .withColumn("First_Name", split_col.getItem(0))
    .withColumn("Middle_Name", split_col.getItem(1))
    .withColumn("Last_Name", split_col.getItem(2))
)

salaries_df.select("Player", "First_Name", "Middle_Name", "Last_Name").show()

In [None]:
# A name token whose first five characters are '(http' is replaced by NULL in
# Middle_Name and then in Last_Name; any other value is kept as is.
for name_part in ('Middle_Name', 'Last_Name'):
    starts_with_url = col(name_part).substr(1, 5) == '(http'
    salaries_df = salaries_df.withColumn(
        name_part,
        when(starts_with_url, None).otherwise(col(name_part)),
    )

salaries_df.select("Player", "First_Name", "Middle_Name", "Last_Name").show()

In [None]:
# Rebuild the display name from the name parts that are present.
# concat_ws skips NULL arguments, so one expression covers "First",
# "First Middle" and "First Middle Last". It also covers a row where
# Middle_Name is NULL but Last_Name is not: plain concat() returns NULL as soon
# as any argument is NULL, which would blank the whole name for such a row.
# The isNotNull guard keeps Player_Name NULL when there is no first name at all
# (concat_ws alone would produce an empty string instead).
salaries_df = salaries_df.withColumn(
    'Player_Name',
    when(
        col('First_Name').isNotNull(),
        concat_ws(' ', col('First_Name'), col('Middle_Name'), col('Last_Name')),
    ),
)

In [None]:
# Overwrite Player with the rebuilt Player_Name value.
rebuilt_player = salaries_df['Player_Name']
salaries_df = salaries_df.withColumn('Player', rebuilt_player)

In [None]:
# Remove the intermediate name columns now that Player holds the final value.
name_helper_columns = ['First_Name', 'Middle_Name', 'Last_Name', 'Player_Name']
salaries_df = salaries_df.drop(*name_helper_columns)

salaries_df.show()

In [None]:
# Position abbreviation -> full label. A value that is not a key here is left
# unchanged by the final otherwise().
position_labels = {
    'AM': 'Attacking Midfielder',
    'CF': 'Center Forward',
    'CM': 'Center Midfielder',
    'LW': 'Left Wing',
    'GK': 'Goalkeeper',
    'RW': 'Right Wing',
    'DM': 'Defensive Midfielder',
    'CB': 'Center Back',
    'SS': 'Second Striker',
    'LB': 'Left Back',
    'RB': 'Right Back',
    'LM': 'Left Midfielder',
    'RM': 'Right Midfielder',
}

# Build the when(...).when(...) chain in dictionary order.
position_expr = None
for abbreviation, label in position_labels.items():
    is_match = col('Position') == abbreviation
    if position_expr is None:
        position_expr = when(is_match, label)
    else:
        position_expr = position_expr.when(is_match, label)

salaries_df = salaries_df.withColumn('Position', position_expr.otherwise(col('Position')))
salaries_df.select("Position").show()

In [None]:
# Expand club abbreviations to full names; a club that is not listed keeps its
# original value.
club = col('Club')
club_full_name = (
    when(club == 'SPAL', 'Societa Polisportiva Ars et Labor')
    # PSG is Paris Saint-Germain, a different club from SPAL, so it must not
    # share SPAL's full name.
    .when(club == 'PSG', 'Paris Saint-Germain')
    .otherwise(club)
)

salaries_df = salaries_df.withColumn('Club', club_full_name)
salaries_df.select("Club").show()

In [None]:
# Expand league abbreviations; a league that is not listed keeps its original value.
# NOTE(review): 'Serie A' is renamed to the Brazilian league, while the Club
# cell handles SPAL, an Italian club - confirm which Serie A the data refers to.
league = col('League')
league_full_name = (
    when(league == 'EPL', 'Premier League')
    .when(league == 'Serie A', 'Brasileirao Serie A')
    .otherwise(league)
)

salaries_df = salaries_df.withColumn('League', league_full_name)
salaries_df.select("League").show()

In [None]:
# Final column order: identical to the input except that Length moves to the end.
output_columns = [
    'Player', 'Weekly_Salary', 'Position', 'Age', 'Status',
    '18_19_Salary', '19_20_Salary', '20_21_Salary', '21_22_Salary',
    '22_23_Salary', '23_24_Salary', '24_25_Salary',
    'Club', 'League', 'Length',
]
salaries_df = salaries_df.select(*output_columns)
salaries_df.show()

In [None]:
# Register salaries_df as the temporary view "salaries" so that SQL statements
# run through spark.sql can query it by name.
salaries_df.createOrReplaceTempView("salaries")

In [None]:
# Append every row of the "salaries" view to the silver Delta table.
# SELECT * supplies the columns in the view's own order.
# NOTE(review): INSERT INTO appends, so running this cell again inserts the
# same rows a second time.
insert_statement = (
    "INSERT INTO silver_salaries.deltalake_table \n"
    "    SELECT * FROM salaries"
)
spark.sql(insert_statement)

In [None]:
# Row count of the silver table.
silver_row_count = spark.sql("""
    SELECT COUNT(*) FROM silver_salaries.deltalake_table
    """)
silver_row_count.show()

In [None]:
def show_distinct_values(column_name, table="silver_salaries.deltalake_table"):
    """Display the distinct values of one column of a table.

    Args:
        column_name: Name of the column whose distinct values are shown.
        table: Qualified name of the table to query. Defaults to the silver
            salaries Delta table.
    """
    # Backticks delimit the identifier, so a column name that starts with a
    # digit (e.g. 18_19_Salary) is always parsed as a column reference.
    spark.sql(f"SELECT DISTINCT `{column_name}` FROM {table}").show()


# Columns to profile, in display order. Length is not included.
profiled_columns = [
    "Player", "Weekly_Salary", "Position", "Age", "Status",
    "18_19_Salary", "19_20_Salary", "20_21_Salary", "21_22_Salary",
    "22_23_Salary", "23_24_Salary", "24_25_Salary",
    "Club", "League",
]

for column_name in profiled_columns:
    show_distinct_values(column_name)

In [None]:
# Stop the Spark session that this notebook created.
spark.stop()