# Pipeline: Goalies Shootouts

In [64]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lit, avg, sum, round, expr, concat, min, max, dense_rank

# Describe the Spark application: where the master runs, the app name and the resources to request.
# Every SparkConf setter returns the conf itself, so the calls can be chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("GoaliesApp")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; reuse a running one if it exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Tell Hadoop which file system classes handle the gs:// scheme
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoopKey, hadoopClass in [
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
]:
    conf.set(hadoopKey, hadoopClass)

## Load the datasets

### Prepare Goalies Shootout Table

In [65]:
# Load the goalies shootout table (CSV with a header row; every column is read as a string)
goaliesShootout = spark.read.format("csv").option("header", "true").load("gs://data_de2023_qjsol/GoaliesShootout.csv") #  use your gcp bucket name. 

# Only keep columns of interest and drop every row that has a null in any of them.
# Spark DataFrames are immutable: na.drop() returns a NEW DataFrame, so its result
# has to be assigned back, otherwise the rows with nulls are silently kept.
goaliesShootout = goaliesShootout.select("playerID", "tmID", "SA", "GA", "year")
goaliesShootout = goaliesShootout.na.drop("any")

# Show schema
goaliesShootout.printSchema()

# Convert shots against (SA), goals against (GA) and year from string to integer type
goaliesShootout = goaliesShootout.withColumn("SA", col("SA").cast("int")).withColumn("GA", col("GA").cast("int")).withColumn("year", col("year").cast("int"))

# Show new schema and top 5 rows
goaliesShootout.printSchema()
goaliesShootout.show(5)

root
 |-- playerID: string (nullable = true)
 |-- tmID: string (nullable = true)
 |-- SA: string (nullable = true)
 |-- GA: string (nullable = true)
 |-- year: string (nullable = true)

root
 |-- playerID: string (nullable = true)
 |-- tmID: string (nullable = true)
 |-- SA: integer (nullable = true)
 |-- GA: integer (nullable = true)
 |-- year: integer (nullable = true)

+---------+----+---+---+----+
| playerID|tmID| SA| GA|year|
+---------+----+---+---+----+
|aebisda01| COL| 10|  2|2005|
|aebisda01| MTL| 18|  6|2006|
|andercr01| CHI|  7|  5|2005|
|andercr01| FLO|  2|  0|2006|
|andercr01| FLO| 11|  7|2008|
+---------+----+---+---+----+
only showing top 5 rows



### Prepare Master Table

In [66]:
# Load the master table (CSV with a header row; every column is read as a string)
master = spark.read.format("csv").option("header", "true").load("gs://data_de2023_qjsol/Master.csv") #  use your gcp bucket name. 

# Only keep columns of interest and drop every row that has a null in any of them.
# Spark DataFrames are immutable: na.drop() returns a NEW DataFrame, so its result
# has to be assigned back, otherwise the rows with nulls are silently kept.
master = master.select("playerID", "firstName", "lastName", "birthYear")
master = master.na.drop("any")

# Show schema
master.printSchema()

# Convert birth year from string to integer type
master = master.withColumn("birthYear", col("birthYear").cast("int"))

# Show new schema and top 5 rows
master.printSchema()
master.show(5)

root
 |-- playerID: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- birthYear: string (nullable = true)

root
 |-- playerID: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- birthYear: integer (nullable = true)

+---------+---------+----------+---------+
| playerID|firstName|  lastName|birthYear|
+---------+---------+----------+---------+
|aaltoan01|    Antti|     Aalto|     1975|
|abbeybr01|    Bruce|     Abbey|     1951|
|abbotge01|   George|    Abbott|     1911|
|abbotre01|      Reg|    Abbott|     1930|
|abdelju01|   Justin|Abdelkader|     1987|
+---------+---------+----------+---------+
only showing top 5 rows



In [70]:
# Sum shots against (SA) and goals against (GA) of each goalie in each team, and record the
# first and last year the goalie appears for that team.
# Note: sum, min and max are the pyspark.sql.functions aggregates imported at the top, not the Python builtins.
sumgoaliesShootout = goaliesShootout.groupBy("playerID", "tmID").agg(sum("SA").cast("int").alias("totalSA"),
                                                                     sum("GA").cast("int").alias("totalGA"),
                                                                     min("year").alias("startYear"),
                                                                     max("year").alias("endYear"))

# Compute the performance of each goalie in each team: the percentage of shots against
# that did NOT result in a goal, i.e. (1 - totalGA / totalSA) * 100. Higher is better.
performance = sumgoaliesShootout.withColumn("performance", (lit(1) - col("totalGA") / col("totalSA")) * 100)
performance.printSchema()

# Define window: one window per team, sorted on performance (best first) and then on the total
# shots against (totalSA, most first) as tie-breaker.
# The tie-breaker must be totalSA and not totalGA: goalies with a 100% performance all have
# totalGA = 0, so totalGA cannot tell apart a goalie who faced 1 shot from one who faced 50.
# For equal performance below 100%, a higher totalSA implies a higher totalGA, so the order is unchanged there.
windowSpec = Window.partitionBy("tmID").orderBy(col("performance").desc(), col("totalSA").desc())

# Apply dense rank on the windows (rows that tie on both sort keys share a rank)
rankedGoalies = performance.withColumn("denseRank", dense_rank().over(windowSpec))

# Keep only the top three ranks in each team
topThreeGoalies = rankedGoalies.filter(col("denseRank") <= 3).drop("denseRank")
topThreeGoalies.show(5)

# Join with Master table; a left join keeps goalies that have no matching row in master
result = topThreeGoalies.join(master, ['playerID'], "left")
result.show(5)

# Column with full name instead of separate firstName and lastName
result = result.withColumn("fullName", concat(col("firstName"), lit(" "), col("lastName"))).drop("firstName").drop("lastName")

# Calculate the age of each goalie in the first and last year of the aggregated period
result = result.withColumn("startAge", expr("startYear - birthYear")).withColumn("endAge", expr("endYear - birthYear")).drop("birthYear")

# Column with age range: a single value when start and end are equal, otherwise "start-end"
result = result.withColumn("age", expr("concat(startAge, case when startAge = endAge then '' else concat('-', endAge) end)")).drop("startAge").drop("endAge")

# Column with range of playing years, formatted the same way as the age range
result = result.withColumn("playingYears", 
                           expr("concat(startYear, case when startYear = endYear then '' else concat('-', endYear) end)")).drop("startYear").drop("endYear")
result.printSchema()
result.show(5)

# TODO: join with Team table to get team names
# TODO: make smaller steps in notebook
# TODO: remove unused imports

root
 |-- playerID: string (nullable = true)
 |-- tmID: string (nullable = true)
 |-- totalSA: integer (nullable = true)
 |-- totalGA: integer (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- endYear: integer (nullable = true)
 |-- performance: double (nullable = true)

+---------+----+-------+-------+---------+-------+-----------------+
| playerID|tmID|totalSA|totalGA|startYear|endYear|      performance|
+---------+----+-------+-------+---------+-------+-----------------+
|gigueje01| ANA|     31|     12|     2005|   2005|61.29032258064516|
|hillejo01| AND|    136|     41|     2007|   2011|69.85294117647058|
|gigueje01| AND|     89|     31|     2006|   2009| 65.1685393258427|
|bryzgil01| AND|     16|      7|     2006|   2007|            56.25|
|hedbejo01| ATL|     65|     13|     2007|   2009|             80.0|
+---------+----+-------+-------+---------+-------+-----------------+
only showing top 5 rows

+---------+----+-------+-------+---------+-------+-----------------

## Store the result in BigQuery

In [74]:
# The BigQuery connector stages its export data in a Cloud Storage bucket before loading it.
bucket = "data_de2023_qjsol"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)

# Tell Hadoop which file system classes handle the gs:// scheme
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoopKey, hadoopClass in [
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
]:
    conf.set(hadoopKey, hadoopClass)

# Append the result rows to the BigQuery table
bigQueryWriter = result.write.format('bigquery').option('table', 'dataengineering2023-398611.assignment2.goalies')  # use your project-id
bigQueryWriter.mode("append").save()

In [75]:
# Stop the Spark session created at the top of the notebook.
# After this call `spark` can no longer be used; re-run the session setup cell to start a new one.
spark.stop()