# Pipeline: Goalies Shootouts

In [48]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lit, sum, expr, concat, min, max, dense_rank, when

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("GoaliesApp")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

### Prepare GoaliesShootout Table

In [49]:
# Load the goalies shootout table
goaliesShootout = spark.read.format("csv").option("header", "true").load("gs://data_de2023_qjsol/GoaliesShootout.csv") #  use your gcp bucket name. 

# Only keep columns of interest
goaliesShootout = goaliesShootout.select("playerID", "tmID", "SA", "GA", "year")
goaliesShootout.na.drop("any")

# Show schema
goaliesShootout.printSchema()

# Convert shots against (SA), goals against (GA) and year from string to integer type
goaliesShootout = goaliesShootout.withColumn("SA", col("SA").cast("int")).withColumn("GA", col("GA").cast("int")).withColumn("year", col("year").cast("int"))

# Show new schema and top 5 rows
goaliesShootout.printSchema()
goaliesShootout.show(5)

root
 |-- playerID: string (nullable = true)
 |-- tmID: string (nullable = true)
 |-- SA: string (nullable = true)
 |-- GA: string (nullable = true)
 |-- year: string (nullable = true)

root
 |-- playerID: string (nullable = true)
 |-- tmID: string (nullable = true)
 |-- SA: integer (nullable = true)
 |-- GA: integer (nullable = true)
 |-- year: integer (nullable = true)

+---------+----+---+---+----+
| playerID|tmID| SA| GA|year|
+---------+----+---+---+----+
|aebisda01| COL| 10|  2|2005|
|aebisda01| MTL| 18|  6|2006|
|andercr01| CHI|  7|  5|2005|
|andercr01| FLO|  2|  0|2006|
|andercr01| FLO| 11|  7|2008|
+---------+----+---+---+----+
only showing top 5 rows



### Prepare Master Table

In [50]:
#  Load the master table
master = spark.read.format("csv").option("header", "true").load("gs://data_de2023_qjsol/Master.csv") #  use your gcp bucket name. 

# Only keep columns of interest
master = master.select("playerID", "firstName", "lastName", "birthYear")
master.na.drop("any")

# Show schema
master.printSchema()

# Convert birth year from string to integer type
master = master.withColumn("birthYear", col("birthYear").cast("int"))

# Show new schema and top 5 rows
master.printSchema()
master.show(5)

root
 |-- playerID: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- birthYear: string (nullable = true)

root
 |-- playerID: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- birthYear: integer (nullable = true)

+---------+---------+----------+---------+
| playerID|firstName|  lastName|birthYear|
+---------+---------+----------+---------+
|aaltoan01|    Antti|     Aalto|     1975|
|abbeybr01|    Bruce|     Abbey|     1951|
|abbotge01|   George|    Abbott|     1911|
|abbotre01|      Reg|    Abbott|     1930|
|abdelju01|   Justin|Abdelkader|     1987|
+---------+---------+----------+---------+
only showing top 5 rows



### Prepare Teams Table

In [51]:
# Load the teams table
teams = spark.read.format("csv").option("header", "true").load("gs://data_de2023_qjsol/Teams.csv") #  use your gcp bucket name. 

# Only keep columns of interest
teams = teams.select("tmID", "name")
teams.na.drop("any")

# Show schema and top 5 rows
master.printSchema()
master.show(5)

root
 |-- playerID: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- birthYear: integer (nullable = true)

+---------+---------+----------+---------+
| playerID|firstName|  lastName|birthYear|
+---------+---------+----------+---------+
|aaltoan01|    Antti|     Aalto|     1975|
|abbeybr01|    Bruce|     Abbey|     1951|
|abbotge01|   George|    Abbott|     1911|
|abbotre01|      Reg|    Abbott|     1930|
|abdelju01|   Justin|Abdelkader|     1987|
+---------+---------+----------+---------+
only showing top 5 rows



In [52]:
# There is a spelling error in the team names.
# We noticed that 'Chicago Blackhawks' and 'Chicago Black Hawks' are mapped to the same team ID.
teams.createOrReplaceTempView("teams")
teams.show()

# SQL query to display that 'Chicago Blackhawks' and 'Chicago Black Hawks' are mapped to the same team ID.
team_names = spark.sql("""
    SELECT name, tmID, COUNT(*) FROM teams
    WHERE name = 'Chicago Blackhawks' OR name = 'Chicago Black Hawks'
    GROUP BY name, tmID
""")
team_names.show()

# Resolve spelling error in team name
teams = teams.withColumn('name', when(col('name') == 'Chicago Blackhawks', 'Chicago Black Hawks').otherwise(col('name')))

+----+--------------------+
|tmID|                name|
+----+--------------------+
| COB| Cobalt Silver Kings|
| HAI|Haileybury Hockey...|
| LES|       Les Canadiens|
| MOS|  Montreal Shamrocks|
| MOW|  Montreal Wanderers|
| OT1|     Ottawa Senators|
| REN|Renfrew Creamery ...|
| MOC|  Montreal Canadiens|
| MOW|  Montreal Wanderers|
| OT1|     Ottawa Senators|
| QU1|     Quebec Bulldogs|
| REN|Renfrew Creamery ...|
| MOC|  Montreal Canadiens|
| MOW|  Montreal Wanderers|
| OT1|     Ottawa Senators|
| QU1|     Quebec Bulldogs|
| NWR|New Westminster R...|
| VA1|Victoria Aristocrats|
| VML|Vancouver Million...|
| MOC|  Montreal Canadiens|
+----+--------------------+
only showing top 20 rows

+-------------------+----+--------+
|               name|tmID|count(1)|
+-------------------+----+--------+
|Chicago Black Hawks| CHI|      60|
| Chicago Blackhawks| CHI|      25|
+-------------------+----+--------+



### Compute performance

In [53]:
# Sum shots againts (SA) and goals against (GA) of each goalie in each team
sumgoaliesShootout = goaliesShootout.groupBy("playerID", "tmID").agg(sum("SA").cast("int").alias("totalSA"), 
                                                                     sum("GA").cast("int").alias("totalGA"), 
                                                                     min("year").alias("startYear"),
                                                                     max("year").alias("endYear"))

# Compute the performance (percentage of shots that resulted in a goal) for each goalie in each team
performance = sumgoaliesShootout.withColumn("performance", (lit(1) - col("totalGA") / col("totalSA")) * 100)

performance.printSchema()
performance.show(5)

root
 |-- playerID: string (nullable = true)
 |-- tmID: string (nullable = true)
 |-- totalSA: integer (nullable = true)
 |-- totalGA: integer (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- endYear: integer (nullable = true)
 |-- performance: double (nullable = true)

+---------+----+-------+-------+---------+-------+-----------------+
| playerID|tmID|totalSA|totalGA|startYear|endYear|      performance|
+---------+----+-------+-------+---------+-------+-----------------+
|bacasja01| STL|     10|      3|     2005|   2006|             70.0|
|thomati01| BOS|    201|     56|     2005|   2011|72.13930348258705|
|mcelhcu01| CAL|      4|      2|     2009|   2009|             50.0|
|bouchbr01| PHO|      3|      0|     2005|   2005|            100.0|
|hackema01| MIN|      5|      2|     2011|   2011|             60.0|
+---------+----+-------+-------+---------+-------+-----------------+
only showing top 5 rows



### Top 3 goalies per team

In [54]:
# Define window: a window for each team, within each window sort on performance then on total shots against (totalSA)
windowSpec = Window.partitionBy("tmID").orderBy(col("performance").desc(), col("totalSA").desc())

# Apply dense rank on the windows
rankedGoalies = performance.withColumn("denseRank", dense_rank().over(windowSpec))

# Keep only the top three goalies in each team
topThreeGoalies = rankedGoalies.filter(col("denseRank") <= 3)
topThreeGoalies.show(5)

+---------+----+-------+-------+---------+-------+-----------------+---------+
| playerID|tmID|totalSA|totalGA|startYear|endYear|      performance|denseRank|
+---------+----+-------+-------+---------+-------+-----------------+---------+
|gigueje01| ANA|     31|     12|     2005|   2005|61.29032258064516|        1|
|hillejo01| AND|    136|     41|     2007|   2011|69.85294117647058|        1|
|gigueje01| AND|     89|     31|     2006|   2009| 65.1685393258427|        2|
|bryzgil01| AND|     16|      7|     2006|   2007|            56.25|        3|
|hedbejo01| ATL|     65|     13|     2007|   2009|             80.0|        1|
+---------+----+-------+-------+---------+-------+-----------------+---------+
only showing top 5 rows



### Include name, age and playing years of goalie 

In [55]:
# Join with Master table
result = topThreeGoalies.join(master, ['playerID'], "left")
result.show(5)

# Column with full name instead of separate firstName and lastName
result = result.withColumn("player_name", concat(col("firstName"), lit(" "), col("lastName"))).drop("firstName").drop("lastName")

# Calculate age of each goalie during his/her top-performing years
result = result.withColumn("startAge", expr("startYear - birthYear")).withColumn("endAge", expr("endYear - birthYear")).drop("birthYear")

# Column with age range
result = result.withColumn("age", expr("concat(startAge, case when startAge = endAge then '' else concat('-', endAge) end)")).drop("startAge").drop("endAge")

# Column with range of playing years
result = result.withColumn("playingYears", 
                           expr("concat(startYear, case when startYear = endYear then '' else concat('-', endYear) end)")).drop("startYear").drop("endYear")
result.printSchema()
result.show(5)

+---------+----+-------+-------+---------+-------+-----------------+---------+--------------+---------+---------+
| playerID|tmID|totalSA|totalGA|startYear|endYear|      performance|denseRank|     firstName| lastName|birthYear|
+---------+----+-------+-------+---------+-------+-----------------+---------+--------------+---------+---------+
|gigueje01| ANA|     31|     12|     2005|   2005|61.29032258064516|        1|Jean-Sebastien|  Giguere|     1977|
|hillejo01| AND|    136|     41|     2007|   2011|69.85294117647058|        1|         Jonas|   Hiller|     1982|
|gigueje01| AND|     89|     31|     2006|   2009| 65.1685393258427|        2|Jean-Sebastien|  Giguere|     1977|
|bryzgil01| AND|     16|      7|     2006|   2007|            56.25|        3|          Ilya|Bryzgalov|     1980|
|hedbejo01| ATL|     65|     13|     2007|   2009|             80.0|        1|         Johan|  Hedberg|     1973|
+---------+----+-------+-------+---------+-------+-----------------+---------+----------

### Include team names

In [56]:
# Join with Teams table
result = result.join(teams, ['tmID'], "left")

# Rename column for clarity
result = result.withColumnRenamed("name", "team_name")

# Drop duplicates due to many to many relationship of result and teams table
result = result.dropDuplicates()

result.printSchema()
result.show()

root
 |-- tmID: string (nullable = true)
 |-- playerID: string (nullable = true)
 |-- totalSA: integer (nullable = true)
 |-- totalGA: integer (nullable = true)
 |-- performance: double (nullable = true)
 |-- denseRank: integer (nullable = false)
 |-- player_name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- playingYears: string (nullable = true)
 |-- team_name: string (nullable = true)

+----+---------+-------+-------+-----------------+---------+--------------------+-----+------------+--------------------+
|tmID| playerID|totalSA|totalGA|      performance|denseRank|         player_name|  age|playingYears|           team_name|
+----+---------+-------+-------+-----------------+---------+--------------------+-----+------------+--------------------+
| ANA|gigueje01|     31|     12|61.29032258064516|        1|Jean-Sebastien Gi...|   28|        2005|Mighty Ducks of A...|
| AND|hillejo01|    136|     41|69.85294117647058|        1|        Jonas Hiller|25-29|   2007-2011| 

## Store the result in BigQuery

In [57]:
# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "data_de2023_qjsol"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)

# Saving the data to BigQuery
result.write.format('bigquery').option('table', 'dataengineering2023-398611.assignment2.goalies').mode("append").save() # use your project-id

In [58]:
# Stop the spark context
spark.stop()