In [1]:
import os
import numpy as np
import pandas as pd
import shutil
import pyspark


In [2]:
import pyspark.sql.functions as psf
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession 
from pyspark.sql import SparkSession, SQLContext, Window
from pyspark.sql.functions import when, count, col, sum, regexp_replace
from pyspark import SparkContext
from pyspark.sql.types import IntegerType

In [3]:
sparkObj = pyspark.sql.SparkSession.builder.appName("Assignment2_NBA").getOrCreate()
sparkObj.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [4]:
sparkObj

In [5]:
nbaData = sparkObj.read.format("csv").option("header", "true",).option("inferSchema","true").load('shot_logs.csv')


In [6]:
nbaData.show(n=2,vertical=True)

-RECORD 0------------------------------------------
 GAME_ID                    | 21400899             
 MATCHUP                    | MAR 04, 2015 - CH... 
 LOCATION                   | A                    
 W                          | W                    
 FINAL_MARGIN               | 24                   
 SHOT_NUMBER                | 1                    
 PERIOD                     | 1                    
 GAME_CLOCK                 | 2023-11-18 01:09:00  
 SHOT_CLOCK                 | 10.8                 
 DRIBBLES                   | 2                    
 TOUCH_TIME                 | 1.9                  
 SHOT_DIST                  | 7.7                  
 PTS_TYPE                   | 2                    
 SHOT_RESULT                | made                 
 CLOSEST_DEFENDER           | Anderson, Alan       
 CLOSEST_DEFENDER_PLAYER_ID | 101187               
 CLOSE_DEF_DIST             | 1.3                  
 FGM                        | 1                    
 PTS        

In [7]:
print("Total Rows: " , nbaData.count())
print("Total Columns: " , len(nbaData.columns))

Total Rows:  128069
Total Columns:  21


In [8]:
nbaData.printSchema()

root
 |-- GAME_ID: integer (nullable = true)
 |-- MATCHUP: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- W: string (nullable = true)
 |-- FINAL_MARGIN: integer (nullable = true)
 |-- SHOT_NUMBER: integer (nullable = true)
 |-- PERIOD: integer (nullable = true)
 |-- GAME_CLOCK: timestamp (nullable = true)
 |-- SHOT_CLOCK: double (nullable = true)
 |-- DRIBBLES: integer (nullable = true)
 |-- TOUCH_TIME: double (nullable = true)
 |-- SHOT_DIST: double (nullable = true)
 |-- PTS_TYPE: integer (nullable = true)
 |-- SHOT_RESULT: string (nullable = true)
 |-- CLOSEST_DEFENDER: string (nullable = true)
 |-- CLOSEST_DEFENDER_PLAYER_ID: integer (nullable = true)
 |-- CLOSE_DEF_DIST: double (nullable = true)
 |-- FGM: integer (nullable = true)
 |-- PTS: integer (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_id: integer (nullable = true)



### Question 1: For each pair of the players (A, B), we define the fear sore of A when facing B is the hit rate, such that B is closet defender when A is shooting. Based on the fear sore, for each player, please find out who is his ”most unwanted defender”. 

In [9]:
madeCond= psf.when(psf.col("SHOT_RESULT") == "made", 1).otherwise(0)
missedCond = psf.when(psf.col("SHOT_RESULT") == "missed", 1).otherwise(0)

unwantedDf = nbaData.groupBy(
    psf.col("player_id").alias("Player ID"),
    psf.col("CLOSEST_DEFENDER_PLAYER_ID").alias("Defender ID")
).agg(
    psf.sum(madeCond).alias("Scored"),
    psf.sum(missedCond).alias("Not Scored")
)


unwantedDf.show(10)

+---------+-----------+------+----------+
|Player ID|Defender ID|Scored|Not Scored|
+---------+-----------+------+----------+
|   203148|     101179|     0|         1|
|   202687|     201980|     1|         0|
|     2744|       1717|     0|         2|
|   203469|     202329|     1|         1|
|   201945|     202322|     0|         3|
|   202689|     202699|     6|         8|
|   202689|     203924|     1|         0|
|   203077|       2730|     1|         0|
|   203077|     201584|     2|         0|
|   202362|     201188|     2|         0|
+---------+-----------+------+----------+
only showing top 10 rows



In [10]:
unwantedDf = unwantedDf.withColumn(
    "HitRate",
    psf.col("Scored") / (psf.col("Scored") + psf.col("Not Scored"))
)
unwantedDf.show(10)

+---------+-----------+------+----------+-------------------+
|Player ID|Defender ID|Scored|Not Scored|            HitRate|
+---------+-----------+------+----------+-------------------+
|   203148|     101179|     0|         1|                0.0|
|   202687|     201980|     1|         0|                1.0|
|     2744|       1717|     0|         2|                0.0|
|   203469|     202329|     1|         1|                0.5|
|   201945|     202322|     0|         3|                0.0|
|   202689|     202699|     6|         8|0.42857142857142855|
|   202689|     203924|     1|         0|                1.0|
|   203077|       2730|     1|         0|                1.0|
|   203077|     201584|     2|         0|                1.0|
|   202362|     201188|     2|         0|                1.0|
+---------+-----------+------+----------+-------------------+
only showing top 10 rows



In [11]:
unwantedDf = unwantedDf.filter(psf.col("HitRate").isNotNull())

In [12]:
unwantedDf = unwantedDf.dropDuplicates(subset=["Player ID", "HitRate"])


In [13]:
finalDf = unwantedDf.groupBy("Player ID").agg(psf.min("HitRate").alias("HitRate"))

In [14]:
unwantedDf = unwantedDf.join(finalDf, ["Player ID", "HitRate"]).select("Player ID", "Defender ID")


unwantedDf = unwantedDf.join(
    nbaData,
    (nbaData["player_id"] == unwantedDf["Player ID"]) & (nbaData["CLOSEST_DEFENDER_PLAYER_ID"] == unwantedDf["Defender ID"])
).withColumn("Player Name", col("player_name")).withColumn("Most Unwanted Defender", col("CLOSEST_DEFENDER"))

unwantedDf = unwantedDf.dropDuplicates(["Player ID", "Defender ID"])

unwantedDf.select("Player Name", "Most Unwanted Defender").show(10)

+--------------+----------------------+
|   Player Name|Most Unwanted Defender|
+--------------+----------------------+
| kevin garnett|           Exum, Dante|
|   kobe bryant|        Anderson, Kyle|
|    tim duncan|        Roberts, Brian|
|  vince carter|       Crawford, Jamal|
|dirk nowtizski|           Hickson, JJ|
|   paul pierce|         Waiters, Dion|
|  andre miller|       Splitter, Tiago|
|  shawn marion|     Tolliver, Anthony|
|   jason terry|          Lopez, Brook|
| manu ginobili|      Bennett, Anthony|
+--------------+----------------------+
only showing top 10 rows



### Answer 1: 
From the above table, we can see that most unwanted defender for each of the player. So, if Kevin Garnett is the shooter, the most the unwanted defender is the  Exum, Dante

In [15]:
sparkObj.stop()

### Question 2: For each player, we define the comfortable zone of shooting is a matrix of, {SHOT DIST, CLOSE DEF DIST, SHOT CLOCK}Please develop a Spark-based algorithm to classify each player’s records into 4 comfortable zones. Considering the hit rate, which zone is the best for James Harden, Chris Paul, Stephen Curry, and Lebron James.

In [16]:
sparkObj = pyspark.sql.SparkSession.builder.appName("Assignment2_NBA_2").getOrCreate()
sparkObj.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [17]:
nbaData = sparkObj.read.format("csv").option("header", "true",).option("inferSchema","true").load('shot_logs.csv').select("player_name","SHOT_DIST","CLOSE_DEF_DIST", "SHOT_CLOCK", "SHOT_RESULT").na.drop()

In [18]:
nbaShotsData = nbaData.withColumn('SHOT_RESULT', psf.when(psf.col('SHOT_RESULT') == 'made', 1).otherwise(0).cast('float'))
comfortableZoneMat = ["SHOT_DIST", "CLOSE_DEF_DIST", "SHOT_CLOCK"]


In [19]:
for feature in comfortableZoneMat:
    nbaShotsData = nbaShotsData.withColumn(feature, psf.col(feature).cast("float"))


In [20]:

vecAssembler = VectorAssembler(inputCols=comfortableZoneMat, outputCol="shooting_zone")
nbaShotsData = vecAssembler.transform(nbaShotsData).select('player_name', 'shooting_zone', 'SHOT_RESULT')


kmeans = KMeans(k=4, featuresCol="shooting_zone")
kmeansFitData = kmeans.fit(nbaShotsData)

playersData = nbaShotsData.filter(nbaShotsData['player_name'].isin(['james harden', 'chris paul', 'stephen curry', 'lebron james']))

pred = kmeansFitData.transform(playersData).select('player_name', 'prediction', 'SHOT_RESULT')


In [21]:

pred.createOrReplaceTempView("player_zones")

In [22]:


avgShotResultQuery = """SELECT player_name, prediction, AVG(SHOT_RESULT) AS avgShotResult FROM player_zones GROUP BY player_name, prediction ORDER BY player_name, prediction """
res = sparkObj.sql(avgShotResultQuery)

maxAvgShot = res.groupBy("player_name").agg(psf.max("avgShotResult").alias("maxAvgShotResult"))

bestZone = res.alias("df1").join(maxAvgShot.alias("df2"), (psf.col("df1.player_name") == psf.col("df2.player_name")) & (psf.col("df1.avgShotResult") == psf.col("df2.maxAvgShotResult")))

bestZone = bestZone.select("df1.*")
bestZone.show()

+-------------+----------+------------------+
|  player_name|prediction|     avgShotResult|
+-------------+----------+------------------+
| lebron james|         3|0.6613545816733067|
|   chris paul|         0|0.5563380281690141|
| james harden|         3|0.5604395604395604|
|stephen curry|         3|0.6350710900473934|
+-------------+----------+------------------+



In [23]:
sparkObj.stop()

### Answer 2:
1. Zone-1 corresponds to a prediction value of 0, Zone-2 to 1, and Zone-3 to 2 and Zone 4 to 3 in the 'prediction' column.2. 
To determine each player's comfort zone, we grouped the data by player and zone, calculating the average score for each group
3. As seen in the table Lebron, james, stephen has best Zone 1 and Chris has best Zone 4..
