Packages and Data

In [2]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

from thefuzz import fuzz

from sklearn.linear_model import LinearRegression

In [3]:
# Raw inputs: per-shot logs and draft-combine measurements
shot_logs_df = pd.read_csv(filepath_or_buffer='data/raw/shot_logs.csv')
draft_combine_df = pd.read_csv(filepath_or_buffer='data/raw/draft_combine.csv')

Collect NBA Player Data

In [3]:
# Function to retrieve player info from NBA website
def get_player_info(player_id, timeout=10):
    """Scrape basic bio fields for one player from their nba.com profile page.

    Parameters
    ----------
    player_id : int or str
        Identifier interpolated into ``https://www.nba.com/player/{player_id}``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without
        a timeout a single stalled connection would block the whole scrape.

    Returns
    -------
    tuple or None
        ``(position, first_name, last_name, height, weight, draft_year)`` when
        the page comes back with HTTP 200. ``height`` is feet * 12 + inches,
        ``weight`` is the integer that precedes ``'lb'`` in the page text and
        ``draft_year`` is an ``int``. Any field that cannot be located or
        parsed is ``np.nan``. ``None`` is returned when the request fails or
        the status code is not 200.
    """
    # URL of the player's page
    url = f'https://www.nba.com/player/{player_id}'

    # Network problems use the same "no data" signal as a non-200 response,
    # so callers only have to handle a single failure value.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the failures a missing or oddly formatted tag can cause are
    # tolerated; anything else (e.g. KeyboardInterrupt) still propagates.
    parse_errors = (AttributeError, IndexError, ValueError)

    # Query each repeated tag group once and index into the results below
    name_tags = soup.find_all('p', class_='PlayerSummary_playerNameText___MhqC')
    info_tags = soup.find_all('p', class_='PlayerSummary_playerInfoValue__JS8_v')

    # Position: last ' | '-separated token of the summary line
    try:
        team_info = soup.find('p', class_='PlayerSummary_mainInnerInfo__jv3LO').text
        position = team_info.split(' | ')[-1]
    except parse_errors:
        position = np.nan

    # First name
    try:
        first_name = name_tags[0].text
    except parse_errors:
        first_name = np.nan

    # Last name
    try:
        last_name = name_tags[1].text
    except parse_errors:
        last_name = np.nan

    # Height: the first space-separated token is split on the feet mark and
    # the trailing inch mark is stripped before converting
    try:
        feet_and_inches = info_tags[0].text.split(' ')[0].split("'")
        height = int(feet_and_inches[0]) * 12 + int(feet_and_inches[1].replace('"', ''))
    except parse_errors:
        height = np.nan

    # Weight: integer that precedes 'lb'
    try:
        weight = int(info_tags[1].text.split('lb')[0])
    except parse_errors:
        weight = np.nan

    # Draft year: the draft entry is looked up at index 6 first and index 5
    # second; the first one containing "Pick" wins. The year is always
    # converted to int so the returned type does not depend on which index
    # matched, and index 5 is still tried when index 6 does not exist.
    draft_year = np.nan
    for info_index in (6, 5):
        if info_index >= len(info_tags):
            continue
        draft_info = info_tags[info_index].text
        if "Pick" in draft_info:
            try:
                draft_year = int(draft_info.split(' ')[0])
            except ValueError:
                draft_year = np.nan
            break

    # Return the extracted data
    return position, first_name, last_name, height, weight, draft_year


In [4]:
# Get list of all defender IDs
closest_defender_ids = shot_logs_df['CLOSEST_DEFENDER_PLAYER_ID'].unique()

# Loop through IDs and get player info. IDs whose page could not be retrieved
# (get_player_info returned None) are reported and skipped.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for every later cell of the notebook.
player_info_list = []
for defender_id in closest_defender_ids:
    nba_player_data = get_player_info(defender_id)
    if nba_player_data is None:
        print(f"Error: {defender_id}")
        continue
    # One record per player: the ID followed by the scraped fields
    player_info_list.append([defender_id, *nba_player_data])

In [5]:
# One column per value collected for each player, in record order
player_info_columns = ['nba_id', 'position', 'first', 'last', 'height', 'weight', 'draft_year']
player_info_df = pd.DataFrame(player_info_list, columns=player_info_columns)

# Persist the scraped records
player_info_df.to_csv(path_or_buf='data/raw/player_nba_info.csv', index=False)

Join NBA Player Info with Draft Combine Data

In [6]:
# Positions are hyphen-separated (split on '-'); keep the first two parts as
# one-letter codes
nba_position_codes = {'Guard': 'G', 'Forward': 'F', 'Center': 'C'}
nba_position_parts = player_info_df['position'].str.split('-')
player_info_df['pos1'] = nba_position_parts.str[0].map(nba_position_codes)
player_info_df['pos2'] = nba_position_parts.str[1].map(nba_position_codes)

# Store draft year as float
player_info_df['draft_year'] = player_info_df['draft_year'].astype(float)

In [7]:
# PLAYER is split on ', ': the part after the comma is the first name and
# the part before it is the last name
player_name_parts = draft_combine_df['PLAYER'].str.split(', ')
draft_combine_df['first'] = player_name_parts.str[1]
draft_combine_df['last'] = player_name_parts.str[0]

# Collapse the slash-separated combine positions into G / F / C codes
combine_position_codes = {'PG': 'G', 'SG': 'G', 'SF': 'F', 'PF': 'F', 'C': 'C'}
combine_position_parts = draft_combine_df['POS'].str.split('/')
draft_combine_df['pos1'] = combine_position_parts.str[0].map(combine_position_codes)
draft_combine_df['pos2'] = combine_position_parts.str[1].map(combine_position_codes)

# Select the size-related columns, give them snake_case names, and round
# height and weight to nullable integers
player_size_df = (
    draft_combine_df[['first', 'last', 'YEAR', 'pos1', 'pos2', 'HEIGHT', 'WEIGHT', 'WINGSPAN', 'STANDING REACH']]
    .rename(columns={'YEAR': 'draft_year', 'HEIGHT': 'height', 'WEIGHT': 'weight', 'WINGSPAN': 'wingspan', 'STANDING REACH': 'standing_reach'})
    .assign(
        height=lambda size_df: size_df['height'].round().astype('Int64'),
        weight=lambda size_df: size_df['weight'].round().astype('Int64'),
    )
)

# Write the cleaned measurements to disk
player_size_df.to_csv('data/clean/player_size.csv', index=False)

In [8]:
def _values_equal(left, right):
    """Return True only when both values are present and equal; a missing value never matches."""
    if pd.isna(left) or pd.isna(right):
        return False
    return bool(left == right)


# Function to find best match for a player
def find_best_match(row, target_df):
    """Return the row of ``target_df`` that best matches ``row``, or ``None``.

    A target row is a candidate only when at least two of (first name, last
    name, draft year) are exactly equal to the values in ``row``. Candidates
    are ranked by a weighted score: fuzzy name similarity 50%, position
    overlap 20%, height closeness 10%, weight closeness 10%, draft-year
    equality 10%. Ties keep the earliest candidate.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``'first'`` and ``'last'``; ``'draft_year'``, ``'pos1'``,
        ``'pos2'``, ``'height'`` and ``'weight'`` are used when present.
    target_df : pandas.DataFrame
        Candidate rows with the same column names.

    Returns
    -------
    pandas.Series or None
        The best-scoring candidate row, or ``None`` when no candidate scores
        above zero.
    """
    best_score = 0
    best_match = None

    # Values that depend only on `row` are computed once, outside the loop
    row_name = f"{row['first']} {row['last']}"
    row_positions = {pos for pos in (row.get('pos1'), row.get('pos2')) if pd.notna(pos)}

    for _, target_row in target_df.iterrows():
        # Check for exact matches in first name, last name, and draft year.
        # The comparison is missing-safe so an NA on either side counts as
        # "no match" instead of poisoning the criteria count.
        first_name_match = _values_equal(row['first'], target_row['first'])
        last_name_match = _values_equal(row['last'], target_row['last'])
        draft_year_match = _values_equal(row.get('draft_year'), target_row.get('draft_year'))

        # Only proceed if at least two out of three criteria are met
        if first_name_match + last_name_match + draft_year_match < 2:
            continue

        # Calculate fuzzy match score for names
        name_score = fuzz.token_set_ratio(
            row_name,
            f"{target_row['first']} {target_row['last']}"
        )

        # Position match score: any shared non-missing position counts.
        # Missing values are dropped from both sides first, so a player with
        # a single listed position can still match and NaN never matches NaN.
        target_positions = {
            pos for pos in (target_row.get('pos1'), target_row.get('pos2')) if pd.notna(pos)
        }
        position_score = 100 if row_positions & target_positions else 0

        # Height match score: 10 points lost per unit of difference
        if pd.notna(row.get('height')) and pd.notna(target_row.get('height')):
            height_diff = abs(row['height'] - target_row['height'])
        else:
            height_diff = 10
        height_score = max(0, 100 - height_diff * 10)

        # Weight match score: 1 point lost per unit of difference
        # NOTE(review): a missing weight falls back to a difference of 10 and
        # so still scores 90, whereas a missing height scores 0 -- confirm
        # this asymmetry is intended.
        if pd.notna(row.get('weight')) and pd.notna(target_row.get('weight')):
            weight_diff = abs(row.get('weight', 0) - target_row.get('weight', 0))
        else:
            weight_diff = 10
        weight_score = max(0, 100 - weight_diff)

        # Draft year match score
        draft_year_score = 100 if draft_year_match else 0

        # Weighted average of scores
        total_score = (name_score * 0.5) + (position_score * 0.2) + (height_score * 0.1) + \
                      (weight_score * 0.1) + (draft_year_score * 0.1)

        # Update best score and best match (strict '>' keeps the first of equals)
        if total_score > best_score:
            best_score = total_score
            best_match = target_row

    return best_match if best_score > 0 else None

In [9]:
# Find the best draft-combine match for each row in player_info_df.
# The results are collected explicitly instead of through DataFrame.apply:
# apply(axis=1) decides the shape of its result from the first row, so a
# first player without a match (None) would yield an object Series and the
# column lookups below would raise KeyError.
best_matches = [
    find_best_match(row, player_size_df)
    for _, row in player_info_df.iterrows()
]

# One row per player, aligned on player_info_df's index; players without a
# match get an all-missing row
matches = pd.DataFrame(
    [match.to_dict() if match is not None else {} for match in best_matches],
    index=player_info_df.index,
    columns=player_size_df.columns,
)

# Extract 'wingspan' and 'standing_reach' and add to player_info_df
player_info_df['wingspan'] = matches['wingspan']
player_info_df['standing_reach'] = matches['standing_reach']

Fill Missing Wingspan Values with Regression

In [10]:
# Training set: players for whom both height and wingspan are known
filtered_df = player_info_df.dropna(subset=['height', 'wingspan'])

# Height is the single predictor (kept 2-D), wingspan is the target
X = filtered_df.loc[:, ['height']]
y = filtered_df.loc[:, 'wingspan']

# Ordinary least-squares fit of wingspan on height
model = LinearRegression()
model.fit(X, y)

In [11]:
# Rows that lack a wingspan but have a height to predict from
missing_wingspan_mask = player_info_df['wingspan'].isna() & player_info_df['height'].notna()
missing_wingspan_indices = player_info_df[missing_wingspan_mask].index

# Predict and fill in missing wingspan values. The call is skipped when
# nothing is missing because scikit-learn estimators reject a zero-row input
# with ValueError.
if len(missing_wingspan_indices) > 0:
    predicted_wingspans = model.predict(player_info_df.loc[missing_wingspan_indices, ['height']])
    player_info_df.loc[missing_wingspan_indices, 'wingspan'] = predicted_wingspans
else:
    predicted_wingspans = np.empty(0)

# Round wingspan to 2 decimal places
player_info_df['wingspan'] = player_info_df['wingspan'].round(2)

# Save dataframe to CSV
player_info_df.to_csv('data/clean/player_info.csv', index=False)

Add Wingspans to Shot Logs

In [12]:
# Expose the wingspan under the name it will carry in the shot table
player_info_df_renamed = player_info_df.rename(columns={'wingspan': 'defender_wingspan'})

# Left-join each shot to its closest defender's wingspan, then discard the
# join key that came from the player table
open_shot_df = pd.merge(
    shot_logs_df,
    player_info_df_renamed[['nba_id', 'defender_wingspan']],
    how='left',
    left_on='CLOSEST_DEFENDER_PLAYER_ID',
    right_on='nba_id',
).drop(columns=['nba_id'])

In [13]:
# Raw shot-log headers mapped to the snake_case names used from here on
shot_log_column_names = {
    'GAME_ID': 'game_id',
    'MATCHUP': 'matchup',
    'LOCATION': 'location',
    'W': 'win',
    'FINAL_MARGIN': 'final_margin',
    'SHOT_NUMBER': 'shot_number',
    'PERIOD': 'period',
    'GAME_CLOCK': 'game_clock',
    'SHOT_CLOCK': 'shot_clock',
    'DRIBBLES': 'dribbles',
    'TOUCH_TIME': 'touch_time',
    'SHOT_DIST': 'shot_dist',
    'PTS_TYPE': 'pts_type',
    'SHOT_RESULT': 'shot_result',
    'CLOSEST_DEFENDER': 'closest_defender',
    'CLOSEST_DEFENDER_PLAYER_ID': 'closest_defender_id',
    'CLOSE_DEF_DIST': 'close_def_dist',
    'FGM': 'fgm',
    'PTS': 'pts',
    'player_id': 'player_id'
}
open_shot_df = open_shot_df.rename(columns=shot_log_column_names)

# Closest-defender distance expressed in inches (feet * 12)
open_shot_df['close_def_dist_in'] = open_shot_df['close_def_dist'].mul(12)

Create Open Shot DF and Dummy Variable

In [14]:
# A shot counts as open (1) when the closest defender stands at least half of
# their wingspan away, and as not open (0) otherwise. Rows with a missing
# wingspan compare False and therefore receive 0.
defender_reach = open_shot_df['defender_wingspan'] / 2
is_open_shot = open_shot_df['close_def_dist_in'] >= defender_reach
open_shot_df['open_shot'] = np.where(is_open_shot, 1, 0)

In [15]:
# Persist the shot table with the open-shot flag
open_shot_df.to_csv(path_or_buf='data/clean/open_shot.csv', index=False)

In [16]:
# Tally of open (1) versus not-open (0) shots
open_shot_df.loc[:, 'open_shot'].value_counts()

open_shot
1    68986
0    59083
Name: count, dtype: int64