In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
# Columns to read from each raw CSV (anything not listed here is never loaded).
player_cols = [
    # identifiers
    "season", "player_id", "player", "experience", "tm",
    # playing time
    "g", "mp",
    # shooting
    "x3p", "x3pa", "x2p", "x2pa", "ft", "fta",
    # counting stats
    "orb", "drb", "ast", "stl", "blk", "pts",
]
team_cols = [
    "season",
    "abbreviation",
    "w",
    "l",
]
all_star_cols = [
    "player",
    "season",
]

In [3]:
# read the three raw CSVs, keeping only the columns declared for each
players = pd.read_csv(
    "../data/raw/Player Totals.csv",
    usecols=player_cols,
)
teams = pd.read_csv(
    "../data/raw/Team Summaries.csv",
    usecols=team_cols,
)
all_stars = pd.read_csv(
    "../data/raw/All-Star Selections.csv",
    usecols=all_star_cols,
)

# restrict player rows to seasons 1977 through 2024:
#   - 2025 is excluded because the data has no 2025 all-star selections
#   - seasons before 1977 are the pre-merger era
players = players[players["season"].ge(1977) & players["season"].lt(2025)]

In [4]:
# summarize dtypes and non-null counts per column to spot what needs cleaning
players.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24955 entries, 719 to 25673
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      24955 non-null  int64  
 1   player_id   24951 non-null  float64
 2   player      24955 non-null  object 
 3   experience  24955 non-null  float64
 4   tm          24955 non-null  object 
 5   g           24955 non-null  int64  
 6   mp          24955 non-null  float64
 7   x3p         23880 non-null  float64
 8   x3pa        23880 non-null  float64
 9   x2p         24955 non-null  int64  
 10  x2pa        24955 non-null  int64  
 11  ft          24955 non-null  int64  
 12  fta         24955 non-null  int64  
 13  orb         24955 non-null  float64
 14  drb         24955 non-null  float64
 15  ast         24955 non-null  int64  
 16  stl         24955 non-null  float64
 17  blk         24955 non-null  float64
 18  pts         24955 non-null  int64  
dtypes: float64(9), int64(8), obj

In [8]:
# Clean the player totals in a single chained transform.
# Each step returns a new frame, so no column is ever assigned onto a filtered
# slice of the raw data (which raises SettingWithCopyWarning on pandas versions
# without copy-on-write) and the cell gives the same result when re-run.

# columns that hold whole-number counts but may have been read as float
int_cols = ['player_id', 'experience', 'g', 'x3p', 'x3pa', 'x2p', 'x2pa', 'ft', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'pts']

players = (
    players
    # rows without a player_id cannot be tied to a player
    .dropna(subset=["player_id"])
    # missing three-point makes/attempts are treated as zero
    .fillna({"x3p": 0, "x3pa": 0})
    # must come after the fill: NaN cannot be cast to int
    .astype({col: "int" for col in int_cols})
)

# confirm
players.head()

Unnamed: 0,season,player_id,player,experience,tm,g,mp,x3p,x3pa,x2p,x2pa,ft,fta,orb,drb,ast,stl,blk,pts
719,2024,5025,A.J. Green,2,MIL,56,614.0,69,169,14,27,17,19,9,55,30,9,4,252
720,2024,5026,A.J. Lawson,2,DAL,42,311.0,13,50,41,71,15,23,14,36,20,10,3,136
721,2024,5027,AJ Griffin,2,ATL,20,171.0,10,39,8,23,2,2,2,16,5,1,2,48
722,2024,4219,Aaron Gordon,10,DEN,73,2297.0,40,138,358,578,177,269,174,297,259,56,45,1013
723,2024,4582,Aaron Holiday,6,HOU,78,1269.0,84,217,102,200,58,63,23,100,140,42,6,514


In [9]:
# Collapse multi-team seasons to one row per player-season.
# A player with several teams in one season has one row per team plus an
# aggregated row whose team is "TOT"; the aggregated row is relabelled with the
# team the player appeared in the most games for, then the per-team rows are dropped.

# games played per team, keyed by season and then player_id ("TOT" rows left out)
player_team_data = defaultdict(lambda: defaultdict(list))
team_rows = players.loc[players["tm"] != "TOT", ["season", "player_id", "tm", "g"]]
# itertuples over just the needed columns avoids building a Series for every row
for season, player_id, tm, g in team_rows.itertuples(index=False, name=None):
    player_team_data[season][player_id].append({"tm": tm, "g": g})

# work out the primary team for every aggregated row, keyed by that row's index label
total_rows = players.loc[players["tm"] == "TOT", ["season", "player_id"]]
primary_tms = {}
for idx, season, player_id in total_rows.itertuples(index=True, name=None):
    # .get() so that a lookup miss does not insert empty entries into the defaultdict
    stints = player_team_data.get(season, {}).get(player_id)
    if not stints:
        # no per-team rows to choose from: leave the "TOT" label in place
        # instead of failing on max() of an empty list
        continue
    # max() keeps the first stint on ties, i.e. the earliest row in the frame
    primary_tms[idx] = max(stints, key=lambda stint: stint["g"])["tm"]

# write all relabelled teams back in one index-aligned assignment
if primary_tms:
    primary_tm = pd.Series(primary_tms)
    players.loc[primary_tm.index, "tm"] = primary_tm

# keep only the total row for player-seasons with multiple entries (keeping row with the highest minutes played)
players = players.loc[players.groupby(["player_id", "season"])["mp"].idxmax()].reset_index(drop=True)

In [13]:
# show every remaining row for one player, selected by name
players[players['player'] == 'Josh Hart']

Unnamed: 0,season,player_id,player,experience,tm,g,mp,x3p,x3pa,x2p,x2pa,ft,fta,orb,drb,ast,stl,blk,pts
18425,2018,4525,Josh Hart,1,LAL,63,1461.0,78,197,98,178,66,94,42,221,80,47,16,496
18426,2019,4525,Josh Hart,2,LAL,67,1715.0,92,274,97,190,55,80,35,213,93,64,40,525
18427,2020,4525,Josh Hart,3,NOP,65,1755.0,121,354,104,178,85,115,61,364,108,63,23,656
18428,2021,4525,Josh Hart,4,NOP,47,1349.0,63,193,88,151,69,89,53,324,109,38,12,434
18429,2022,4525,Josh Hart,5,NOP,54,1791.0,73,213,214,356,160,211,62,326,224,61,13,807
18430,2023,4525,Josh Hart,6,POR,76,2454.0,61,164,214,356,132,176,142,451,290,88,21,743
18431,2024,4525,Josh Hart,7,NYK,81,2707.0,80,258,215,421,91,115,128,547,331,75,23,761


Because the All-Star Selections data doesn't include `player_id`, selections can only be matched to players by name and season, so we need to check for players who share a name in the same season. If two players share a name in a season and one of them is named to the All-Star team, both could be labeled as All-Stars, causing a misclassification. Let's address this in the cells below by listing the conflicting names per season and then checking that list against the All-Star Selections data.

In [14]:
# count the distinct player_ids behind each (season, name) pair;
# a count above one means two different players share that name in that season
name_id_check = players.groupby(["season", "player"])["player_id"].nunique()
conflicts = name_id_check.loc[name_id_check.gt(1)]
print(conflicts)

season  player         
1979    George Johnson     2
1980    George Johnson     2
1981    George Johnson     2
1982    Eddie Johnson      2
        George Johnson     2
1983    Eddie Johnson      2
        George Johnson     2
1984    Eddie Johnson      2
1985    Charles Jones      2
        Eddie Johnson      2
        George Johnson     2
1986    Charles Jones      2
        Eddie Johnson      2
        George Johnson     2
1987    Eddie Johnson      2
1988    Charles Jones      2
1989    Charles Jones      2
1990    Charles Smith      2
1991    Charles Smith      2
1995    Michael Smith      2
1996    Charles Smith      2
2008    Marcus Williams    2
2009    Marcus Williams    2
2013    Chris Johnson      2
2014    Tony Mitchell      2
Name: player_id, dtype: int64


In [15]:
# names shared by two different players in at least one season
conflict_names = conflicts.index.get_level_values("player").unique()

# every all-star selection carrying one of those names, in any season
conflict_all_stars = all_stars[all_stars["player"].isin(conflict_names)]
print(conflict_all_stars)

# A selection is only ambiguous when it falls in a season where the name is
# actually shared, so compare (season, player) pairs rather than names alone.
ambiguous_all_stars = conflict_all_stars.merge(
    conflicts.rename("n_player_ids").reset_index(),
    on=["season", "player"],
    how="inner",
)
# fail loudly if a name-based match would be ambiguous for any all-star season
assert ambiguous_all_stars.empty, (
    "All-Star selections that cannot be matched by name alone:\n"
    f"{ambiguous_all_stars}"
)

             player  season
1086  Eddie Johnson    1981
1113  Eddie Johnson    1980


Luckily for our purposes, the two seasons in which Eddie Johnson was named to the All-Star team (1980 and 1981) came before the other Eddie Johnson joined the league: the name is not shared until 1982. We won't have any All-Star misclassifications based on shared names.