# Cleaning and Preparing the Dataset

Our dataset covers the National Basketball Association (NBA) from its founding in 1947 through the current season in 2026. The game of basketball has evolved so much over that time that the early game barely resembles the modern one. To ensure relevance and accuracy, we trim the dataset down to the 1980-2025 seasons.

#### Issues:
The dataset we used for awards voting did not include the voting results for the 2024-2025 NBA season, so we gathered them from an external source and added them to the CSV file.

In [90]:
# some useful mysklearn package import statements and reloads
import importlib
import numpy as np
import tabulate as tb
import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 


In [91]:
# Source CSV for each raw table, listed in the order the files are loaded.
csv_sources = {
    "adv": "data/Advanced.csv",
    "vote": "data/End of Season Teams (Voting).csv",
    "opp_spg": "data/Opponent Stats Per Game.csv",
    "p_spg": "data/Player Per Game.csv",
    "t_spg": "data/Team Stats Per Game.csv",
    "team_adv": "data/Team Summaries.csv",
}

# Load every file into its own fresh MyPyTable.
tables = {
    name: MyPyTable().load_from_file(path)
    for name, path in csv_sources.items()
}

# Expose the tables under the short names used by the rest of the notebook.
adv = tables["adv"]
vote = tables["vote"]
opp_spg = tables["opp_spg"]
p_spg = tables["p_spg"]
t_spg = tables["t_spg"]
team_adv = tables["team_adv"]

In [92]:
###########################################################
# Player Advanced Stats

# Keep only the 1980-2025 seasons. Every row is tested on its own season
# value, so the result does not depend on how the file is ordered, and a table
# with no out-of-range seasons is left untouched (an index of 0 is never
# mistaken for "not found").
adv_idxs = [
    idx
    for idx, season in enumerate(adv.get_column("season"))
    if season < 1980 or season > 2025
]
adv.drop_rows(adv_idxs)
adv.save_to_file("data/Updated_Advanced.csv")

###########################################################
###########################################################
# Team Advanced Stats

# Keep only the 1980-2025 seasons. Every row is tested on its own season
# value, so the result does not depend on how the file is ordered, and a table
# with no out-of-range seasons is left untouched (an index of 0 is never
# mistaken for "not found").
team_adv_idxs = [
    idx
    for idx, season in enumerate(team_adv.get_column("season"))
    if season < 1980 or season > 2025
]
team_adv.drop_rows(team_adv_idxs)
team_adv.save_to_file("data/Updated_Team_Advanced.csv")

###########################################################
###########################################################
# All-NBA / All-Defensive Team Voting Results

# Optional award-type filter: uncomment ONE of the two loops below to keep a
# single award type. With both commented out, removal_idx stays empty.
removal_idx = []
# all-nba
# for idx, award_type in enumerate(vote.get_column("type")):
#     if award_type == "all_defense" or award_type == "all_rookie":
#         removal_idx.append(idx)

# all-defensive
# for idx, award_type in enumerate(vote.get_column("type")):
#     if award_type == "all_nba" or award_type == "all_rookie":
#         removal_idx.append(idx)

vote.drop_rows(removal_idx)

# Keep only the 1980-2025 seasons, the same window applied to the other
# tables. Every row is tested on its own season value, so the result does not
# depend on how the file is ordered, and a table with no out-of-range seasons
# is left untouched (an index of 0 is never mistaken for "not found").
vote_idxs = [
    idx
    for idx, season in enumerate(vote.get_column("season"))
    if season < 1980 or season > 2025
]
vote.drop_rows(vote_idxs)
vote.save_to_file("data/Updated_End of Season Teams (All_NBA).csv")
# vote.save_to_file("data/Updated_End of Season Teams (All_Defense).csv")

###########################################################
###########################################################
# Team Opponent Stats per Game

# Keep only the 1980-2025 seasons. Every row is tested on its own season
# value, so the result does not depend on how the file is ordered, and a table
# with no out-of-range seasons is left untouched (an index of 0 is never
# mistaken for "not found").
opp_spg_idxs = [
    idx
    for idx, season in enumerate(opp_spg.get_column("season"))
    if season < 1980 or season > 2025
]
opp_spg.drop_rows(opp_spg_idxs)
opp_spg.save_to_file("data/Updated_Opponent Stats Per Game.csv")

###########################################################
###########################################################
# Player Stats per Game

# Keep only the 1980-2025 seasons. The indices to drop are computed from
# p_spg's own season column and applied to p_spg itself; they must not be
# borrowed from another table, whose row positions are not guaranteed to
# line up with this one. Every row is tested on its own season value, so the
# result does not depend on how the file is ordered.
p_idxs = [
    idx
    for idx, season in enumerate(p_spg.get_column("season"))
    if season < 1980 or season > 2025
]
p_spg.drop_rows(p_idxs)
p_spg.save_to_file("data/Updated_Player Per Game.csv")

###########################################################
###########################################################
# Team Stats per Game

# Keep only the 1980-2025 seasons. Every row is tested on its own season
# value, so the result does not depend on how the file is ordered, and a table
# with no out-of-range seasons is left untouched (an index of 0 is never
# mistaken for "not found").
t_spg_idxs = [
    idx
    for idx, season in enumerate(t_spg.get_column("season"))
    if season < 1980 or season > 2025
]
t_spg.drop_rows(t_spg_idxs)
t_spg.save_to_file("data/Updated_Team Stats Per Game.csv")

###########################################################

In [93]:
# Combine per-game and advanced player stats on the columns both tables share.
players = p_spg.perform_inner_join(
    adv,
    ["season", "lg", "player", "player_id", "age", "team", "pos", "g", "gs"],
)
season_idx = players.column_names.index("season")
# Identify a player by player_id rather than by display name, so two different
# players who share a name in the same season are never treated as one.
player_idx = players.column_names.index("player_id")
team_idx = players.column_names.index("team")
mp_idx = players.column_names.index("mp")  # total minutes

# Team labels that stand for "several teams" instead of one franchise
# (presumably the combined-season row of a player who changed teams -- confirm
# against the dataset's documentation).
multi_team_labels = ("2TM", "3TM", "4TM", "5TM")

# Pass 1: for each (season, player), remember the single-team row with the
# most minutes. Values are (team, minutes) pairs.
primary_team = {}
for row in players.data:
    if row[team_idx] in multi_team_labels:
        continue
    key = (row[season_idx], row[player_idx])
    if key not in primary_team or row[mp_idx] > primary_team[key][1]:
        primary_team[key] = (row[team_idx], row[mp_idx])

# Pass 2: rebuild the rows. Single-team rows are kept as they are; multi-team
# rows are copied with the label replaced by the player's primary team, and
# are left out when no single-team row exists for that player and season.
new_data = []
for row in players.data:
    if row[team_idx] not in multi_team_labels:
        new_data.append(row)
        continue
    key = (row[season_idx], row[player_idx])
    if key in primary_team:
        new_row = row.copy()
        new_row[team_idx] = primary_team[key][0]
        new_data.append(new_row)

players = MyPyTable(players.column_names, new_data)

In [94]:
# (season, player_id) pairs that appear anywhere in the voting table.
# The voting season is cast to int before it goes into the pair.
vote_season_idx = vote.column_names.index("season")
vote_player_idx = vote.column_names.index("player_id")
voted_players = {
    (int(vote_row[vote_season_idx]), vote_row[vote_player_idx])
    for vote_row in vote.data
}

season_idx = players.column_names.index("season")
player_idx = players.column_names.index("player_id")

# One flag per player row: 1 when that player/season pair received votes.
voted = [
    1 if (row[season_idx], row[player_idx]) in voted_players else 0
    for row in players.data
]

# Attach the flags to the table as a new trailing "voted" column.
players.column_names.append("voted")
for row, flag in zip(players.data, voted):
    row.append(flag)

In [None]:

# Defensive team columns
# Columns kept from the player table and from the team summary table.
# NOTE(review): 'drb_percent' is selected from both tables -- confirm how
# perform_inner_join names the two columns in the joined output.
player_columns = ['season', 'team', 'g', 'gs', 'pos', 'mp_per_game', 'drb_per_game', 'trb_per_game', 'stl_per_game', 'blk_per_game', 'pf_per_game', 'drb_percent', 'trb_percent', 'stl_percent', 'blk_percent', 'dws', 'ws', 'ws_48', 'dbpm', 'bpm', 'vorp', 'voted']
team_columns = ['season', 'team', 'abbreviation', 'w', 'l', 'pw', 'pl', 'mov', 'srs', 'd_rtg', 'pace', 'opp_e_fg_percent', 'opp_tov_percent', 'drb_percent', 'opp_ft_fga']

# Project the player table onto the selected columns: pull each column, then
# transpose the columns back into rows.
player_cols = [players.get_column(name) for name in player_columns]
player_rows = [list(values) for values in zip(*player_cols)]
players_small = MyPyTable(list(player_columns), player_rows)

# Same projection for the team summary table.
team_cols = [team_adv.get_column(name) for name in team_columns]
team_rows = [list(values) for values in zip(*team_cols)]
team_adv_small = MyPyTable(list(team_columns), team_rows)

# Relabel the player table's "team" column as "abbreviation" so it can serve
# as a join key against the team table. The rows passed in are
# players_small.data itself, not a copy made here.
renamed_columns = [
    "abbreviation" if name == "team" else name
    for name in players_small.column_names
]
players_small_copy = MyPyTable(column_names=renamed_columns, data=players_small.data)

# Attach each player row to its team's season summary and write the result.
defensive_team_data = players_small_copy.perform_inner_join(
    team_adv_small, ["season", "abbreviation"]
)
defensive_team_data.save_to_file("data/Defensive_Team_Data.csv")