In [250]:
import numpy as np
import pandas as pd
import os

In [251]:
# Load each raw CSV export into its own DataFrame.
# Paths are relative, so the files must sit in the notebook's working directory.

(
    per_game,
    awards,
    player_totals,
    team_summary,
    advanced_stats,
    all_star,
    all_nba,
    shooting,
    abbrv,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Player Per Game.csv',
        'Player Award Shares.csv',
        'Player Totals.csv',
        'Team Summaries.csv',
        'Advanced.csv',
        'All-Star Selections.csv',
        'End of Season Teams (Voting).csv',
        'Player Shooting.csv',
        'Team Abbrev.csv',
    )
)

In [252]:
# Count the missing values in every per-game column.
per_game.isna().sum()

seas_id              0
season               0
player_id            0
player               0
birth_year       29000
pos                  0
age                 22
experience           0
lg                   0
tm                   0
g                    0
gs                8637
mp_per_game       1083
fg_per_game          0
fga_per_game         0
fg_percent         157
x3p_per_game      6352
x3pa_per_game     6352
x3p_percent      10528
x2p_per_game         0
x2pa_per_game        0
x2p_percent        239
e_fg_percent       157
ft_per_game          0
fta_per_game         0
ft_percent        1291
orb_per_game      4657
drb_per_game      4657
trb_per_game       894
ast_per_game         0
stl_per_game      5626
blk_per_game      5625
tov_per_game      5635
pf_per_game          0
pts_per_game         0
dtype: int64

In [253]:
# Inspect the dtype pandas inferred for each per-game column.
per_game.dtypes

seas_id            int64
season             int64
player_id          int64
player            object
birth_year       float64
pos               object
age              float64
experience         int64
lg                object
tm                object
g                  int64
gs               float64
mp_per_game      float64
fg_per_game      float64
fga_per_game     float64
fg_percent       float64
x3p_per_game     float64
x3pa_per_game    float64
x3p_percent      float64
x2p_per_game     float64
x2pa_per_game    float64
x2p_percent      float64
e_fg_percent     float64
ft_per_game      float64
fta_per_game     float64
ft_percent       float64
orb_per_game     float64
drb_per_game     float64
trb_per_game     float64
ast_per_game     float64
stl_per_game     float64
blk_per_game     float64
tov_per_game     float64
pf_per_game      float64
pts_per_game     float64
dtype: object

In [254]:
# Keep per-game stats from the 1980 season onwards (season > 1979) to remove years
# where certain statistics were not tracked.

per_game['season'] = per_game['season'].astype(int)
# .copy() makes the filtered frame independent of `per_game`, so later cleaning steps
# can modify it without pandas raising SettingWithCopyWarning.
per_game_filtered = per_game[per_game['season'] > 1979].copy()
per_game_filtered['season'].unique()

array([2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014,
       2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003,
       2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992,
       1991, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981,
       1980])

In [255]:
# Keep only the per-game columns needed for the analysis.

# .copy() yields an independent frame: this table is modified by later cells, and
# writing into a column subset of another DataFrame raises SettingWithCopyWarning.
per_game_filtered = per_game_filtered[['seas_id', 'season', 'player_id', 'player', 'pos', 'age', 'tm', 'g', 'mp_per_game',
                                       'pts_per_game', 'fg_per_game', 'fga_per_game', 'fg_percent', 'x3p_per_game',
                                       'x3pa_per_game', 'x3p_percent', 'e_fg_percent', 'ft_per_game', 'fta_per_game',
                                       'orb_per_game', 'drb_per_game', 'trb_per_game', 'ast_per_game', 'stl_per_game',
                                       'blk_per_game', 'pf_per_game']].copy()
        

In [256]:
# Fill missing field-goal and effective field-goal percentages with 0.

# Assign the result back instead of calling fillna(inplace=True) on a selected column:
# the chained in-place form acts on an intermediate Series, which triggers
# SettingWithCopyWarning and leaves the frame unchanged under pandas Copy-on-Write.
per_game_filtered = per_game_filtered.fillna({'fg_percent': 0, 'e_fg_percent': 0})

In [257]:
# 3-point attempt values for the rows whose 3-point percentage is missing.
# (Not rendered: only the last expression of a cell is displayed.)
per_game_filtered.loc[per_game_filtered['x3p_percent'].isnull(), 'x3pa_per_game'].unique()
# Fill via assignment rather than a chained in-place fillna on the column, which warns
# and does not update the frame under pandas Copy-on-Write.
per_game_filtered = per_game_filtered.fillna({'x3p_percent': 0})
per_game_filtered.isnull().sum()

seas_id          0
season           0
player_id        0
player           0
pos              0
age              0
tm               0
g                0
mp_per_game      0
pts_per_game     0
fg_per_game      0
fga_per_game     0
fg_percent       0
x3p_per_game     0
x3pa_per_game    0
x3p_percent      0
e_fg_percent     0
ft_per_game      0
fta_per_game     0
orb_per_game     0
drb_per_game     0
trb_per_game     0
ast_per_game     0
stl_per_game     0
blk_per_game     0
pf_per_game      0
dtype: int64

In [258]:
# Collapse players who appeared for several teams in one season so that there is
# one row per player per year.

def one_row(df):
    """Reduce one player-season group to a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of one group; must contain a 'tm' column.

    Returns
    -------
    pandas.DataFrame
        `df` itself when it has exactly one row. Otherwise an independent copy of
        the row(s) whose 'tm' is 'TOT', with 'tm' overwritten by the 'tm' value of
        the last row of `df`. If a multi-row group has no 'TOT' row the result is
        empty, i.e. that group contributes no rows.
    """
    if df.shape[0] == 1:
        return df
    # Copy before assigning so we never write into a slice of `df`
    # (avoids SettingWithCopyWarning and leaves the caller's frame untouched).
    total_row = df[df['tm'] == 'TOT'].copy()
    total_row['tm'] = df['tm'].iloc[-1]
    return total_row
    
# Run one_row on every (player_id, season) group and keep the combined output as the new table.
per_game_filtered = per_game_filtered.groupby(['player_id', 'season']).apply(one_row)
    

In [260]:
# groupby(['player_id', 'season']).apply(...) stacks the two group keys on top of the
# original row labels. A bare droplevel() removes only the outermost level, so on a
# fresh run it would leave a two-level index and the cell had to be executed twice.
# Keeping just the innermost level restores the original row labels in one step, and
# re-running the cell is harmless.
per_game_filtered.index = per_game_filtered.index.get_level_values(-1)

In [261]:
# Display the cleaned per-game table.
per_game_filtered

Unnamed: 0,seas_id,season,player_id,player,pos,age,tm,g,mp_per_game,pts_per_game,...,e_fg_percent,ft_per_game,fta_per_game,orb_per_game,drb_per_game,trb_per_game,ast_per_game,stl_per_game,blk_per_game,pf_per_game
23778,8246,1980,834,Paul Silas,PF,36.0,SEA,82,19.5,3.8,...,0.378,1.1,1.7,2.5,2.8,5.3,0.8,0.3,0.1,1.5
23802,8270,1980,856,Rick Barry,SF,35.0,HOU,72,25.2,12.0,...,0.469,2.0,2.1,0.7,2.5,3.3,3.7,1.1,0.4,2.5
23621,8089,1980,949,Earl Monroe,SG,35.0,NYK,51,12.4,7.4,...,0.457,1.1,1.3,0.3,0.4,0.7,1.3,0.4,0.1,0.9
23788,8256,1980,1021,Phil Jackson,PF,34.0,NJN,16,12.1,4.1,...,0.630,0.4,0.6,0.8,0.8,1.5,0.8,0.3,0.3,2.2
23872,8340,1980,1061,Walt Frazier,PG,34.0,CLE,3,9.0,3.3,...,0.364,0.7,0.7,0.3,0.7,1.0,2.7,0.7,0.3,0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,31821,2024,5205,Trayce Jackson-Davis,PF,23.0,GSW,68,16.6,7.9,...,0.702,1.1,1.9,2.0,3.0,5.0,1.2,0.4,1.1,1.6
693,31829,2024,5206,Trey Jemison,C,24.0,MEM,25,22.9,6.8,...,0.551,0.8,1.0,2.6,2.8,5.4,1.1,0.5,1.1,2.8
699,31835,2024,5207,Tristan Vukcevic,C,20.0,WAS,10,15.3,8.5,...,0.507,1.7,2.2,0.6,3.0,3.6,1.3,0.5,0.7,2.8
711,31847,2024,5208,Vasilije Micić,PG,30.0,CHO,60,19.6,7.0,...,0.488,1.0,1.3,0.2,1.2,1.5,4.4,0.5,0.1,1.1


In [285]:
# Isolating MVP winners (1980 season onwards) from the awards dataset.

mvp = awards[(awards['award'] == 'nba mvp') & (awards['winner'] == True)]
# One .loc call applies the season filter and the column selection together.
# rename() returns a new frame, so nothing is mutated in place on a slice of `mvp`.
mvp_winners = (
    mvp.loc[
        mvp['season'] > 1979,
        ['player_id', 'seas_id', 'season', 'player', 'pts_won', 'pts_max', 'share', 'winner'],
    ]
    .rename(columns={'share': 'mvp_share', 'winner': 'mvp'})
)
mvp_winners

Unnamed: 0,player_id,seas_id,season,player,pts_won,pts_max,mvp_share,mvp
25,4417,30764,2023,Joel Embiid,915.0,1000.0,0.915,True
77,4352,30247,2022,Nikola Jokić,875.0,1000.0,0.875,True
145,4352,29456,2021,Nikola Jokić,971.0,1010.0,0.961,True
206,4164,28507,2020,Giannis Antetokounmpo,962.0,1010.0,0.952,True
267,4164,27809,2019,Giannis Antetokounmpo,941.0,1010.0,0.932,True
351,3880,27201,2018,James Harden,965.0,1010.0,0.955,True
420,3849,26815,2017,Russell Westbrook,888.0,1010.0,0.879,True
491,3903,26260,2016,Stephen Curry,1310.0,1310.0,1.0,True
578,3903,25662,2015,Stephen Curry,1198.0,1300.0,0.922,True
659,3770,24828,2014,Kevin Durant,1232.0,1250.0,0.986,True


In [263]:
# Team wins and losses (1980 season onwards), with each team's abbreviation in `tm`.

team_summary_filtered = team_summary[['season', 'team', 'w', 'l']]
team_records = team_summary_filtered.merge(abbrv, how='outer', on=['season', 'team'])
# Chain non-mutating calls instead of dropna/rename with inplace=True: the in-place
# versions operate on a filtered slice and raise SettingWithCopyWarning.
team_records = (
    team_records[team_records['season'] > 1979]
    .dropna()  # drop rows with any missing value (e.g. no match on one side of the outer merge)
    .rename(columns={'abbreviation': 'tm'})
)
team_records

Unnamed: 0,season,team,w,l,lg,playoffs,tm
0,2024,Atlanta Hawks,36.0,46.0,NBA,False,ATL
1,2024,Boston Celtics,64.0,18.0,NBA,True,BOS
2,2024,Brooklyn Nets,32.0,50.0,NBA,False,BRK
3,2024,Chicago Bulls,39.0,43.0,NBA,False,CHI
4,2024,Charlotte Hornets,21.0,61.0,NBA,False,CHO
...,...,...,...,...,...,...,...
1249,1980,San Antonio Spurs,41.0,41.0,NBA,True,SAS
1250,1980,San Diego Clippers,35.0,47.0,NBA,False,SDC
1251,1980,Seattle SuperSonics,56.0,26.0,NBA,True,SEA
1252,1980,Utah Jazz,24.0,58.0,NBA,False,UTA


In [264]:
# Filtering advanced stats to the necessary columns (1980 season onwards).

advanced_stats_filtered = advanced_stats[['seas_id', 'season', 'tm', 'player_id', 'player', 'tov_percent', 'usg_percent', 'ows', 'dws', 'ws', 'obpm', 'dbpm', 'bpm', 'vorp']]
advanced_stats_filtered = advanced_stats_filtered[advanced_stats_filtered['season'] > 1979]
# Fill via assignment rather than a chained in-place fillna on each column, which
# raises SettingWithCopyWarning and does not update the frame under Copy-on-Write.
advanced_stats_filtered = advanced_stats_filtered.fillna({'tov_percent': 0, 'usg_percent': 0})
# Reduce every (player_id, season) group to a single row with one_row.
advanced_stats_filtered = advanced_stats_filtered.groupby(['player_id', 'season']).apply(one_row)

In [271]:
# groupby(['player_id', 'season']).apply(...) stacks the two group keys on top of the
# original row labels. A bare droplevel() removes only the outermost level, so a fresh
# run would leave a two-level index. Keeping just the innermost level restores the
# original row labels in one step, and re-running the cell is harmless.
advanced_stats_filtered.index = advanced_stats_filtered.index.get_level_values(-1)

In [288]:
# Filtering All-NBA voting data to the necessary columns (1980 season onwards).

# Build the row mask and the column selection in one .loc call on `all_nba` itself
# (the original indexed the column subset with a mask taken from a different frame).
# rename() returns a new frame, so the column assignment below does not write into a slice.
all_nba_filtered = all_nba.loc[
    all_nba['season'] > 1979,
    ['season', 'seas_id', 'player_id', 'player', 'number_tm', 'share'],
].rename(columns={'share': 'all_nba_share'})

# Standardizing All-NBA team values to integers: 1st/2nd/3rd team -> 1/2/3, 'ORV' -> 4.
# The mapped column is assigned back: a chained in-place replace on the selected column
# warns and does not update the frame under pandas Copy-on-Write.
# infer_objects() converts the column to an integer dtype once every label is mapped.
all_nba_filtered['number_tm'] = (
    all_nba_filtered['number_tm']
    .replace({'1T': 1, '1st': 1, '2T': 2, '2nd': 2, '3T': 3, '3rd': 3, 'ORV': 4})
    .infer_objects()
)
all_nba_filtered['number_tm'].unique()

array([1, 2, 3, 4], dtype=int64)

In [279]:
# Display the filtered All-NBA table.
# NOTE(review): the rendered output below this cell includes a `tm` column that the
# column selection for all_nba_filtered does not contain, and this cell's execution
# count is lower than that of the selection cell. The output looks stale; re-run to refresh.
all_nba_filtered

Unnamed: 0,season,seas_id,player_id,player,number_tm,tm,all_nba_share
0,2022,29918,4164,Giannis Antetokounmpo,1,MIL,1.000
1,2022,30247,4352,Nikola Jokić,1,DEN,0.952
2,2022,30158,4654,Luka Dončić,1,DAL,0.952
3,2022,29850,4321,Devin Booker,1,PHO,0.920
4,2022,30010,4518,Jayson Tatum,1,BOS,0.780
...,...,...,...,...,...,...,...
1537,1980,8076,1701,Dennis Johnson,2,SEA,0.485
1538,1980,8064,1634,Dan Roundfield,2,ATL,0.402
1539,1980,8118,1647,Gus Williams,2,SEA,0.379
1540,1980,8234,1605,Moses Malone,2,HOU,0.356


In [289]:
# Combine the per-game, MVP, team-record, advanced and All-NBA tables into one frame.
# Player-level tables join on (player, seas_id, season, player_id); team records join
# on (season, tm). Both per_game_filtered and advanced_stats_filtered carry a `tm`
# column, so after the third merge they appear as `tm_x` and `tm_y`.
# NOTE(review): every join is an outer join, so rows that exist only in a right-hand
# table are kept; confirm that is intended.

combined = (
    per_game_filtered
    .merge(mvp_winners, how='outer', on=['player', 'seas_id', 'season', 'player_id'])
    .merge(team_records, how='outer', on=['season', 'tm'])
    .merge(advanced_stats_filtered, how='outer', on=['player', 'seas_id', 'season', 'player_id'])
    .merge(all_nba_filtered, how='outer', on=['player', 'seas_id', 'season', 'player_id'])
)

In [293]:
# Replace every remaining missing value with 0, then confirm that no nulls are left.
# NOTE(review): the fill applies to all columns, so a missing value in a text column
# (player, pos, team, ...) also becomes the integer 0.
combined = combined.fillna(0)
combined.isnull().sum()

seas_id          0
season           0
player_id        0
player           0
pos              0
age              0
tm_x             0
g                0
mp_per_game      0
pts_per_game     0
fg_per_game      0
fga_per_game     0
fg_percent       0
x3p_per_game     0
x3pa_per_game    0
x3p_percent      0
e_fg_percent     0
ft_per_game      0
fta_per_game     0
orb_per_game     0
drb_per_game     0
trb_per_game     0
ast_per_game     0
stl_per_game     0
blk_per_game     0
pf_per_game      0
pts_won          0
pts_max          0
mvp_share        0
mvp              0
team             0
w                0
l                0
lg               0
playoffs         0
tm_y             0
tov_percent      0
usg_percent      0
ows              0
dws              0
ws               0
obpm             0
dbpm             0
bpm              0
vorp             0
number_tm        0
all_nba_share    0
dtype: int64