In [45]:
import pandas as pd
import numpy as np

In [46]:
files = {
    "2020-21": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2020_21_totals.csv",
    "2021-22": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2021_22_totals.csv",
    "2022-23": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2022_23_totals.csv",
    "2023-24": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2023_24_totals.csv",
    "2024-25": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2024_25_totals.csv"
}

dfs = []

for season, path in files.items():
    df = pd.read_csv(path)
    df["season"] = season
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
data.head()

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,...,AST,STL,BLK,TOV,PF,PTS,Trp-Dbl,Awards,Player-additional,season
0,1,Stephen Curry,32,GSW,PG,63,63,2152,658,1365,...,363,77,8,213,119,2015,0,MVP-3ASNBA1,curryst01,2020-21
1,2,Damian Lillard,30,POR,PG,67,67,2398,602,1334,...,505,62,17,203,102,1928,0,MVP-7ASNBA2,lillada01,2020-21
2,3,Nikola Jokić,25,DEN,C,72,72,2488,732,1293,...,599,95,48,222,192,1898,16,MVP-1ASNBA1,jokicni01,2020-21
3,4,Bradley Beal,27,WAS,SG,60,60,2147,670,1382,...,265,69,22,187,140,1878,0,ASNBA3,bealbr01,2020-21
4,5,Luka Dončić,21,DAL,PG,66,66,2262,647,1351,...,567,64,36,281,152,1830,11,MVP-6ASNBA1,doncilu01,2020-21


In [47]:
data = data[data["Player"] != "Player"]

data = data.rename(columns={
    'Player': 'player_name',
    'Team': 'team',
    'Age': 'age',
    'Pos': 'position',
    'G': 'games_played',
    'MP': 'minutes_played',
    'PTS': 'points',
    'TRB': 'rebounds',
    'AST': 'assists'
})

data = data[[
    "player_name", "team", "season", "age", "position",
    "games_played", "minutes_played",
    "points", "rebounds", "assists"
]]

numeric_cols = ["age", "games_played", "minutes_played", "points", "rebounds", "assists"]
for col in numeric_cols:
    data[col] = pd.to_numeric(data[col], errors="coerce")
data = data[[
    "player_name", "team", "season", "age", "position",
    "games_played", "minutes_played",
    "points", "rebounds", "assists"
]]

data

Unnamed: 0,player_name,team,season,age,position,games_played,minutes_played,points,rebounds,assists
0,Stephen Curry,GSW,2020-21,32,PG,63,2152,2015,345,363
1,Damian Lillard,POR,2020-21,30,PG,67,2398,1928,283,505
2,Nikola Jokić,DEN,2020-21,25,C,72,2488,1898,780,599
3,Bradley Beal,WAS,2020-21,27,SG,60,2147,1878,283,265
4,Luka Dončić,DAL,2020-21,21,PG,66,2262,1830,527,567
...,...,...,...,...,...,...,...,...,...,...
3661,Riley Minix,SAS,2024-25,24,SF,1,7,0,2,0
3662,Jahlil Okafor,IND,2024-25,29,C,1,3,0,1,1
3663,Zyon Pullin,MEM,2024-25,23,SG,3,3,0,0,0
3664,Isaiah Stevens,MIA,2024-25,24,PG,3,6,0,2,0


In [48]:
data["points_per_game"] = data["points"] / data["games_played"]
data["rebounds_per_game"] = data["rebounds"] / data["games_played"]
data["assists_per_game"] = data["assists"] / data["games_played"]
data["minutes_per_game"] = data["minutes_played"] / data["games_played"]

# Calculate advanced metrics
data["points_per_minute"] = data["points"] / data["minutes_played"]
data["assist_rate"] = data["assists"] / data["minutes_played"]
data["rebound_rate"] = data["rebounds"] / data["minutes_played"]

data = data.sort_values(["player_name", "season"])

data = data.fillna(0)
data

Unnamed: 0,player_name,team,season,age,position,games_played,minutes_played,points,rebounds,assists,points_per_game,rebounds_per_game,assists_per_game,minutes_per_game,points_per_minute,assist_rate,rebound_rate
2005,A.J. Green,MIL,2022-23,23,SG,35,345,154,45,22,4.400000,1.285714,0.628571,9.857143,0.446377,0.063768,0.130435
2592,A.J. Green,MIL,2023-24,24,SG,56,614,252,64,30,4.500000,1.142857,0.535714,10.964286,0.410423,0.048860,0.104235
3185,A.J. Green,MIL,2024-25,25,SG,73,1659,541,174,108,7.410959,2.383562,1.479452,22.726027,0.326100,0.065099,0.104882
2082,A.J. Lawson,2TM,2022-23,22,SG,15,108,56,21,2,3.733333,1.400000,0.133333,7.200000,0.518519,0.018519,0.194444
2083,A.J. Lawson,MIN,2022-23,22,SG,1,2,2,1,0,2.000000,1.000000,0.000000,2.000000,1.000000,0.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,Zion Williamson,NOP,2022-23,22,PF,29,956,754,202,133,26.000000,6.965517,4.586207,32.965517,0.788703,0.139121,0.211297
2222,Zion Williamson,NOP,2023-24,23,PF,70,2207,1601,406,352,22.871429,5.800000,5.028571,31.528571,0.725419,0.159493,0.183960
3102,Zion Williamson,NOP,2024-25,24,PF,30,857,737,216,159,24.566667,7.200000,5.300000,28.566667,0.859977,0.185531,0.252042
1495,Zylan Cheatham,UTA,2021-22,26,SF,1,5,0,0,0,0.000000,0.000000,0.000000,5.000000,0.000000,0.000000,0.000000


In [49]:
print(f"Records before creating target: {len(data)}")

data["target_next_season_ppg"] = (
    data.groupby("player_name")["points_per_game"].shift(-1)
)

# Filter out rows without targets
data = data.dropna(subset=["target_next_season_ppg"])
print(f"Records after filtering: {len(data)}")

data

Records before creating target: 3666
Records after filtering: 2675


Unnamed: 0,player_name,team,season,age,position,games_played,minutes_played,points,rebounds,assists,points_per_game,rebounds_per_game,assists_per_game,minutes_per_game,points_per_minute,assist_rate,rebound_rate,target_next_season_ppg
2005,A.J. Green,MIL,2022-23,23,SG,35,345,154,45,22,4.400000,1.285714,0.628571,9.857143,0.446377,0.063768,0.130435,4.500000
2592,A.J. Green,MIL,2023-24,24,SG,56,614,252,64,30,4.500000,1.142857,0.535714,10.964286,0.410423,0.048860,0.104235,7.410959
2082,A.J. Lawson,2TM,2022-23,22,SG,15,108,56,21,2,3.733333,1.400000,0.133333,7.200000,0.518519,0.018519,0.194444,2.000000
2083,A.J. Lawson,MIN,2022-23,22,SG,1,2,2,1,0,2.000000,1.000000,0.000000,2.000000,1.000000,0.000000,0.500000,3.857143
2084,A.J. Lawson,DAL,2022-23,22,SG,14,106,54,20,2,3.857143,1.428571,0.142857,7.571429,0.509434,0.018868,0.188679,3.238095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,Ziaire Williams,MEM,2022-23,21,SF,37,561,210,79,35,5.675676,2.135135,0.945946,15.162162,0.374332,0.062389,0.140820,8.235294
2495,Ziaire Williams,MEM,2023-24,22,SF,51,1038,420,180,75,8.235294,3.529412,1.470588,20.352941,0.404624,0.072254,0.173410,10.031746
9,Zion Williamson,NOP,2020-21,20,PF,61,2026,1647,441,226,27.000000,7.229508,3.704918,33.213115,0.812932,0.111550,0.217670,26.000000
1681,Zion Williamson,NOP,2022-23,22,PF,29,956,754,202,133,26.000000,6.965517,4.586207,32.965517,0.788703,0.139121,0.211297,22.871429


In [50]:
player_features = data[[
    "player_name",
    "team",
    "season",
    "age",
    "position",
    "games_played",
    "minutes_per_game",
    "points_per_game",
    "rebounds_per_game",
    "assists_per_game",
    "points_per_minute",
    "assist_rate",
    "rebound_rate",
    "target_next_season_ppg"
]]

print(f"\n Final dataset shape: {player_features.shape}")
print(f"   Columns: {list(player_features.columns)}")
player_features.head()


 Final dataset shape: (2675, 14)
   Columns: ['player_name', 'team', 'season', 'age', 'position', 'games_played', 'minutes_per_game', 'points_per_game', 'rebounds_per_game', 'assists_per_game', 'points_per_minute', 'assist_rate', 'rebound_rate', 'target_next_season_ppg']


Unnamed: 0,player_name,team,season,age,position,games_played,minutes_per_game,points_per_game,rebounds_per_game,assists_per_game,points_per_minute,assist_rate,rebound_rate,target_next_season_ppg
2005,A.J. Green,MIL,2022-23,23,SG,35,9.857143,4.4,1.285714,0.628571,0.446377,0.063768,0.130435,4.5
2592,A.J. Green,MIL,2023-24,24,SG,56,10.964286,4.5,1.142857,0.535714,0.410423,0.04886,0.104235,7.410959
2082,A.J. Lawson,2TM,2022-23,22,SG,15,7.2,3.733333,1.4,0.133333,0.518519,0.018519,0.194444,2.0
2083,A.J. Lawson,MIN,2022-23,22,SG,1,2.0,2.0,1.0,0.0,1.0,0.0,0.5,3.857143
2084,A.J. Lawson,DAL,2022-23,22,SG,14,7.571429,3.857143,1.428571,0.142857,0.509434,0.018868,0.188679,3.238095


In [51]:
# Save to CSV
output_path = '../data/processed/player_features.csv'
player_features.to_csv(output_path, index=False)

print(f"\n SUCCESS!")
print(f"   Saved {len(player_features)} rows with {len(player_features.columns)} columns")
print(f"   File: {output_path}")

# Verify no null values
print(f"   Data Quality Check:")
print(f"   Null values: {player_features.isnull().sum().sum()}")
print(f"   Unique players: {player_features['player_name'].nunique()}")


 SUCCESS!
   Saved 2675 rows with 14 columns
   File: ../data/processed/player_features.csv
   Data Quality Check:
   Null values: 0
   Unique players: 737
