In [21]:
import pandas as pd
import numpy as np

In [22]:
files = {
    "2020-21": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2020_21_totals.csv",
    "2021-22": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2021_22_totals.csv",
    "2022-23": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2022_23_totals.csv",
    "2023-24": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2023_24_totals.csv",
    "2024-25": "C:\\SattyGithub\\Research-Project\\data\\raw\\nba_2024_25_totals.csv"
}

dfs = []

for season, path in files.items():
    df = pd.read_csv(path)
    df["season"] = season
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)

In [23]:
data = data[data["Player"] != "Player"]  # remove repeated headers

data = data.rename(columns={
    "Player": "player_name",
    "Pos": "position",
    "Age": "age",
    "G": "games_played",
    "MP": "minutes_played",
    "PTS": "points",
    "TRB": "rebounds",
    "AST": "assists"
})

data = data[[
    "player_name", "season", "age", "position",
    "games_played", "minutes_played",
    "points", "rebounds", "assists"
]]

numeric_cols = ["age", "games_played", "minutes_played", "points", "rebounds", "assists"]
for col in numeric_cols:
    data[col] = pd.to_numeric(data[col], errors="coerce")
data = data[[
    "player_name", "season", "age", "position",
    "games_played", "minutes_played",
    "points", "rebounds", "assists"
]]

data

Unnamed: 0,player_name,season,age,position,games_played,minutes_played,points,rebounds,assists
0,Stephen Curry,2020-21,32,PG,63,2152,2015,345,363
1,Damian Lillard,2020-21,30,PG,67,2398,1928,283,505
2,Nikola Jokić,2020-21,25,C,72,2488,1898,780,599
3,Bradley Beal,2020-21,27,SG,60,2147,1878,283,265
4,Luka Dončić,2020-21,21,PG,66,2262,1830,527,567
...,...,...,...,...,...,...,...,...,...
3661,Riley Minix,2024-25,24,SF,1,7,0,2,0
3662,Jahlil Okafor,2024-25,29,C,1,3,0,1,1
3663,Zyon Pullin,2024-25,23,SG,3,3,0,0,0
3664,Isaiah Stevens,2024-25,24,PG,3,6,0,2,0


In [24]:
data["points_per_game"] = data["points"] / data["games_played"]
data["rebounds_per_game"] = data["rebounds"] / data["games_played"]
data["assists_per_game"] = data["assists"] / data["games_played"]
data["minutes_per_game"] = data["minutes_played"] / data["games_played"]

data["points_per_minute"] = data["points"] / data["minutes_played"]
data["assist_rate"] = data["assists"] / data["minutes_played"]
data["rebound_rate"] = data["rebounds"] / data["minutes_played"]

data = data.sort_values(["player_name", "season"])

data = data.fillna(0)
data

Unnamed: 0,player_name,season,age,position,games_played,minutes_played,points,rebounds,assists,points_per_game,rebounds_per_game,assists_per_game,minutes_per_game,points_per_minute,assist_rate,rebound_rate
2005,A.J. Green,2022-23,23,SG,35,345,154,45,22,4.400000,1.285714,0.628571,9.857143,0.446377,0.063768,0.130435
2592,A.J. Green,2023-24,24,SG,56,614,252,64,30,4.500000,1.142857,0.535714,10.964286,0.410423,0.048860,0.104235
3185,A.J. Green,2024-25,25,SG,73,1659,541,174,108,7.410959,2.383562,1.479452,22.726027,0.326100,0.065099,0.104882
2082,A.J. Lawson,2022-23,22,SG,15,108,56,21,2,3.733333,1.400000,0.133333,7.200000,0.518519,0.018519,0.194444
2083,A.J. Lawson,2022-23,22,SG,1,2,2,1,0,2.000000,1.000000,0.000000,2.000000,1.000000,0.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,Zion Williamson,2022-23,22,PF,29,956,754,202,133,26.000000,6.965517,4.586207,32.965517,0.788703,0.139121,0.211297
2222,Zion Williamson,2023-24,23,PF,70,2207,1601,406,352,22.871429,5.800000,5.028571,31.528571,0.725419,0.159493,0.183960
3102,Zion Williamson,2024-25,24,PF,30,857,737,216,159,24.566667,7.200000,5.300000,28.566667,0.859977,0.185531,0.252042
1495,Zylan Cheatham,2021-22,26,SF,1,5,0,0,0,0.000000,0.000000,0.000000,5.000000,0.000000,0.000000,0.000000


In [25]:
data["target_next_season_ppg"] = (
    data.groupby("player_name")["points_per_game"].shift(-1)
)

data = data.dropna(subset=["target_next_season_ppg"])
data

Unnamed: 0,player_name,season,age,position,games_played,minutes_played,points,rebounds,assists,points_per_game,rebounds_per_game,assists_per_game,minutes_per_game,points_per_minute,assist_rate,rebound_rate,target_next_season_ppg
2005,A.J. Green,2022-23,23,SG,35,345,154,45,22,4.400000,1.285714,0.628571,9.857143,0.446377,0.063768,0.130435,4.500000
2592,A.J. Green,2023-24,24,SG,56,614,252,64,30,4.500000,1.142857,0.535714,10.964286,0.410423,0.048860,0.104235,7.410959
2082,A.J. Lawson,2022-23,22,SG,15,108,56,21,2,3.733333,1.400000,0.133333,7.200000,0.518519,0.018519,0.194444,2.000000
2083,A.J. Lawson,2022-23,22,SG,1,2,2,1,0,2.000000,1.000000,0.000000,2.000000,1.000000,0.000000,0.500000,3.857143
2084,A.J. Lawson,2022-23,22,SG,14,106,54,20,2,3.857143,1.428571,0.142857,7.571429,0.509434,0.018868,0.188679,3.238095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,Ziaire Williams,2022-23,21,SF,37,561,210,79,35,5.675676,2.135135,0.945946,15.162162,0.374332,0.062389,0.140820,8.235294
2495,Ziaire Williams,2023-24,22,SF,51,1038,420,180,75,8.235294,3.529412,1.470588,20.352941,0.404624,0.072254,0.173410,10.031746
9,Zion Williamson,2020-21,20,PF,61,2026,1647,441,226,27.000000,7.229508,3.704918,33.213115,0.812932,0.111550,0.217670,26.000000
1681,Zion Williamson,2022-23,22,PF,29,956,754,202,133,26.000000,6.965517,4.586207,32.965517,0.788703,0.139121,0.211297,22.871429


In [26]:
player_features = data[[
    "player_name",
    "season",
    "age",
    "games_played",
    "minutes_per_game",
    "points_per_game",
    "rebounds_per_game",
    "assists_per_game",
    "points_per_minute",
    "assist_rate",
    "rebound_rate",
    "target_next_season_ppg"
]]

In [27]:
player_features.to_csv(
    "../data/processed/player_features.csv",
    index=False
)

print("player_features.csv created successfully for Module 1")

player_features.csv created successfully for Module 1
