In [36]:
import pandas as pd
import numpy as np

In [37]:
def load_season_data(season):
    """Read one season's game-log CSV and return it without its final row.

    Parameters
    ----------
    season : str
        Label interpolated into the file name
        ``sportsref_download_{season}.csv`` (a relative path, so it is
        resolved against the current working directory).

    Returns
    -------
    pandas.DataFrame
        Every row of the file except the last one, with the ``Date`` column
        parsed as dates using day-first interpretation.
    """
    csv_name = f'sportsref_download_{season}.csv'
    season_games = pd.read_csv(csv_name, parse_dates=['Date'], dayfirst=True)
    # The final row is dropped unconditionally; presumably it is a
    # summary/footer line in the export -- verify against the raw file.
    return season_games.iloc[:-1]

In [38]:
# Season labels; each one selects a sportsref_download_<label>.csv file.
seasons = ['20-21', '21-22', '22-23', '23-24', '24-25', '25-26']

# Load every season and stack them into a single frame with a fresh 0..n-1
# index, so `all_seasons` only ever refers to the combined DataFrame.
all_seasons = pd.concat(
    [load_season_data(label) for label in seasons],
    ignore_index=True,
)

In [39]:
# Give the export's terse headers descriptive names. 'Unnamed: 5' is the name
# pandas assigns to a column whose header cell is blank; here it holds '@' for
# away games. (The former duplicate 'GS' key was dead code: only the last
# mapping, 'Games_Started', ever took effect.)
all_seasons = all_seasons.rename(columns={
    'Unnamed: 5': 'Home/Away',
    'Rk': 'Rank',
    'Gcar': 'Games_Career',
    'Gtm': 'Games_Team',
    'GS': 'Games_Started',
})

# Venue: '@' means away; every other value (including missing) is treated as
# home. 'Away' is accepted too so that re-running this cell is a no-op.
is_away = all_seasons['Home/Away'].isin(['@', 'Away'])
all_seasons['Home/Away'] = np.where(is_away, 'Away', 'Home')
all_seasons['Home/Away_num'] = np.where(is_away, 0, 1)

# 'Result' looks like "L, 107-113". partition() always yields three columns
# (before, separator, after), so an unexpected value cannot break the column
# assignment, and strip() removes the blank that follows the comma.
result_parts = all_seasons['Result'].str.partition(',')
all_seasons['Result_type'] = result_parts[0].str.strip()
all_seasons['Score'] = result_parts[2].str.strip()

# Pull the two point totals out of "107-113" and store them as numbers rather
# than strings; anything that does not match becomes NaN instead of raising.
points = all_seasons['Score'].str.extract(r'^(\d+)\s*-\s*(\d+)')
all_seasons['Team_Score'] = pd.to_numeric(points[0], errors='coerce')
all_seasons['Opponent_Score'] = pd.to_numeric(points[1], errors='coerce')

# 1 for a win, 0 for any other result type.
all_seasons['Win_lose_num'] = np.where(all_seasons['Result_type'] == 'W', 1, 0)

# '*' in Games_Started flags a start; any other value (or a missing one)
# counts as not started.
started = all_seasons['Games_Started'] == '*'
all_seasons['Games_Started_num'] = np.where(started, 1, 0)
all_seasons['Games_Started_str'] = np.where(started, 'Yes', 'No')

In [40]:
# Persist the cleaned, combined frame to the working directory; index=False
# keeps the RangeIndex out of the file so it is not written as an extra column.
all_seasons.to_csv('processed_seasons_data.csv', index=False)

In [41]:
# Sanity check: print each column's non-null count and dtype for the combined frame.
all_seasons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Rank               438 non-null    float64       
 1   Games_Career       398 non-null    float64       
 2   Games_Team         438 non-null    float64       
 3   Date               438 non-null    datetime64[ns]
 4   Team               438 non-null    object        
 5   Home/Away          438 non-null    object        
 6   Opp                438 non-null    object        
 7   Result             438 non-null    object        
 8   Games_Started      288 non-null    object        
 9   MP                 398 non-null    object        
 10  FG                 398 non-null    float64       
 11  FGA                398 non-null    float64       
 12  FG%                396 non-null    float64       
 13  3P                 398 non-null    float64       
 14  3PA       

In [42]:
# Read the exported file back and display it as a round-trip check.
# NOTE(review): no parse_dates is passed, so 'Date' is presumably loaded as
# plain text here rather than datetime64 -- confirm if dtypes matter downstream.
pd.read_csv('processed_seasons_data.csv')

Unnamed: 0,Rank,Games_Career,Games_Team,Date,Team,Home/Away,Opp,Result,Games_Started,MP,...,GmSc,+/-,Home/Away_num,Result_type,Score,Team_Score,Opponent_Score,Win_lose_num,Games_Started_num,Games_Started_str
0,1.0,1.0,1.0,2020-12-23,WAS,Away,PHI,"L, 107-113",*,28:06:00,...,6.7,14.0,0,L,107-113,107,113,0,1,Yes
1,2.0,2.0,2.0,2020-12-26,WAS,Home,ORL,"L, 120-130",*,14:40,...,4.7,-7.0,1,L,120-130,120,130,0,1,Yes
2,3.0,3.0,3.0,2020-12-27,WAS,Home,ORL,"L, 113-120",*,36:11:00,...,9.1,13.0,1,L,113-120,113,120,0,1,Yes
3,4.0,4.0,4.0,2020-12-29,WAS,Home,CHI,"L, 107-115",*,26:19:00,...,9.7,-5.0,1,L,107-115,107,115,0,1,Yes
4,5.0,5.0,5.0,2020-12-31,WAS,Home,CHI,"L, 130-133",*,24:32:00,...,6.6,5.0,1,L,130-133,130,133,0,1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,35.0,394.0,35.0,2026-01-02,POR,Away,NOP,"W, 122-109",*,38:24:00,...,32.6,14.0,0,W,122-109,122,109,1,1,Yes
434,36.0,395.0,36.0,2026-01-03,POR,Away,SAS,"W, 115-110",*,38:50:00,...,25.1,10.0,0,W,115-110,115,110,1,1,Yes
435,37.0,396.0,37.0,2026-01-05,POR,Home,UTA,"W, 137-117",*,28:46:00,...,30.0,23.0,1,W,137-117,137,117,1,1,Yes
436,38.0,397.0,38.0,2026-01-07,POR,Home,HOU,"W, 103-102",*,39:08:00,...,27.4,0.0,1,W,103-102,103,102,1,1,Yes
