In [1]:
import requests
from bs4 import BeautifulSoup
from IPython.core.display import HTML
import pandas as pd
import io
import numpy as np
import time
from fuzzywuzzy import fuzz, process
from unidecode import unidecode
from tqdm import tqdm
from datetime import datetime
import ast
import re

In [2]:
nba_team_abbreviations = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    "Buffalo Braves": "BUF",
    "Charlotte Hornets": "CHH",
    "Charlotte Bobcats": "CHA",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Kansas City Kings": "KCK",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Jersey Nets": "NJN",
    "New Orleans/Oklahoma City Hornets": "NOK",
    "New Orleans Pelicans": "NOP",
    "New Orleans Hornets": "NOH",
    "New Orleans Jazz": "NOJ",
    "New York Knicks": "NYK",
    "New York Nets": "NYN",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "San Diego Clippers": "SDC",
    "Seattle SuperSonics": "SEA",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Vancouver Grizzlies": "VAN",
    "Washington Bullets": "WSB",
    "Washington Wizards": "WAS"
    }


nba_team_names = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "BUF": "Buffalo Braves",
    "CHH": "Charlotte Hornets",
    "CHA": "Charlotte Bobcats",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "KCK": "Kansas City Kings",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NJN": "New Jersey Nets",
    "NOK": "New Orleans/Oklahoma City Hornets",
    "NOP": "New Orleans Pelicans",
    "NOH": "New Orleans Hornets",
    "NOJ": "New Orleans Jazz",
    "NYK": "New York Knicks",
    "NYN": "New York Nets",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "SDC": "San Diego Clippers",
    "SEA": "Seattle SuperSonics",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "VAN": "Vancouver Grizzlies",
    "WSB": "Washington Bullets",
    "WAS": "Washington Wizards"
}

nba_games_per_season = {
    2001: 82,
    2002: 82,
    2003: 82,
    2004: 82,
    2005: 82,
    2006: 82,
    2007: 82,
    2008: 82,
    2009: 82,
    2010: 82,
    2011: 82,
    2012: 66,  # Lockout-shortened
    2013: 82,
    2014: 82,
    2015: 82,
    2016: 82,
    2017: 82,
    2018: 82,
    2019: 82,
#    2020: 82,
    2021: 72,  # Pandemic-shortened
    2022: 82,
    2023: 82,
    2024: 82,
    2025: 82
}


nba_season_midpoints = {
    2001: datetime(2001, 1, 22),
    2002: datetime(2002, 1, 21),
    2003: datetime(2003, 1, 20),
    2004: datetime(2004, 1, 19),
    2005: datetime(2005, 1, 24),
    2006: datetime(2006, 1, 23),
    2007: datetime(2007, 1, 22),
    2008: datetime(2008, 1, 21),
    2009: datetime(2009, 1, 26),
    2010: datetime(2010, 1, 25),
    2011: datetime(2011, 1, 24),
    2012: datetime(2012, 2, 15),  # Lockout season
    2013: datetime(2013, 1, 22),
    2014: datetime(2014, 1, 21),
    2015: datetime(2015, 1, 26),
    2016: datetime(2016, 1, 25),
    2017: datetime(2017, 1, 23),
    2018: datetime(2018, 1, 22),
    2019: datetime(2019, 1, 21),
#    2020: datetime(2020, 1, 20),
    2021: datetime(2021, 2, 20),  # Shortened, started in Dec
    2022: datetime(2022, 1, 20),
    2023: datetime(2023, 1, 20),
    2024: datetime(2024, 1, 22),
    2025: datetime(2025, 1, 20)
}

nba_season_names = {
    1981: '1980-81',
    1982: '1981-82',
    1983: '1982-83',
    1984: '1983-84',
    1985: '1984-85',
    1986: '1985-86',
    1987: '1986-87',
    1988: '1987-88',
    1989: '1988-89',
    1990: '1989-90',
    1991: '1990-91',
    1992: '1991-92',
    1993: '1992-93',
    1994: '1993-94',
    1995: '1994-95',
    1996: '1995-96',
    1997: '1996-97',
    1998: '1997-98',
    1999: '1998-99',
    2000: '1999-00',
    2001: '2000-01',
    2002: '2001-02',
    2003: '2002-03',
    2004: '2003-04',
    2005: '2004-05',
    2006: '2005-06',
    2007: '2006-07',
    2008: '2007-08',
    2009: '2008-09',
    2010: '2009-10',
    2011: '2010-11',
    2012: '2011-12',
    2013: '2012-13',
    2014: '2013-14',
    2015: '2014-15',
    2016: '2015-16',
    2017: '2016-17',
    2018: '2017-18',
    2019: '2018-19',
    2020: '2019-20',
    2021: '2020-21',
    2022: '2021-22',
    2023: '2022-23',
    2024: '2023-24',
    2025: '2024-25'
}

reversed_nba_season_names = {
    '1980-81': 1981,
    '1981-82': 1982,
    '1982-83': 1983,
    '1983-84': 1984,
    '1984-85': 1985,
    '1985-86': 1986,
    '1986-87': 1987,
    '1987-88': 1988,
    '1988-89': 1989,
    '1989-90': 1990,
    '1990-91': 1991,
    '1991-92': 1992,
    '1992-93': 1993,
    '1993-94': 1994,
    '1994-95': 1995,
    '1995-96': 1996,
    '1996-97': 1997,
    '1997-98': 1998,
    '1998-99': 1999,
    '1999-00': 2000,
    '2000-01': 2001,
    '2001-02': 2002,
    '2002-03': 2003,
    '2003-04': 2004,
    '2004-05': 2005,
    '2005-06': 2006,
    '2006-07': 2007,
    '2007-08': 2008,
    '2008-09': 2009,
    '2009-10': 2010,
    '2010-11': 2011,
    '2011-12': 2012,
    '2012-13': 2013,
    '2013-14': 2014,
    '2014-15': 2015,
    '2015-16': 2016,
    '2016-17': 2017,
    '2017-18': 2018,
    '2018-19': 2019,
    '2019-20': 2020,
    '2020-21': 2021,
    '2021-22': 2022,
    '2022-23': 2023,
    '2023-24': 2024,
    '2024-25': 2025
}

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
def get_season_data(year):
    data = pd.read_csv(f'../data/regularseasoncleaned/regseacle_nba_data{year}.csv')
    data = data[['Date', 'Starters', 'TeamName', 'WonGame',
       'Injured', 'DidNotPlay', 'FantasyPoints', 'MPTimeDelta', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-',
       'TeamAbv', 'Year', 'Day', 'Month#', 'MonthName', 'Day_of_week', 'Home',
       'GamePointDiff', 'Start(ET)', 'Overtime', 'Attend.',
       'InSeasonTournament', 'GameID', 'OpponentTeam', 'OpponentTeamAbv',
       'InjTeamateCount', 'Starting', 'Top7InTeam']]
    
    return data   

get_season_data(2024)

Unnamed: 0,Date,Starters,TeamName,WonGame,Injured,DidNotPlay,FantasyPoints,MPTimeDelta,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,TeamAbv,Year,Day,Month#,MonthName,Day_of_week,Home,GamePointDiff,Start(ET),Overtime,Attend.,InSeasonTournament,GameID,OpponentTeam,OpponentTeamAbv,InjTeamateCount,Starting,Top7InTeam
0,2023-10-24,D'Angelo Russell,Los Angeles Lakers,False,False,False,16.5,0 days 00:36:11,36:11,4.0,12.0,0.333,2.0,5.0,0.400,1.0,2.0,0.500,0.0,4.0,4.0,7.0,1.0,0.0,3.0,3.0,11.0,6.7,1.0,LAL,2023,24,10,october,Tuesday,False,-12,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
1,2023-10-24,Anthony Davis,Los Angeles Lakers,False,False,False,23.0,0 days 00:34:09,34:09,6.0,17.0,0.353,1.0,2.0,0.500,4.0,4.0,1.000,1.0,7.0,8.0,4.0,0.0,2.0,2.0,3.0,17.0,11.3,-17.0,LAL,2023,24,10,october,Tuesday,False,-12,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
2,2023-10-24,Austin Reaves,Los Angeles Lakers,False,False,False,21.5,0 days 00:31:20,31:20,4.0,11.0,0.364,1.0,2.0,0.500,5.0,7.0,0.714,4.0,4.0,8.0,4.0,2.0,0.0,2.0,2.0,14.0,13.1,-14.0,LAL,2023,24,10,october,Tuesday,False,-12,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
3,2023-10-24,Taurean Prince,Los Angeles Lakers,False,False,False,16.0,0 days 00:29:53,29:53,6.0,8.0,0.750,4.0,6.0,0.667,2.0,2.0,1.000,1.0,2.0,3.0,1.0,0.0,1.0,1.0,0.0,18.0,16.5,-14.0,LAL,2023,24,10,october,Tuesday,False,-12,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
4,2023-10-24,LeBron James,Los Angeles Lakers,False,False,False,26.0,0 days 00:29:00,29:00,10.0,16.0,0.625,1.0,4.0,0.250,0.0,1.0,0.000,1.0,7.0,8.0,5.0,1.0,0.0,0.0,1.0,21.0,20.3,7.0,LAL,2023,24,10,october,Tuesday,False,-12,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43529,2024-04-19,Matt Ryan,New Orleans Pelicans,True,False,True,0.0,0 days 00:00:00,0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NOP,2024,19,4,april,Friday,True,7,9:30p,False,18656,False,2024194NOP,Sacramento Kings,SAC,1.0,False,False
43530,2024-04-19,Cody Zeller,New Orleans Pelicans,True,False,True,0.0,0 days 00:00:00,0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NOP,2024,19,4,april,Friday,True,7,9:30p,False,18656,False,2024194NOP,Sacramento Kings,SAC,1.0,False,False
43531,2024-04-19,Kevin Huerter,Sacramento Kings,False,True,True,0.0,0 days 00:00:00,0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SAC,2024,19,4,april,Friday,False,-7,9:30p,False,18656,False,2024194NOP,New Orleans Pelicans,NOP,2.0,False,False
43532,2024-04-19,Malik Monk,Sacramento Kings,False,True,True,0.0,0 days 00:00:00,0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SAC,2024,19,4,april,Friday,False,-7,9:30p,False,18656,False,2024194NOP,New Orleans Pelicans,NOP,2.0,False,False


In [5]:
def total_season_grouped_data(year):
    data = get_season_data(year)
    data['Date'] = pd.to_datetime(data['Date'])
    data['MPTimeDelta'] = pd.to_timedelta(data['MPTimeDelta'])
    data['MinPlayedNum'] = data['MPTimeDelta'].dt.total_seconds() / 60
    data = data[data['DidNotPlay'] == False]
    second_halfdata = data[data['Date'] > nba_season_midpoints[year]]
    
    
    full_season = data.groupby('Starters').agg(
        S_TotalWins=('WonGame', 'sum'),
        S_GamesPlayed=('Starters', 'count'),
        S_AvgPoints=('PTS', 'mean'),
        S_AvgAssists=('AST', 'mean'),
        S_AvgRebounds=('TRB','mean'),
        S_AvgSteals=('STL','mean'),
        S_AvgBlocks=('BLK','mean'),
        S_AvgTurnovers=('TOV','mean'),
        S_AvgFG=('FG','mean'),
        S_AvgFGA=('FGA','mean'),
        S_Avg3P=('3P','mean'),
        S_Avg3PA=('3PA','mean'),
        S_AvgFT=('FT','mean'),
        S_AvgFTA=('FTA','mean'),
        S_FantasyPoints=('FantasyPoints','mean'),
        S_MinutesPlayed=('MinPlayedNum','mean'),
        S_GamePointDiff=('GamePointDiff','mean'),
        S_StartingCount=('Starting','count'),
        S_Top7Team=('Top7InTeam','count')
        
        
        
    ).reset_index()
    
    half_season = second_halfdata.groupby('Starters').agg(
        HS_TotalWins=('WonGame', 'sum'),
        HS_GamesPlayed=('Starters', 'count'),
        HS_AvgPoints=('PTS', 'mean'),
        HS_AvgAssists=('AST', 'mean'),
        HS_AvgRebounds=('TRB','mean'),
        HS_AvgSteals=('STL','mean'),
        HS_AvgBlocks=('BLK','mean'),
        HS_AvgTurnovers=('TOV','mean'),
        HS_AvgFG=('FG','mean'),
        HS_AvgFGA=('FGA','mean'),
        HS_Avg3P=('3P','mean'),
        HS_Avg3PA=('3PA','mean'),
        HS_AvgFT=('FT','mean'),
        HS_AvgFTA=('FTA','mean'),
        HS_FantasyPoints=('FantasyPoints','mean'),
        HS_MinutesPlayed=('MinPlayedNum','mean'),
        HS_GamePointDiff=('GamePointDiff','mean'),
        HS_StartingCount=('Starting','count'),
        HS_Top7Team=('Top7InTeam','count')
        
        
        
    ).reset_index()
    
    
    full_season['Season'] = year
    full_season['TotalGamesSeason'] = nba_games_per_season[year]
    
    merged_df = pd.merge(full_season, half_season, on='Starters', how='left')
    merged_df = merged_df.fillna(0)
    
    
    return merged_df


In [6]:
total_season_grouped_data(2025).columns

Index(['Starters', 'S_TotalWins', 'S_GamesPlayed', 'S_AvgPoints',
       'S_AvgAssists', 'S_AvgRebounds', 'S_AvgSteals', 'S_AvgBlocks',
       'S_AvgTurnovers', 'S_AvgFG', 'S_AvgFGA', 'S_Avg3P', 'S_Avg3PA',
       'S_AvgFT', 'S_AvgFTA', 'S_FantasyPoints', 'S_MinutesPlayed',
       'S_GamePointDiff', 'S_StartingCount', 'S_Top7Team', 'Season',
       'TotalGamesSeason', 'HS_TotalWins', 'HS_GamesPlayed', 'HS_AvgPoints',
       'HS_AvgAssists', 'HS_AvgRebounds', 'HS_AvgSteals', 'HS_AvgBlocks',
       'HS_AvgTurnovers', 'HS_AvgFG', 'HS_AvgFGA', 'HS_Avg3P', 'HS_Avg3PA',
       'HS_AvgFT', 'HS_AvgFTA', 'HS_FantasyPoints', 'HS_MinutesPlayed',
       'HS_GamePointDiff', 'HS_StartingCount', 'HS_Top7Team'],
      dtype='object')

In [7]:
def add_general_data(year, add_files = False):
    
    season_agg_data = total_season_grouped_data(year)
    
    total_gen_data = pd.read_csv(f'../data/all_players_complete4_data.csv')
    
    duplicateRows = total_gen_data[total_gen_data.duplicated(['Player'])]
    throwoutplayers = list(duplicateRows['Player'])
    
    season_agg_data = season_agg_data[season_agg_data.Starters.isin(throwoutplayers) == False]
    
    combined_data = pd.merge(total_gen_data, season_agg_data, left_on='Player', right_on='Starters', how='right')
    
    lost_rows_after_merge = len(season_agg_data) - len(combined_data)
    print(f'Number of rows lost with combination: {lost_rows_after_merge}')
    
    
    
    combined_data['Age'] = combined_data.apply(
    lambda row: ast.literal_eval(row["SeasonAge"]).get(str(nba_season_names[row["Season"]])) if pd.notnull(row["SeasonAge"]) else None,
    axis=1
    )
    
    combined_data['PrimaryPosition'] = combined_data.apply(
    lambda row: ast.literal_eval(row["SeasonPositions"]).get(str(nba_season_names[row["Season"]])) if pd.notnull(row["SeasonPositions"]) else None,
    axis=1
    )
    
    combined_data['Team'] = combined_data.apply(
    lambda row: ast.literal_eval(row["SeasonTeams"]).get(str(nba_season_names[row["Season"]])) if pd.notnull(row["SeasonTeams"]) else None,
    axis=1
    )
    
    combined_data['FirstSeason'] = combined_data.apply(
    lambda row: next(iter(ast.literal_eval(row["SeasonAge"]))) if pd.notnull(row["SeasonAge"]) else None,
    axis=1
    )
    

    
    
    def clean_positions(row):
        pos_list = [row['Pos1'], row['Pos2'], row['Pos3'], row['Pos4'], row['Pos5']]
        primary = row['PrimaryPosition']

        
        pos_list = [p for p in pos_list if pd.notnull(p)]

        seen = set()
        pos_list = [x for x in pos_list if not (x in seen or seen.add(x))]

        if pd.notnull(primary):
            pos_list = [primary] + [p for p in pos_list if p != primary]
        else:
            primary = pos_list[0] if pos_list else None
            pos_list = [primary] + [p for p in pos_list if p != primary]

            
        if len(pos_list) == 0:
            pos_list = [None, None, None]
        elif len(pos_list) == 1:
            pos_list = [pos_list[0]] * 3
        elif len(pos_list) == 2:
            pos_list = [pos_list[0], pos_list[1], pos_list[1]]
        else:
            pos_list = pos_list[:3]

        return pd.Series(pos_list, index=['NewPos1', 'NewPos2', 'NewPos3'])


    new_positions = combined_data.apply(clean_positions, axis=1)
    combined_data[['NewPos1', 'NewPos2', 'NewPos3']] = new_positions
    
    combined_data = combined_data.drop(['Pos1','Pos2','Pos3','Pos4','Pos5'], axis=1)
    
    main_data = combined_data[['Player', 'Ht', 'Wt', 'Birth Date', 'Colleges',
                              'PlayerID',
       'RecuitRank', 'TeamDrafted', 'PickDrafted','Starters', 'S_TotalWins', 'S_GamesPlayed',
       'S_AvgPoints', 'S_AvgAssists', 'S_AvgRebounds', 'S_AvgSteals',
       'S_AvgBlocks', 'S_AvgTurnovers', 'S_AvgFG', 'S_AvgFGA', 'S_Avg3P',
       'S_Avg3PA', 'S_AvgFT', 'S_AvgFTA', 'S_FantasyPoints', 'S_MinutesPlayed',
       'S_GamePointDiff', 'S_StartingCount', 'S_Top7Team', 'Season',
       'TotalGamesSeason', 'HS_TotalWins', 'HS_GamesPlayed', 'HS_AvgPoints',
       'HS_AvgAssists', 'HS_AvgRebounds', 'HS_AvgSteals', 'HS_AvgBlocks',
       'HS_AvgTurnovers', 'HS_AvgFG', 'HS_AvgFGA', 'HS_Avg3P', 'HS_Avg3PA',
       'HS_AvgFT', 'HS_AvgFTA', 'HS_FantasyPoints', 'HS_MinutesPlayed',
       'HS_GamePointDiff', 'HS_StartingCount', 'HS_Top7Team', 'Age',
       'PrimaryPosition', 'Team', 'NewPos1', 'NewPos2', 'NewPos3', 'FirstSeason']]
    
    main_data['FirstSeasonYear'] = main_data['FirstSeason'].map(reversed_nba_season_names)
    
    main_data['YearsExperience'] = main_data['Season'] - main_data['FirstSeasonYear'] 
    
    def compute_pick_number(x):
        if pd.isnull(x):
            return np.nan
        numbers = list(map(int, re.findall(r'\d+', str(x))))
        if len(numbers) == 2:
            round_num, pick_num = numbers
            return (round_num - 1) * 30 + pick_num
        return np.nan

    
    main_data['PickDraftedNumber'] = main_data['PickDrafted'].apply(compute_pick_number)
    
    if add_files:    
        main_data.to_csv(f"../data/seasonfullstatsreal/playerbasedata{year}.csv")
        combined_data.to_csv(f"../data/seasonfullstatsrookiereal/playerbasedata{year}.csv")    
    
    return main_data




In [33]:
test2025 = add_general_data(2001, add_files = True)

Number of rows lost with combination: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data['FirstSeasonYear'] = main_data['FirstSeason'].map(reversed_nba_season_names)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data['YearsExperience'] = main_data['Season'] - main_data['FirstSeasonYear']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_data['PickDraftedNumber'] = mai

In [35]:
base_df = pd.DataFrame()



for i in [2025,2024,2023,2022,2021,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001]:
    year_data = pd.read_csv(f'../data/seasonfullstatsreal/playerbasedata{i}.csv')
    base_df = pd.concat([base_df, year_data], ignore_index=True)
    print(f"done with year {i}")
    time.sleep(5)

done with year 2025
done with year 2024
done with year 2023
done with year 2022
done with year 2021
done with year 2019
done with year 2018
done with year 2017
done with year 2016
done with year 2015
done with year 2014
done with year 2013
done with year 2012
done with year 2011
done with year 2010
done with year 2009
done with year 2008
done with year 2007
done with year 2006
done with year 2005
done with year 2004
done with year 2003
done with year 2002
done with year 2001


In [38]:
base_df.to_csv(f"../data/seasonfullstatsreal/playerbasedatatotal.csv")

In [43]:
base_df = base_df.sort_values(by=['PlayerID','Season'])
base_df.head(3)

Unnamed: 0.1,Unnamed: 0,Player,Ht,Wt,Birth Date,Colleges,PlayerID,RecuitRank,TeamDrafted,PickDrafted,Starters,S_TotalWins,S_GamesPlayed,S_AvgPoints,S_AvgAssists,S_AvgRebounds,S_AvgSteals,S_AvgBlocks,S_AvgTurnovers,S_AvgFG,S_AvgFGA,S_Avg3P,S_Avg3PA,S_AvgFT,S_AvgFTA,S_FantasyPoints,S_MinutesPlayed,S_GamePointDiff,S_StartingCount,S_Top7Team,Season,TotalGamesSeason,HS_TotalWins,HS_GamesPlayed,HS_AvgPoints,HS_AvgAssists,HS_AvgRebounds,HS_AvgSteals,HS_AvgBlocks,HS_AvgTurnovers,HS_AvgFG,HS_AvgFGA,HS_Avg3P,HS_Avg3PA,HS_AvgFT,HS_AvgFTA,HS_FantasyPoints,HS_MinutesPlayed,HS_GamePointDiff,HS_StartingCount,HS_Top7Team,Age,PrimaryPosition,Team,NewPos1,NewPos2,NewPos3,FirstSeason,FirstSeasonYear,YearsExperience,PickDraftedNumber
11484,268,Mahmoud Abdul-Rauf,6.083333,162.0,1969-03-09 00:00:00,LSU,abdulma02,,Denver Nuggets,1st round (3rd pick,Mahmoud Abdul-Rauf,11,41,6.487805,1.853659,0.609756,0.219512,0.02439,0.634146,2.926829,6.0,0.097561,0.341463,0.536585,0.707317,5.609756,11.83374,-8.0,41,41,2001,82,9.0,31.0,7.548387,2.0,0.612903,0.258065,0.032258,0.774194,3.419355,6.612903,0.129032,0.451613,0.580645,0.806452,6.258065,12.714516,-7.225806,31.0,31.0,31.0,PG,VAN,PG,PG,PG,1990-91,1991.0,10.0,3.0
11606,390,Tariq Abdul-Wahad,6.5,223.0,1974-11-03 00:00:00,"Michigan, San Jose State",abdulta01,,Sacramento Kings,1st round (11th pick,Tariq Abdul-Wahad,15,29,3.827586,0.758621,2.034483,0.482759,0.448276,1.172414,1.482759,3.827586,0.137931,0.344828,0.724138,1.241379,5.465517,14.507471,-3.931034,29,29,2001,82,3.0,6.0,1.5,0.166667,1.5,0.0,0.166667,0.833333,0.666667,2.5,0.0,0.166667,0.166667,0.666667,1.916667,8.816667,-6.166667,6.0,6.0,26.0,SG,DEN,SG,SG,SG,1997-98,1998.0,3.0,11.0
11164,387,Tariq Abdul-Wahad,6.5,223.0,1974-11-03 00:00:00,"Michigan, San Jose State",abdulta01,,Sacramento Kings,1st round (11th pick,Tariq Abdul-Wahad,12,24,5.625,1.0,3.5,0.833333,0.416667,1.125,2.291667,6.125,0.041667,0.083333,1.0,1.375,8.708333,18.434028,-3.625,24,24,2002,82,6.0,12.0,3.916667,0.75,2.75,0.75,0.25,1.0,1.583333,3.666667,0.0,0.0,0.75,0.916667,6.458333,12.090278,-6.333333,12.0,12.0,27.0,SG,2TM,SG,SG,SG,1997-98,1998.0,4.0,11.0


In [44]:
stat_cols = ['S_TotalWins', 'S_GamesPlayed', 'S_AvgPoints'
             ,'S_AvgAssists','S_AvgRebounds','S_AvgSteals'
             ,'S_AvgBlocks','S_AvgTurnovers','S_AvgFG',
             'S_AvgFGA','S_Avg3P','S_Avg3PA','S_AvgFT',
             'S_AvgFTA','S_FantasyPoints','S_MinutesPlayed',
             'S_GamePointDiff','S_StartingCount','S_Top7Team',
             'TotalGamesSeason']

In [46]:
for col in stat_cols:
    
    base_df[f'{col}_prevyear'] = base_df.groupby('PlayerID')[col].shift(1).fillna(0)

In [52]:
for col in stat_cols:
    base_df[f'{col}_prev5years'] = (
        base_df.groupby('PlayerID')[col]
        .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
        .fillna(0)
    )

In [None]:
for col in stat_cols:
    base_df[f'{col}_next1'] = (
        base_df.groupby('PlayerID')[col].shift(-1)
    )

In [53]:
base_df[base_df['PlayerID'] == 'zubaciv01']

Unnamed: 0.1,Unnamed: 0,Player,Ht,Wt,Birth Date,Colleges,PlayerID,RecuitRank,TeamDrafted,PickDrafted,Starters,S_TotalWins,S_GamesPlayed,S_AvgPoints,S_AvgAssists,S_AvgRebounds,S_AvgSteals,S_AvgBlocks,S_AvgTurnovers,S_AvgFG,S_AvgFGA,S_Avg3P,S_Avg3PA,S_AvgFT,S_AvgFTA,S_FantasyPoints,S_MinutesPlayed,S_GamePointDiff,S_StartingCount,S_Top7Team,Season,TotalGamesSeason,HS_TotalWins,HS_GamesPlayed,HS_AvgPoints,HS_AvgAssists,HS_AvgRebounds,HS_AvgSteals,HS_AvgBlocks,HS_AvgTurnovers,HS_AvgFG,HS_AvgFGA,HS_Avg3P,HS_Avg3PA,HS_AvgFT,HS_AvgFTA,HS_FantasyPoints,HS_MinutesPlayed,HS_GamePointDiff,HS_StartingCount,HS_Top7Team,Age,PrimaryPosition,Team,NewPos1,NewPos2,NewPos3,FirstSeason,FirstSeasonYear,YearsExperience,PickDraftedNumber,S_TotalWins_prevyear,S_GamesPlayed_prevyear,S_AvgPoints_prevyear,S_AvgAssists_prevyear,S_AvgRebounds_prevyear,S_AvgSteals_prevyear,S_AvgBlocks_prevyear,S_AvgTurnovers_prevyear,S_AvgFG_prevyear,S_AvgFGA_prevyear,S_Avg3P_prevyear,S_Avg3PA_prevyear,S_AvgFT_prevyear,S_AvgFTA_prevyear,S_FantasyPoints_prevyear,S_MinutesPlayed_prevyear,S_GamePointDiff_prevyear,S_StartingCount_prevyear,S_Top7Team_prevyear,TotalGamesSeason_prevyear,S_TotalWins_prev5years,S_GamesPlayed_prev5years,S_AvgPoints_prev5years,S_AvgAssists_prev5years,S_AvgRebounds_prev5years,S_AvgSteals_prev5years,S_AvgBlocks_prev5years,S_AvgTurnovers_prev5years,S_AvgFG_prev5years,S_AvgFGA_prev5years,S_Avg3P_prev5years,S_Avg3PA_prev5years,S_AvgFT_prev5years,S_AvgFTA_prev5years,S_FantasyPoints_prev5years,S_MinutesPlayed_prev5years,S_GamePointDiff_prev5years,S_StartingCount_prev5years,S_Top7Team_prev5years,TotalGamesSeason_prev5years
4068,178,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,8,38,7.473684,0.789474,4.184211,0.368421,0.868421,0.789474,3.315789,6.263158,0.0,0.078947,0.842105,1.289474,10.5,16.013596,-11.342105,38,38,2017,82,5.0,25.0,9.52,1.0,4.92,0.4,1.04,0.92,4.24,7.76,0.0,0.12,1.04,1.64,12.76,19.106,-9.72,25.0,25.0,19.0,C,LAL,C,C,C,2016-17,2017.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3545,194,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,19,43,3.744186,0.581395,2.860465,0.186047,0.348837,0.604651,1.418605,2.837209,0.0,0.023256,0.906977,1.186047,5.802326,9.531008,-0.55814,43,43,2018,82,14.0,30.0,4.966667,0.8,3.766667,0.266667,0.5,0.7,1.833333,3.6,0.0,0.033333,1.3,1.7,7.916667,12.058889,1.6,30.0,30.0,20.0,C,LAL,C,C,C,2016-17,2017.0,1.0,32.0,8.0,38.0,7.473684,0.789474,4.184211,0.368421,0.868421,0.789474,3.315789,6.263158,0.0,0.078947,0.842105,1.289474,10.5,16.013596,-11.342105,38.0,38.0,82.0,8.0,38.0,7.473684,0.789474,4.184211,0.368421,0.868421,0.789474,3.315789,6.263158,0.0,0.078947,0.842105,1.289474,10.5,16.013596,-11.342105,38.0,38.0,82.0
3026,204,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,32,59,8.898305,1.067797,6.135593,0.237288,0.864407,1.186441,3.59322,6.423729,0.0,0.0,1.711864,2.135593,12.805085,17.614689,0.237288,59,59,2019,82,20.0,32.0,9.6875,1.40625,7.5625,0.3125,0.9375,1.53125,3.9375,7.53125,0.0,0.0,1.8125,2.4375,14.9375,20.596354,0.03125,32.0,32.0,21.0,C,2TM,C,C,C,2016-17,2017.0,2.0,32.0,19.0,43.0,3.744186,0.581395,2.860465,0.186047,0.348837,0.604651,1.418605,2.837209,0.0,0.023256,0.906977,1.186047,5.802326,9.531008,-0.55814,43.0,43.0,82.0,13.5,40.5,5.608935,0.685435,3.522338,0.277234,0.608629,0.697062,2.367197,4.550184,0.0,0.051102,0.874541,1.23776,8.151163,12.772302,-5.950122,40.5,40.5,82.0
2493,210,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,47,72,9.027778,1.25,7.208333,0.333333,0.861111,1.125,3.569444,5.472222,0.013889,0.055556,1.875,2.375,14.423611,22.341204,6.180556,72,72,2021,72,25.0,41.0,9.682927,1.365854,7.634146,0.292683,0.926829,1.195122,3.95122,6.097561,0.02439,0.073171,1.756098,2.341463,15.292683,24.537398,5.731707,41.0,41.0,23.0,C,LAC,C,C,C,2016-17,2017.0,4.0,32.0,32.0,59.0,8.898305,1.067797,6.135593,0.237288,0.864407,1.186441,3.59322,6.423729,0.0,0.0,1.711864,2.135593,12.805085,17.614689,0.237288,59.0,59.0,82.0,19.666667,46.666667,6.705392,0.812889,4.393423,0.263919,0.693888,0.860189,2.775871,5.174699,0.0,0.034068,1.153649,1.537038,9.70247,14.386431,-3.887652,46.666667,46.666667,82.0
1913,234,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,39,77,10.246753,1.584416,8.441558,0.467532,1.025974,1.519481,4.038961,6.454545,0.0,0.0,2.168831,3.0,16.941558,24.317749,0.376623,77,77,2022,82,19.0,36.0,10.888889,1.972222,8.888889,0.416667,0.972222,1.75,4.361111,7.416667,0.0,0.0,2.166667,3.138889,17.722222,23.97963,1.361111,36.0,36.0,24.0,C,LAC,C,C,C,2016-17,2017.0,5.0,32.0,47.0,72.0,9.027778,1.25,7.208333,0.333333,0.861111,1.125,3.569444,5.472222,0.013889,0.055556,1.875,2.375,14.423611,22.341204,6.180556,72.0,72.0,72.0,26.5,53.0,7.285988,0.922166,5.097151,0.281272,0.735694,0.926391,2.974265,5.24908,0.003472,0.03944,1.333987,1.746528,10.882755,16.375124,-1.3706,53.0,53.0,79.5
1330,191,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,43,76,10.763158,1.013158,9.947368,0.381579,1.289474,1.539474,4.289474,6.763158,0.0,0.026316,2.184211,3.131579,18.5,28.546272,1.105263,76,76,2023,82,20.0,30.0,11.466667,0.966667,9.166667,0.366667,1.266667,1.033333,4.6,7.0,0.0,0.0,2.266667,3.266667,18.433333,27.323333,4.333333,30.0,30.0,25.0,C,LAC,C,C,C,2016-17,2017.0,6.0,32.0,39.0,77.0,10.246753,1.584416,8.441558,0.467532,1.025974,1.519481,4.038961,6.454545,0.0,0.0,2.168831,3.0,16.941558,24.317749,0.376623,77.0,77.0,82.0,29.0,57.8,7.878141,1.054616,5.766032,0.318524,0.79375,1.045009,3.187204,5.490173,0.002778,0.031552,1.500956,1.997223,12.094516,17.963649,-1.021156,57.8,57.8,80.0
779,211,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,42,68,11.676471,1.367647,9.205882,0.323529,1.220588,1.161765,4.955882,7.632353,0.0,0.0,1.764706,2.441176,18.676471,26.389216,2.720588,68,68,2024,82,17.0,30.0,10.8,1.5,8.533333,0.466667,1.033333,1.2,4.533333,7.066667,0.0,0.0,1.733333,2.033333,17.566667,25.496111,-1.1,30.0,30.0,26.0,C,LAC,C,C,C,2016-17,2017.0,7.0,32.0,43.0,76.0,10.763158,1.013158,9.947368,0.381579,1.289474,1.539474,4.289474,6.763158,0.0,0.026316,2.184211,3.131579,18.5,28.546272,1.105263,76.0,76.0,82.0,36.0,65.4,8.536036,1.099353,6.918664,0.321156,0.877961,1.195009,3.381941,5.590173,0.002778,0.021025,1.769377,2.365644,13.694516,20.470184,1.468318,65.4,65.4,80.0
203,203,Ivica Zubac,7.0,240.0,1997-03-18 00:00:00,,zubaciv01,,Los Angeles Lakers,2nd round (2nd pick,Ivica Zubac,50,80,16.75,2.675,12.625,0.6875,1.125,1.5875,7.4,11.775,0.0,0.0,1.95,2.95,26.475,32.805,4.9875,80,80,2025,82,26.0,39.0,18.564103,2.794872,12.589744,0.692308,1.0,1.384615,8.128205,12.692308,0.0,0.0,2.307692,3.025641,27.564103,33.812821,6.846154,39.0,39.0,27.0,C,LAC,C,C,C,2016-17,2017.0,8.0,32.0,42.0,68.0,11.676471,1.367647,9.205882,0.323529,1.220588,1.161765,4.955882,7.632353,0.0,0.0,1.764706,2.441176,18.676471,26.389216,2.720588,68.0,68.0,82.0,40.6,70.4,10.122493,1.256603,8.187747,0.348652,1.052311,1.306432,4.089396,6.549201,0.002778,0.016374,1.940922,2.61667,16.269345,23.841826,2.124064,70.4,70.4,80.0
