In [1]:
import requests
from bs4 import BeautifulSoup
from IPython.core.display import HTML
import pandas as pd
import io
import numpy as np
import time
from fuzzywuzzy import fuzz, process
from unidecode import unidecode

## img itemscope="image" src="https://www.basketball-reference.com/req/202106291/images/headshots/jokicni01.jpg" alt="Photo of Nikola Jokić"

## Add picture Link and add a non starting most minutes tag that starters also get, for fantasy defense rating. 

In [2]:
positions = pd.read_csv("../data/playerpositions.csv")
player_names_positions = positions['Players'].tolist()
players_pic_link = pd.read_csv("../data/positions2.csv")
player_pic_link_names = players_pic_link['Players'].tolist()
players_pic_link

Unnamed: 0.2,Unnamed: 0.1,Players,Pos1,Pos2,Pos3,Pos4,Pos5,PictureLink,Unnamed: 0
0,0,Jonas ValanÄiÅ«nas,C,,,,,https://www.basketball-reference.com/req/20210...,
1,1,Carlton Carrington,PG,,,,,https://www.basketball-reference.com/req/20210...,
2,2,Bilal Coulibaly,SF,,,,,https://www.basketball-reference.com/req/20210...,
3,3,Alex Sarr,C,PF,,,,https://www.basketball-reference.com/req/20210...,
4,4,Corey Kispert,SF,,,,,https://www.basketball-reference.com/req/20210...,
...,...,...,...,...,...,...,...,...,...
521,521,Kobe Bufkin,SG,,,,,https://www.basketball-reference.com/req/20210...,380.0
522,522,Dominick Barlow,PF,SF,,,,https://www.basketball-reference.com/req/20210...,381.0
523,523,Mouhamed Gueye,PF,,,,,https://www.basketball-reference.com/req/20210...,382.0
524,524,Daeqwon Plowden,SG,,,,,https://www.basketball-reference.com/req/20210...,383.0


In [3]:
nba_team_abbreviations = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    "Charlotte Hornets": "CHO", ####
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Orleans Pelicans": "NOP",
    "New York Knicks": "NYK",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Washington Wizards": "WAS"
    }


nba_team_names = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "CHO": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "WAS": "Washington Wizards"
}

In [4]:
def get_game_dates(year, month):  
    session = requests.Session()
    
    URL = f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
    response = session.get(URL)
    response.status_code
    soup = BeautifulSoup(response.text, features="html.parser")
    soup.prettify()
    tables = soup.find_all('table')
    table_html = str(tables[0])
    df = pd.read_html(io.StringIO(table_html))[0]
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Day'] = df['Date'].dt.day.astype(str).str.zfill(2)
    df['Month#'] = df['Date'].dt.month
    df['MonthName'] = df['Date'].dt.month_name().str.lower()
    df['GameFinished'] = df['Date'] < (pd.Timestamp('today') - pd.Timedelta(days=1))
    
    df = df.rename(columns = {
        'Visitor/Neutral':'AwayTeam',
        'PTS':'AwayPTS',
        'PTS.1':'HomePTS',
        'Home/Neutral' : 'HomeTeam',
        'Unnamed: 7' : 'Overtime',
        'Notes':'InSeasonTournament'
    })
    df = df.drop(['Unnamed: 6', 'LOG','Arena'], axis = 1)
    df['Overtime'] = df['Overtime'].apply(lambda x: True if x == 'OT' else False)
    df['HomeTeamAbv'] = df['HomeTeam'].map(nba_team_abbreviations)
    df['AwayTeamAbv'] = df['AwayTeam'].map(nba_team_abbreviations)
    df['HomeWin'] = df['HomePTS'] > df['AwayPTS']
    df['AwayWin'] = df['AwayPTS'] > df['HomePTS']
    df['HomePointDiff'] = df['HomePTS'] - df['AwayPTS']
    df['AwayPointDiff'] = df['AwayPTS'] - df['HomePTS']
    df['GameID'] = df['Year'].astype(str) + df['Day'].astype(str) + df['Month#'].astype(str) + df['HomeTeamAbv']

    df['InSeasonTournament'] = df['InSeasonTournament'].apply(lambda x: True if x == 'In-Season Tournament' else False)

    return df

In [5]:
get_game_dates(2025, 'november')

Unnamed: 0,Date,Start (ET),AwayTeam,AwayPTS,HomeTeam,HomePTS,Overtime,Attend.,InSeasonTournament,Year,...,Month#,MonthName,GameFinished,HomeTeamAbv,AwayTeamAbv,HomeWin,AwayWin,HomePointDiff,AwayPointDiff,GameID
0,2024-11-01,7:00p,Boston Celtics,124,Charlotte Hornets,109,False,18557,False,2024,...,11,november,True,CHO,BOS,False,True,-15,15,20240111CHO
1,2024-11-01,7:00p,Orlando Magic,109,Cleveland Cavaliers,120,False,19432,False,2024,...,11,november,True,CLE,ORL,True,False,11,-11,20240111CLE
2,2024-11-01,7:00p,New York Knicks,128,Detroit Pistons,98,False,17022,False,2024,...,11,november,True,DET,NYK,False,True,-30,30,20240111DET
3,2024-11-01,7:30p,Sacramento Kings,123,Atlanta Hawks,115,False,15156,False,2024,...,11,november,True,ATL,SAC,False,True,-8,8,20240111ATL
4,2024-11-01,7:30p,Chicago Bulls,112,Brooklyn Nets,120,False,17977,False,2024,...,11,november,True,BRK,CHI,True,False,8,-8,20240111BRK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,2024-11-30,6:00p,Atlanta Hawks,107,Charlotte Hornets,104,False,17969,False,2024,...,11,november,True,CHO,ATL,False,True,-3,3,20243011CHO
218,2024-11-30,7:00p,Philadelphia 76ers,111,Detroit Pistons,96,False,22062,False,2024,...,11,november,True,DET,PHI,False,True,-15,15,20243011DET
219,2024-11-30,8:00p,Washington Wizards,114,Milwaukee Bucks,124,False,17341,False,2024,...,11,november,True,MIL,WAS,True,False,10,-10,20243011MIL
220,2024-11-30,9:00p,Golden State Warriors,105,Phoenix Suns,113,False,17071,False,2024,...,11,november,True,PHO,GSW,True,False,8,-8,20243011PHO


In [6]:
def all_games_season(year):
    months = ['october', 'november', 'december', 'january', 'february', 'march', 'april']
    total_df = pd.DataFrame()
    all_dataframes = []
    for month in months:
        
        df = get_game_dates(year, month)
        all_dataframes.append(df)
    
    total_df = pd.concat(all_dataframes)
    
    return total_df

In [50]:
dfdf = all_games_season(2025)
dfdf

Unnamed: 0,Date,Start (ET),AwayTeam,AwayPTS,HomeTeam,HomePTS,Overtime,Attend.,InSeasonTournament,Year,...,Month#,MonthName,GameFinished,HomeTeamAbv,AwayTeamAbv,HomeWin,AwayWin,HomePointDiff,AwayPointDiff,GameID
0,2024-10-22,7:30p,New York Knicks,109.0,Boston Celtics,132.0,False,19156.0,False,2024,...,10,october,True,BOS,NYK,True,False,23.0,-23.0,20242210BOS
1,2024-10-22,10:00p,Minnesota Timberwolves,103.0,Los Angeles Lakers,110.0,False,18997.0,False,2024,...,10,october,True,LAL,MIN,True,False,7.0,-7.0,20242210LAL
2,2024-10-23,7:00p,Indiana Pacers,115.0,Detroit Pistons,109.0,False,20062.0,False,2024,...,10,october,True,DET,IND,False,True,-6.0,6.0,20242310DET
3,2024-10-23,7:30p,Brooklyn Nets,116.0,Atlanta Hawks,120.0,False,17548.0,False,2024,...,10,october,True,ATL,BRK,True,False,4.0,-4.0,20242310ATL
4,2024-10-23,7:30p,Orlando Magic,116.0,Miami Heat,97.0,False,19630.0,False,2024,...,10,october,True,MIA,ORL,False,True,-19.0,19.0,20242310MIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,2025-04-13,3:30p,Utah Jazz,,Minnesota Timberwolves,,False,,False,2025,...,4,april,False,MIN,UTA,False,False,,,2025134MIN
101,2025-04-13,3:30p,Oklahoma City Thunder,,New Orleans Pelicans,,False,,False,2025,...,4,april,False,NOP,OKC,False,False,,,2025134NOP
102,2025-04-13,3:30p,Los Angeles Lakers,,Portland Trail Blazers,,False,,False,2025,...,4,april,False,POR,LAL,False,False,,,2025134POR
103,2025-04-13,3:30p,Phoenix Suns,,Sacramento Kings,,False,,False,2025,...,4,april,False,SAC,PHO,False,False,,,2025134SAC


In [8]:
def get_boxscore_link(df):
    
    if int(df['Month#'].iloc[0]) < 10:
        month = '0' + str(df['Month#'].iloc[0])
    else:
        month = df['Month#'].iloc[0]
    year = df['Year'].iloc[0]
    day = df['Day'].iloc[0]
    hometeam = df['HomeTeamAbv'].iloc[0]
    
    link = f'https://www.basketball-reference.com/boxscores/{year}{month}{day}0{hometeam}.html'
    
    return link
    

In [9]:
def get_all_links_all_games(big_df):
    length = len(big_df)
    link_list = []
    for i in range(length):
        df = big_df.iloc[[i]]
        link_list.append(get_boxscore_link(df))
    
    return link_list
        

In [10]:
links = get_all_links_all_games(dfdf)

In [11]:
smalldfdf = dfdf.iloc[[0]]

In [12]:
def player_scores_game(GameDF):
    
    global positions
    
    columns_to_replace = [
    'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-'
    ]
    URL = get_boxscore_link(GameDF)
    
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, features="html.parser")
    soup.prettify()
    tables = soup.find_all('table')
    
    
    inactive_section = soup.find('strong', string=lambda t: t and "Inactive:" in t)
    team_sections = inactive_section.find_next_siblings('span')
    team_inactives = {}  

    for team_section in team_sections:
        team_name = team_section.text.strip() 

        
        players = []
        sibling = team_section.find_next_sibling()
        while sibling and sibling.name == 'a':  
            players.append(sibling.text.strip())
            sibling = sibling.find_next_sibling()

        # Store the team and its players
        if players:
            team_inactives[team_name] = players
    rows = []
    for team, players in team_inactives.items():
        for player in players:
            row = {'Starters': player, 'Injured': True, 'TeamAbv': team}  
            rows.append(row)
    
    new_df = pd.DataFrame(rows)
    team_inactive_counts = {}
    team_inactive_counts = {team: len(players) for team, players in team_inactives.items()} 
    
    
    table_html1 = str(tables[0])
    df1 = pd.read_html(io.StringIO(table_html1))[0]
    df1.columns = df1.columns.droplevel(0)
    df1['Starting'] = [True if i < 5 else False for i in range(len(df1))]
    df1['Injured'] = False
    df1['TeamAbv'] = nba_team_abbreviations[GameDF['AwayTeam'].iloc[0]]
    name1 = df1['Starters'].iloc[0]
    
    df1_filtered = df1[~df1['Starters'].isin(['Team Totals', 'Reserves'])].copy()
    
    df1_filtered.loc[:, 'MP_sort'] = pd.to_numeric(df1_filtered['MP'], errors='coerce')
    top_non_startersdf1 = df1_filtered[df1_filtered['Starting'] == False].nlargest(2, 'MP_sort')
    
    df1['Top7InTeam'] = df1_filtered['Starting'] | df1_filtered.index.isin(top_non_startersdf1.index)
    
    
    
    
    GoodToGo = False
    i = 6
    
    while not GoodToGo:
        table_html2 = str(tables[i])
        df2 = pd.read_html(io.StringIO(table_html2))[0]
        #print(df2)
        #print("ABOVE IS THE SECOND DATAFRAME!")
        df2.columns = df2.columns.droplevel(0)
        df2['Starting'] = [True if i < 5 else False for i in range(len(df2))]
       
        df2['Injured'] = False
        df2['TeamAbv'] = nba_team_abbreviations[GameDF['HomeTeam'].iloc[0]]
        name2= df2['Starters'].iloc[0]
        #print(name2)
        #print(i)
        i = i + 1
        if name1 != name2:
            GoodToGo = True
        
    
    df2_filtered = df2[~df2['Starters'].isin(['Team Totals', 'Reserves'])].copy()
    
    df2_filtered.loc[:, 'MP_sort'] = pd.to_numeric(df2_filtered['MP'], errors='coerce')
    top_non_startersdf2 = df2_filtered[df2_filtered['Starting'] == False].nlargest(2, 'MP_sort')
    
    df2['Top7InTeam'] = df2_filtered['Starting'] | df2_filtered.index.isin(top_non_startersdf2.index)
    
    
    
    
#     for i in range(len(tables)):
#         print("                                    ")
#         print("                                    ")
#         print("                                    ")
#         print(i)
#         print("                                    ")
#         print("                                    ")
#         print("                                    ")
#         table_html0 = str(tables[i])
#         df0 = pd.read_html(io.StringIO(table_html0))[0]
#         print(df0)
    
    
    #Do a first run of didnotplay = injured, then do if did not play = false?
    # or do did not play be an or statement just the whole thing
    
    
    
    df_merged = pd.concat([df1, df2, new_df], ignore_index=True, sort=False)
    #df_merged.columns = df_merged.columns.droplevel(0)
    
    df_merged = df_merged[df_merged['Starters'] != 'Team Totals']
    df_merged = df_merged[df_merged['Starters'] != 'Reserves']

    
    df_merged['Date'] = GameDF['Date'].iloc[0]
    df_merged['Year'] = df_merged['Date'].dt.year
    df_merged['Day'] = df_merged['Date'].dt.day.astype(str).str.zfill(2)
    df_merged['Month#'] = df_merged['Date'].dt.month
    df_merged['MonthName'] = df_merged['Date'].dt.month_name().str.lower()
    df_merged['Day_of_week'] = df_merged['Date'].dt.day_name()
    df_merged['TeamName'] = df_merged['TeamAbv'].map(nba_team_names)
    df_merged['Home'] = df_merged['TeamName'].apply(lambda x: x == GameDF['HomeTeam'].iloc[0])
    df_merged['DidNotPlay'] = (df_merged['FG'] == df_merged['MP']) | df_merged['Injured']
    df_merged[df_merged['Injured']] = df_merged.fillna(0)
    
    
    for col_replace in columns_to_replace:
        df_merged.loc[df_merged['DidNotPlay'], col_replace] = 0
      
    
    df_merged['MPTimeDelta'] = df_merged['MP'].apply(
    lambda x: pd.to_timedelta("00:" + str(x)) if ':' in str(x) else pd.to_timedelta(0))
    df_merged['OpponentTeam'] = df_merged['Home'].apply(lambda x: GameDF['AwayTeam'].iloc[0] if x else GameDF['HomeTeam'].iloc[0])
    df_merged['OpponentTeamAbv'] = df_merged['Home'].apply(lambda x: GameDF['AwayTeamAbv'].iloc[0] if x else GameDF['HomeTeamAbv'].iloc[0])
    df_merged['WonGame'] = df_merged['Home'].apply(lambda x: GameDF['HomeWin'].iloc[0] if x else GameDF['AwayWin'].iloc[0])
    df_merged['GamePointDiff'] = df_merged['Home'].apply(lambda x: GameDF['HomePointDiff'].iloc[0] if x else GameDF['AwayPointDiff'].iloc[0])
    df_merged['Start(ET)'] = GameDF['Start (ET)'].iloc[0]
    df_merged['Overtime'] = GameDF['Overtime'].iloc[0]
    df_merged['Attend.'] = GameDF['Attend.'].iloc[0]
    df_merged['InSeasonTournament'] = GameDF['InSeasonTournament'].iloc[0]
    
    df_merged['Pos1'] = df_merged['Starters'].apply(
    lambda x: positions.loc[positions['Players'] == x, 'Pos1'].iloc[0] if not positions.loc[positions['Players'] == x].empty else positions.loc[positions['Players'] == process.extractOne(x, player_names_positions)[0], 'Pos1'].iloc[0])
    
    df_merged['Pos2'] = df_merged['Starters'].apply(
    lambda x: positions.loc[positions['Players'] == x, 'Pos2'].iloc[0] if not positions.loc[positions['Players'] == x].empty else positions.loc[positions['Players'] == process.extractOne(x, player_names_positions)[0], 'Pos2'].iloc[0])
    
    df_merged['Pos3'] = df_merged['Starters'].apply(
    lambda x: positions.loc[positions['Players'] == x, 'Pos3'].iloc[0] if not positions.loc[positions['Players'] == x].empty else positions.loc[positions['Players'] == process.extractOne(x, player_names_positions)[0], 'Pos3'].iloc[0])
    
    df_merged['Pos4'] = df_merged['Starters'].apply(
    lambda x: positions.loc[positions['Players'] == x, 'Pos4'].iloc[0] if not positions.loc[positions['Players'] == x].empty else positions.loc[positions['Players'] == process.extractOne(x, player_names_positions)[0], 'Pos4'].iloc[0])
    
    df_merged['Pos5'] = df_merged['Starters'].apply(
    lambda x: positions.loc[positions['Players'] == x, 'Pos5'].iloc[0] if not positions.loc[positions['Players'] == x].empty else positions.loc[positions['Players'] == process.extractOne(x, player_names_positions)[0], 'Pos5'].iloc[0])
    
    df_merged['correct_positions'] = df_merged['Starters'].apply(
    lambda x: positions['Players'].eq(x).any() or process.extractOne(x, player_names_positions, score_cutoff=75) is not None)
    
    df_merged['PhotoLink'] = df_merged['Starters'].apply(
    lambda x: players_pic_link.loc[players_pic_link['Players'] == x, 'PictureLink'].iloc[0] 
    if not players_pic_link.loc[players_pic_link['Players'] == x].empty 
    else None
    )
    
    df_merged['Starting'] = df_merged['Starting'].apply(lambda x: False if x == 0 else x)
    df_merged['PTS'] = pd.to_numeric(df_merged['PTS'], errors='coerce')
    df_merged['TRB'] = pd.to_numeric(df_merged['TRB'], errors='coerce')
    df_merged['AST'] = pd.to_numeric(df_merged['AST'], errors='coerce')
    df_merged['STL'] = pd.to_numeric(df_merged['STL'], errors='coerce')
    df_merged['BLK'] = pd.to_numeric(df_merged['BLK'], errors='coerce')
    df_merged['TOV'] = pd.to_numeric(df_merged['TOV'], errors='coerce')
    df_merged['3P'] = pd.to_numeric(df_merged['3P'], errors='coerce')
    df_merged['tens'] = ((df_merged['PTS'] >= 10) * 1) + ((df_merged['TRB'] >= 10) * 1) + ((df_merged['AST'] >= 10) * 1)+ ((df_merged['STL'] >= 10) * 1)+ ((df_merged['BLK'] >= 10) * 1)
    df_merged['FantasyPoints'] = (df_merged['PTS'] * .5) + (df_merged['TRB'] * 1) + (df_merged['AST'] * 1) + (df_merged['STL'] * 2) + (df_merged['BLK'] * 2) + (df_merged['TOV'] * -1) + (df_merged['3P'] * .5) + ((df_merged['PTS'] >= 40).astype(int) * 2) + ((df_merged['PTS'] >= 50).astype(int) * 2) + ((df_merged['tens'] >= 2).astype(int) * 1) + ((df_merged['tens'] >= 3).astype(int) * 2)
    df_merged['GameID'] = GameDF['GameID'].iloc[0]
    df_merged['InjTeamateCount'] = df_merged['TeamAbv'].map(team_inactive_counts)
    df_merged['Top7InTeam'] = df_merged['Top7InTeam'].apply(lambda x: False if x == 0 else x)
    
    df_merged = df_merged[['Date','Starters','TeamName','WonGame','Injured','DidNotPlay','FantasyPoints','MPTimeDelta','MP','FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS','GmSc','+/-','TeamAbv','Year','Day','Month#','MonthName','Day_of_week','Home','GamePointDiff','Start(ET)','Overtime','Attend.','InSeasonTournament','Pos1','Pos2','Pos3','Pos4','Pos5','correct_positions','GameID','OpponentTeam','OpponentTeamAbv','InjTeamateCount', 'Starting','Top7InTeam','PhotoLink']]
    #df_merged = df_merged.rename(columns = {
    #    'Starters'
    #    
    #})
    
    
    return df_merged

stats = player_scores_game(smalldfdf)

In [13]:
# Index([, 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA',
#        'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
#        'GmSc', '+/-', '', 'Injured', 'TeamAbv', , 'Year', 'Day',
#        'Month#', 'MonthName', 'Day_of_week', 'TeamName', 'Home', 'DidNotPlay',
#        'WonGame', 'GamePointDiff', 'Start(ET)', 'Overtime', 'Attend.',
#        'InSeasonTournament', 'Pos1', 'Pos2', 'Pos3', 'Pos4', 'Pos5'],
#       dtype='object')

In [14]:
stats

Unnamed: 0,Date,Starters,TeamName,WonGame,Injured,DidNotPlay,FantasyPoints,MPTimeDelta,MP,FG,...,Pos4,Pos5,correct_positions,GameID,OpponentTeam,OpponentTeamAbv,InjTeamateCount,Starting,Top7InTeam,PhotoLink
0,2024-10-22,Mikal Bridges,New York Knicks,False,False,False,10.0,0 days 00:34:37,34:37,7,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
1,2024-10-22,OG Anunoby,New York Knicks,False,False,False,12.0,0 days 00:34:10,34:10,1,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
2,2024-10-22,Jalen Brunson,New York Knicks,False,False,False,10.5,0 days 00:24:30,24:30,9,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
3,2024-10-22,Josh Hart,New York Knicks,False,False,False,12.0,0 days 00:24:30,24:30,4,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
4,2024-10-22,Karl-Anthony Towns,New York Knicks,False,False,False,16.5,0 days 00:23:37,23:37,5,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
6,2024-10-22,Miles McBride,New York Knicks,False,False,False,14.0,0 days 00:25:51,25:51,8,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,True,https://www.basketball-reference.com/req/20210...
7,2024-10-22,Jericho Sims,New York Knicks,False,False,False,15.0,0 days 00:24:23,24:23,2,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,True,https://www.basketball-reference.com/req/20210...
8,2024-10-22,Cameron Payne,New York Knicks,False,False,False,14.0,0 days 00:20:43,20:43,5,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,False,https://www.basketball-reference.com/req/20210...
9,2024-10-22,Pacome Dadiet,New York Knicks,False,False,False,3.0,0 days 00:13:23,13:23,1,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,False,https://www.basketball-reference.com/req/20210...
10,2024-10-22,Ariel Hukporti,New York Knicks,False,False,False,6.0,0 days 00:06:05,6:05,0,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,False,https://www.basketball-reference.com/req/20210...


In [15]:
first_2025 = dfdf[dfdf['Year'] == 2025].iloc[[0]]
first_2025

Unnamed: 0,Date,Start (ET),AwayTeam,AwayPTS,HomeTeam,HomePTS,Overtime,Attend.,InSeasonTournament,Year,...,Month#,MonthName,GameFinished,HomeTeamAbv,AwayTeamAbv,HomeWin,AwayWin,HomePointDiff,AwayPointDiff,GameID
0,2025-01-01,7:00p,Orlando Magic,96.0,Detroit Pistons,105.0,False,19399.0,False,2025,...,1,january,True,DET,ORL,True,False,9.0,-9.0,2025011DET


In [16]:
#20242210BOS

In [17]:
testingdfdf = dfdf[dfdf['GameID'] == '2025011DET']

In [18]:
player_scores_game(testingdfdf)

Unnamed: 0,Date,Starters,TeamName,WonGame,Injured,DidNotPlay,FantasyPoints,MPTimeDelta,MP,FG,...,Pos4,Pos5,correct_positions,GameID,OpponentTeam,OpponentTeamAbv,InjTeamateCount,Starting,Top7InTeam,PhotoLink
0,2025-01-01,Kentavious Caldwell-Pope,Orlando Magic,False,False,False,20.5,0 days 00:34:23,34:23,8,...,,,True,2025011DET,Detroit Pistons,DET,6,True,True,https://www.basketball-reference.com/req/20210...
1,2025-01-01,Wendell Carter Jr.,Orlando Magic,False,False,False,21.5,0 days 00:34:04,34:04,7,...,,,True,2025011DET,Detroit Pistons,DET,6,True,True,https://www.basketball-reference.com/req/20210...
2,2025-01-01,Jalen Suggs,Orlando Magic,False,False,False,28.5,0 days 00:33:49,33:49,9,...,,,True,2025011DET,Detroit Pistons,DET,6,True,True,https://www.basketball-reference.com/req/20210...
3,2025-01-01,Tristan Da Silva,Orlando Magic,False,False,False,10.0,0 days 00:22:42,22:42,0,...,,,True,2025011DET,Detroit Pistons,DET,6,True,True,https://www.basketball-reference.com/req/20210...
4,2025-01-01,Goga Bitadze,Orlando Magic,False,False,False,13.0,0 days 00:22:39,22:39,3,...,,,True,2025011DET,Detroit Pistons,DET,6,True,True,https://www.basketball-reference.com/req/20210...
6,2025-01-01,Caleb Houstan,Orlando Magic,False,False,False,9.0,0 days 00:27:18,27:18,1,...,,,True,2025011DET,Detroit Pistons,DET,6,False,True,https://www.basketball-reference.com/req/20210...
7,2025-01-01,Gary Harris,Orlando Magic,False,False,False,9.0,0 days 00:15:19,15:19,2,...,,,True,2025011DET,Detroit Pistons,DET,6,False,True,https://www.basketball-reference.com/req/20210...
8,2025-01-01,Jett Howard,Orlando Magic,False,False,False,5.5,0 days 00:15:12,15:12,3,...,,,True,2025011DET,Detroit Pistons,DET,6,False,False,https://www.basketball-reference.com/req/20210...
9,2025-01-01,Jonathan Isaac,Orlando Magic,False,False,False,7.0,0 days 00:15:02,15:02,1,...,,,True,2025011DET,Detroit Pistons,DET,6,False,False,https://www.basketball-reference.com/req/20210...
10,2025-01-01,Cole Anthony,Orlando Magic,False,False,False,10.0,0 days 00:13:18,13:18,3,...,,,True,2025011DET,Detroit Pistons,DET,6,False,False,https://www.basketball-reference.com/req/20210...


In [19]:
get_boxscore_link(first_2025)

'https://www.basketball-reference.com/boxscores/202501010DET.html'

In [20]:
stats

Unnamed: 0,Date,Starters,TeamName,WonGame,Injured,DidNotPlay,FantasyPoints,MPTimeDelta,MP,FG,...,Pos4,Pos5,correct_positions,GameID,OpponentTeam,OpponentTeamAbv,InjTeamateCount,Starting,Top7InTeam,PhotoLink
0,2024-10-22,Mikal Bridges,New York Knicks,False,False,False,10.0,0 days 00:34:37,34:37,7,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
1,2024-10-22,OG Anunoby,New York Knicks,False,False,False,12.0,0 days 00:34:10,34:10,1,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
2,2024-10-22,Jalen Brunson,New York Knicks,False,False,False,10.5,0 days 00:24:30,24:30,9,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
3,2024-10-22,Josh Hart,New York Knicks,False,False,False,12.0,0 days 00:24:30,24:30,4,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
4,2024-10-22,Karl-Anthony Towns,New York Knicks,False,False,False,16.5,0 days 00:23:37,23:37,5,...,,,True,20242210BOS,Boston Celtics,BOS,3,True,True,https://www.basketball-reference.com/req/20210...
6,2024-10-22,Miles McBride,New York Knicks,False,False,False,14.0,0 days 00:25:51,25:51,8,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,True,https://www.basketball-reference.com/req/20210...
7,2024-10-22,Jericho Sims,New York Knicks,False,False,False,15.0,0 days 00:24:23,24:23,2,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,True,https://www.basketball-reference.com/req/20210...
8,2024-10-22,Cameron Payne,New York Knicks,False,False,False,14.0,0 days 00:20:43,20:43,5,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,False,https://www.basketball-reference.com/req/20210...
9,2024-10-22,Pacome Dadiet,New York Knicks,False,False,False,3.0,0 days 00:13:23,13:23,1,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,False,https://www.basketball-reference.com/req/20210...
10,2024-10-22,Ariel Hukporti,New York Knicks,False,False,False,6.0,0 days 00:06:05,6:05,0,...,,,True,20242210BOS,Boston Celtics,BOS,3,False,False,https://www.basketball-reference.com/req/20210...


In [21]:
smalldfdf

Unnamed: 0,Date,Start (ET),AwayTeam,AwayPTS,HomeTeam,HomePTS,Overtime,Attend.,InSeasonTournament,Year,...,Month#,MonthName,GameFinished,HomeTeamAbv,AwayTeamAbv,HomeWin,AwayWin,HomePointDiff,AwayPointDiff,GameID
0,2024-10-22,7:30p,New York Knicks,109.0,Boston Celtics,132.0,False,19156.0,False,2024,...,10,october,True,BOS,NYK,True,False,23.0,-23.0,20242210BOS


In [22]:
def final_df(year,existing_df,numaddrows, add_all):
    
    all_games_df = all_games_season(year)
    all_games_df = all_games_df[~all_games_df['GameID'].isin(existing_df['GameID'])]
    all_games_df = all_games_df[all_games_df['GameFinished'] == True]
    total_df = existing_df
    
    if add_all:
        for i in range(len(all_games_df)):
            game_df = player_scores_game(all_games_df.iloc[[i]])
            total_df = pd.concat([total_df, game_df], ignore_index=True, sort=False)
            time.sleep(10)
            print(f'Done with game #{i}')
            print(all_games_df.iloc[[i]])
    else:
        for i in range(numaddrows):
            game_df = player_scores_game(all_games_df.iloc[[i]])
            total_df = pd.concat([total_df, game_df], ignore_index=True, sort=False)
            time.sleep(10)
            print(f'Done with game #{i}')
            print(all_games_df.iloc[[i]])

    
    return total_df


#stats_2025 = final_df(2025)

## To get positions, go through names (through teams or whatever) and do 
## https://www.basketball-reference.com/players/w/wembavi01.html
## where the slash after players is the last name initial
## and then it is first 5 of last name (less if last name is smaller), and first 2 of first, then a number.
## try 01 first, and check if name on page matches title. if not add to number and try
## if error, don't put position.

## https://www.basketball-reference.com/teams/ATL/2025.html#all_roster
## this is format for each team roster in a table

## https://www.basketball-reference.com/players/j/jamesle01.html
## format for player name section



In [23]:
def team_link(teamabv, year):
    return f'https://www.basketball-reference.com/teams/{teamabv}/{year}.html#all_roster'

In [24]:
def team_players(team_link):
    URL = team_link
    response = requests.get(URL)
    response.status_code
    soup = BeautifulSoup(response.text, features="html.parser")
    soup.prettify()
    tables = soup.find_all('table')
    table_html = str(tables[0])
    df = pd.read_html(io.StringIO(table_html))[0]
    players_list = df['Player'].tolist()
    players_list = list(map(lambda x: x.split('(')[0].strip(), players_list))
    return players_list

In [25]:
suns = team_players(team_link('PHO',2025))
suns

['Tyus Jones',
 'Mason Plumlee',
 'Ryan Dunn',
 'Devin Booker',
 "Royce O'Neale",
 'Grayson Allen',
 'Kevin Durant',
 'Oso Ighodaro',
 'Monte Morris',
 'Bradley Beal',
 'Bol Bol',
 'Damion Lee',
 'Nick Richards',
 'Collin Gillespie',
 'TyTy Washington Jr.',
 'Jalen Bridges',
 'Vasilije MiciÄ\x87',
 'Cody Martin']

In [26]:
#Get rid of dashes and apostraphese and stuff like that, no jr. stuff counted for last name.


def player_link(name):
    
    firstname = name.split()[0]
    lastname = name.split()[1]
    firstname = "".join(char for char in firstname if char.isalpha()).lower()
    lastname = "".join(char for char in lastname if char.isalpha()).lower()
    lastinitial = lastname[0].lower()
    
    lastname5 = lastname[0:5]
    firstname2 = firstname[0:2]
    number = 1
    
    link = f'https://www.basketball-reference.com/players/{lastinitial}/{lastname5}{firstname2}0{number}.html'
    NotFound = True
    
    while NotFound:
        URL = link
        response = requests.get(URL)
        soup = BeautifulSoup(response.text, features="html.parser")
        h1_tag = soup.find('h1')
        player_name = h1_tag.text.strip()
        if player_name == name:
            print('Found it!')
            
            
            NotFound = False
        elif response.status_code != 200:
            print("Name Not Found....")
            NotFound = False
            positions = ['PG','SG','SF','PF','C']
        else:
            number += 1
            link = f'https://www.basketball-reference.com/players/{lastinitial}/{lastname5}{firstname2}0{number}.html'
            print(f'Didnt find it, trying again. This is {number} trying next.' )
        
    picturelink = f'https://www.basketball-reference.com/req/202106291/images/headshots/{lastname5}{firstname2}0{number}.jpg'
    linklist = []
    linklist.append(link)
    linklist.append(picturelink)
    return linklist

In [27]:
link = player_link("Devin Booker")
link

Found it!


['https://www.basketball-reference.com/players/b/bookede01.html',
 'https://www.basketball-reference.com/req/202106291/images/headshots/bookede01.jpg']

In [28]:
def get_positions_link(link, testing = False):
    response = requests.get(link)
    if response.status_code != 200:
        positions = ['PG','SG','SF','PF','C']
        return positions
    else:
        soup = BeautifulSoup(response.text, features="html.parser")
        inactive_section = soup.find_all('p')
        for i in range(10):
            try:
                if testing:
                    print(f'Trying {i}')
                
                positions = inactive_section[i]
                
                strong_tag = positions.find('strong', string=lambda t: t and "Position:" in t)
                position_text = strong_tag.next_sibling.strip()
                break
            except:
                pass
            
            
            
        positions = position_text.strip()
        only_positions = positions.split('\n')[0]
        
        if ',' in only_positions:
            only_positions = only_positions.split(',')
        else:
            if 'and' in only_positions:
                only_positions = only_positions.split('and')
            else:
                only_positions = [only_positions]
        if testing:
            print(f'This is only positions: {only_positions}')
            
            
        final_positions = []
        position_dict = {
            'Point Guard':"PG",
            'Shooting Guard' : "SG",
            'Small Forward' : "SF",
            'Power Forward' : "PF",
            'Center' : "C",
            'Forward' : "SF",
            'Guard' : 'PG'
        }
        for i in range(len(only_positions)):
            cleaned_string = only_positions[i].replace("and", "")
            if testing:
                print(f'This is cleaned_string: {cleaned_string}')
            cleaned_string = cleaned_string.strip()
            if testing:
                print(f'This is cleaned_string: {cleaned_string}')
            
            cleaned_string = position_dict[cleaned_string]
            
            final_positions.append(cleaned_string)
            
        
        return final_positions

In [29]:
get_positions_link('https://www.basketball-reference.com/players/j/fefsef01.html')

['PG', 'SG', 'SF', 'PF', 'C']

In [30]:
def name_to_positions(name, testing = False):
    link = player_link(name)
    positions = get_positions_link(link[0], testing = testing)
    positionsandlink = []
    positionsandlink.append(positions)
    positionsandlink.append(link[1])
    return positionsandlink

In [31]:
name_to_positions('Mikal Bridges')

Found it!


[['SF', 'SG'],
 'https://www.basketball-reference.com/req/202106291/images/headshots/bridgmi01.jpg']

In [32]:
team_players(team_link('PHO',2025))

['Tyus Jones',
 'Mason Plumlee',
 'Ryan Dunn',
 'Devin Booker',
 "Royce O'Neale",
 'Grayson Allen',
 'Kevin Durant',
 'Oso Ighodaro',
 'Monte Morris',
 'Bradley Beal',
 'Bol Bol',
 'Damion Lee',
 'Nick Richards',
 'Collin Gillespie',
 'TyTy Washington Jr.',
 'Jalen Bridges',
 'Vasilije MiciÄ\x87',
 'Cody Martin']

In [33]:
def get_positions(team_abv,year):
    link = team_link(team_abv,year)
    list_players = team_players(link)
    
    
    
    names_list = []
    pos1_list = []
    pos2_list = []
    pos3_list = []
    pos4_list = []
    pos5_list = []
    picture_link = []
    for name in list_players:
        time.sleep(8)
        
        positionsAll = name_to_positions(name, False)
        positions = positionsAll[0]
        
        names_list.append(name)
        
        for i in range(5):
            if i == 0:
                try:
                    pos1_list.append(positions[i])
                except:
                    pos1_list.append(np.nan)
            if i == 1:
                try:
                    pos2_list.append(positions[i])
                except:
                    pos2_list.append(np.nan)
            if i == 2:
                try:
                    pos3_list.append(positions[i])
                except:
                    pos3_list.append(np.nan)
            if i == 3:
                try:
                    pos4_list.append(positions[i])
                except:
                    pos4_list.append(np.nan)
            if i == 4:
                try:
                    pos5_list.append(positions[i])
                except:
                    pos5_list.append(np.nan)
        
        picture_link.append(positionsAll[1])
        print(names_list)
        #print(pos1_list)
        #print(pos2_list)
        #print(pos3_list)
        #print(pos4_list)
        #print(pos5_list)
    
    
    df = pd.DataFrame(
        {
            'Players':names_list,
            'Pos1':pos1_list,
            'Pos2':pos2_list,
            'Pos3':pos3_list,
            'Pos4':pos4_list,
            'Pos5':pos5_list,
            'PictureLink':picture_link
            
        }
    )
    
    return df
        
    

In [34]:
nba_teams = list(nba_team_names.keys())
nba_teams

['ATL',
 'BOS',
 'BRK',
 'CHO',
 'CHI',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GSW',
 'HOU',
 'IND',
 'LAC',
 'LAL',
 'MEM',
 'MIA',
 'MIL',
 'MIN',
 'NOP',
 'NYK',
 'OKC',
 'ORL',
 'PHI',
 'PHO',
 'POR',
 'SAC',
 'SAS',
 'TOR',
 'UTA',
 'WAS']

In [35]:
remaining_teams = ['PHI','PHO','POR','SAC','SAS','TOR','UTA','WAS']

In [36]:
# all_player_pos2 = pd.read_csv("../data/positions2.csv")


# for abr in remaining_teams:
#     df = get_positions(abr,2025)
#     all_player_pos2 =  pd.concat([df, all_player_pos2], ignore_index=True, sort=False)
#     all_player_pos2.to_csv('../data/positions2.csv')
    
# all_player_pos2

In [37]:
cool

NameError: name 'cool' is not defined

In [None]:
final_dataframe = pd.read_csv("../data/full_nba_data_2.csv")
final_dataframe["Date"] = pd.to_datetime(final_dataframe["Date"], format="mixed")
final_dataframe

In [38]:
def add_fantasy_averages(df):
    
    df = df.copy()  # Avoid modifying original dataframe
    df["_original_index"] = df.index  # Store original index
    
    df = df.sort_values(by=["Starters", "Date"])  # Sort by player and date

    # Calculate cumulative average per player
    df["AvgFantPoints"] = df.groupby("Starters")["FantasyPoints"].expanding().mean().reset_index(level=0, drop=True)

    df = df.sort_values(by="_original_index").drop(columns=["_original_index"])  # Restore original order

    return df

In [39]:
def add_fantasy_averages_new(df):
    
    df["AvgFantPoints"] = df.apply(lambda row: df[
    (df["Starters"] == row["Starters"]) &  # Same player
    (df["Date"] < row["Date"]) &  # Only past games
    (~df["DidNotPlay"])  # Only games where the player actually played
    ]["FantasyPoints"].mean(), axis=1)


    return df

In [40]:
def add_fantasy_averages_new_new(df):
    #df = df.sort_values(["Starters", "Date"])  # Ensure chronological order for each player

    avg_fantasy = []
    avg_fantasy_last10 = []
    avg_points = []
    avg_points_last10 = []
    avg_assists = []
    avg_assists_last10 = []
    avg_rebounds = []
    avg_rebounds_last10 = []
    avg_steals = []
    avg_steals_last10 = []
    avg_blocks = []
    avg_blocks_last10 = []
    avg_turnovers = []
    avg_turnovers_last10 = []
    avg_threepointers = []
    avg_threepointers_last10 = []
    
    avg_mp_time = []
    avg_mp_time_last10 = []

    # Convert MPTimeDelta to timedelta format
    df["MPTimeDelta"] = pd.to_timedelta(df["MPTimeDelta"])

    for idx, row in df.iterrows():
        past_games = df[
            (df["Starters"] == row["Starters"]) & 
            (df["Date"] < row["Date"]) & 
            (~df["DidNotPlay"])
        ].copy()  # ✅ Copy to avoid potential SettingWithCopyWarning

        # Convert MPTimeDelta to minutes as float
        past_games["MPMinutes"] = past_games["MPTimeDelta"].dt.total_seconds() / 60

        # Calculate averages
        avg_fantasy.append(past_games["FantasyPoints"].mean())
        avg_points.append(past_games["PTS"].mean())
        avg_assists.append(past_games["AST"].mean())
        avg_rebounds.append(past_games["TRB"].mean())
        avg_steals.append(past_games["STL"].mean())
        avg_blocks.append(past_games["BLK"].mean())
        avg_turnovers.append(past_games["TOV"].mean())
        avg_threepointers.append(past_games["3P"].mean())

        last_10_games = past_games.tail(10).copy()  # ✅ Copy ensures "MPMinutes" exists

        avg_fantasy_last10.append(last_10_games["FantasyPoints"].mean())
        avg_points_last10.append(last_10_games["PTS"].mean())
        avg_assists_last10.append(last_10_games["AST"].mean())
        avg_rebounds_last10.append(last_10_games["TRB"].mean())
        avg_steals_last10.append(last_10_games["STL"].mean())
        avg_blocks_last10.append(last_10_games["BLK"].mean())
        avg_turnovers_last10.append(last_10_games["TOV"].mean())
        avg_threepointers_last10.append(last_10_games["3P"].mean())
        
        
        
        
        avg_mp_time.append(past_games["MPMinutes"].mean())

        # Ensure 'MPMinutes' exists in last_10_games before computing mean
        avg_mp_time_last10.append(last_10_games["MPMinutes"].mean() if not last_10_games.empty else None)

    # Assign new columns
    df["AvgPoints"] = avg_points
    df["AvgPointsLast10"] = avg_points_last10
    df["AvgAssists"] = avg_assists
    df["AvgAssistsLast10"] = avg_assists_last10
    df["AvgRebounds"] = avg_rebounds
    df["AvgReboundsLast10"] = avg_rebounds_last10
    df["AvgSteals"] = avg_steals
    df["AvgStealsLast10"] = avg_steals_last10
    df["AvgBlocks"] = avg_blocks
    df["AvgBlocksLast10"] = avg_blocks_last10
    df["AvgTurnovers"] = avg_turnovers
    df["AvgTurnoversLast10"] = avg_turnovers_last10
    df["AvgThreePointers"] = avg_threepointers
    df["AvgThreePointersLast10"] = avg_threepointers_last10
    

    return df

In [41]:
def add_injury_counts(df):
    # Ensure correct data types
    
    
    # Create empty lists for new columns
    prev_inj_count = []
    opp_inj_count = []
    opp_prev_inj_count = []

    # Convert Date to datetime if not already
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # Create a lookup table for previous injuries by team
    prev_injury_lookup = {}

    # Iterate over each row
    for idx, row in df.iterrows():
        team = row["TeamAbv"]
        opp_team = row["OpponentTeamAbv"]
        game_id = row["GameID"]

        # Get previous injury count for the team
        prev_value = prev_injury_lookup.get(team, None)
        prev_inj_count.append(prev_value)

        # Get opponent's injury count for the current game
        opp_injury = df[(df["GameID"] == game_id) & (df["TeamAbv"] == opp_team)]["InjTeamateCount"]
        opp_inj_count.append(opp_injury.mean() if not opp_injury.empty else None)

        # Get previous injury count for the opponent
        opp_prev_value = prev_injury_lookup.get(opp_team, None)
        opp_prev_inj_count.append(opp_prev_value)

        # Update lookup table with the current team's injury count
        prev_injury_lookup[team] = row["InjTeamateCount"]

    # Assign new columns
    df["PrevInjuredCount"] = prev_inj_count
    df["OpponentInjTeamateCount"] = opp_inj_count
    df["OpponentPrevInjuredCount"] = opp_prev_inj_count

    return df
    

In [None]:
testing_opp_df = add_injury_counts(new_df_2)

In [None]:
testing_opp_df[1490:1505]#["GameID"]

In [None]:
new_df_2.head(10)

In [None]:
new_df_2.columns

In [None]:
testingdataframe[testingdataframe['Starters'] == 'LeBron James']

In [42]:
def cut_dataframe(nba_data, ending_date, team_name):
    # Filter data based on date and team criteria
    cut_data = nba_data[
        (nba_data["Date"] < ending_date) &
        (nba_data["Date"] > nba_data["Date"].min()) &
        (nba_data["OpponentTeamAbv"] == team_name) &
        (nba_data["Top7InTeam"] == True)
    ]

    # Group by position and calculate mean fantasy points
    cut_data = cut_data.groupby("Pos1", as_index=False).agg(AvgFantPoints=("FantasyPoints", "mean"))
    
    # Add team and date columns
    cut_data["team"] = team_name
    cut_data["date"] = ending_date

    # Compute team-level statistics
    team_games = nba_data[
        (nba_data["Date"] < ending_date) &
        (nba_data["Date"] >= nba_data["Date"].min()) &
        (nba_data["OpponentTeamAbv"] == team_name)
    ].groupby("GameID", as_index=False).agg(
        Loss=("WonGame", lambda x: max(x)),  # If any player won, the team lost
        Win=("WonGame", lambda x: max(x == False)),  # If any player lost, the team won
        PointDiff=("GamePointDiff", lambda x: -max(x))  # Reverse point diff
    )

    # Calculate win percentage and total point differential
    win_percentage = team_games["Win"].sum() / len(team_games) if len(team_games) > 0 else 0
    point_diff = team_games["PointDiff"].sum() if len(team_games) > 0 else 0

    # Add win percentage and point differential
    cut_data["WinPercentage"] = win_percentage
    cut_data["PointDiff"] = point_diff

    # Add overall average row
    overall_avg = pd.DataFrame([{
        "Pos1": "ALL",
        "AvgFantPoints": cut_data["AvgFantPoints"].mean(),
        "team": team_name,
        "date": ending_date,
        "WinPercentage": win_percentage,
        "PointDiff": point_diff
    }])

    # Append overall average row
    cut_data = pd.concat([cut_data, overall_avg], ignore_index=True)

    return cut_data


In [43]:
testing = cut_dataframe(final_dataframe,pd.to_datetime("2025-01-06"),"WAS")

testing

NameError: name 'final_dataframe' is not defined

In [44]:
def whole_team_dataframe(nba_data, team_name):
    start_date = pd.to_datetime("2024-10-25")
    end_date = nba_data["Date"].max()
    date_sequence = pd.date_range(start=start_date, end=end_date, freq="D")

    # Loop through dates and combine results into one dataframe
    full_team_df = pd.concat([cut_dataframe(nba_data, date, team_name) for date in date_sequence], ignore_index=True)

    return full_team_df


In [45]:
def generate_full_nba_dataframe(nba_data):
    nba_team_abbreviations = [
        "ATL", "BOS", "BRK", "CHO", "CHI", "CLE", "DAL", "DEN", "DET", "GSW",
        "HOU", "IND", "LAC", "LAL", "MEM", "MIA", "MIL", "MIN", "NOP", "NYK",
        "OKC", "ORL", "PHI", "PHO", "POR", "SAC", "SAS", "TOR", "UTA", "WAS"
    ]

    # Loop through each team abbreviation and combine results into one dataframe
    full_nba_df = pd.concat([whole_team_dataframe(nba_data, team) for team in nba_team_abbreviations], ignore_index=True)

    # Rank within each date & position
    full_nba_df["FantPointsAgainstRank"] = full_nba_df.groupby(["date", "Pos1"])["AvgFantPoints"].rank(method="min")

    return full_nba_df

In [None]:
teams = generate_full_nba_dataframe(final_dataframe)

In [None]:
teams

In [None]:
final_dataframe['Date'] = pd.to_datetime(final_dataframe['Date'])
teams['date'] = pd.to_datetime(teams['date'])




In [None]:
teams_unique = teams.drop_duplicates()
teams_unique

In [None]:
nba_data_merged = final_dataframe.merge(
    teams[['date', 'team', 'FantPointsAgainstRank', 'PointDiff','WinPercentage']],  
    left_on=['Date', 'OpponentTeamAbv'], 
    right_on=['date', 'team'], 
    how='left'
).drop(columns=['team','date'])
nba_data_merged

In [None]:
positional_ranks = teams.pivot(index=['date', 'team'], columns='Pos1', values='FantPointsAgainstRank')
positional_ranks = positional_ranks.reset_index()
positional_ranks

In [46]:
def add_opponent_info(MainDF):
    
    teams = generate_full_nba_dataframe(MainDF)
    
    print(MainDF)
    print(teams)
    print("Checkpoint1")


    MainDF['Date'] = pd.to_datetime(MainDF['Date'], format = 'mixed')
    teams['date'] = pd.to_datetime(teams['date'])

    teams_noposition =  teams.drop_duplicates(subset=['date', 'team'])
    nba_data_merged = MainDF.merge(
        teams_noposition[['date', 'team', 'FantPointsAgainstRank', 'PointDiff','WinPercentage']],  
        left_on=['Date', 'OpponentTeamAbv'], 
        right_on=['date', 'team'], 
        how='left'
    ).drop(columns=['team','date'])

    print(nba_data_merged)
    print("Checkpoint2")
    positional_ranks = teams.pivot(index=['date', 'team'], columns='Pos1', values='FantPointsAgainstRank')
    positional_ranks = positional_ranks.reset_index()

    nba_data_merged = nba_data_merged.merge(
        positional_ranks, 
        left_on=['Date', 'OpponentTeamAbv'], 
        right_on=['date', 'team'], 
        how='left'
    ).drop(columns=['team','date'])

    nba_data_merged.rename(columns = {'PointDiff':'OpponentPointDiff','WinPercentage':'OpponentWinPercentage','ALL':'OppFantDefense: ALL','C':'OppFantDefense: C','PF_y':'OppFantDefense: PF','PG':'OppFantDefense: PG','SF':'OppFantDefense: SF','SG':'OppFantDefense: SG', 'PF_x':'PF'}, inplace = True)

    
    print(nba_data_merged)
    print("Checkpoint3")
    
    
    return(nba_data_merged)

In [47]:
def update_full_data(rows_to_add = 10, add_all = False):
    existing_dataframe = pd.read_csv("../data/full_nba_data.csv")
    existing_dataframe = existing_dataframe[['Date','Starters','TeamName','WonGame','Injured','DidNotPlay','FantasyPoints','MPTimeDelta','MP','FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS','GmSc','+/-','TeamAbv','Year','Day','Month#','MonthName','Day_of_week','Home','GamePointDiff','Start(ET)','Overtime','Attend.','InSeasonTournament','Pos1','Pos2','Pos3','Pos4','Pos5','correct_positions','GameID','OpponentTeam','OpponentTeamAbv','InjTeamateCount','Starting']]
    existing_dataframe

    new_df = final_df(2025, existing_dataframe, rows_to_add, add_all)
    new_df.to_csv('../data/full_nba_data.csv')
    return new_df
    

In [None]:
existing_dataframe = stats
existing_dataframe
existing_dataframe.to_csv('../data/full_nba_data_2.csv')

In [48]:
def update_full_data_new(rows_to_add = 10, add_all = False):
    existing_dataframe = pd.read_csv("../data/full_nba_data.csv")

    existing_dataframe = existing_dataframe[['Date','Starters','TeamName','WonGame','Injured','DidNotPlay','FantasyPoints','MPTimeDelta','MP','FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS','GmSc','+/-','TeamAbv','Year','Day','Month#','MonthName','Day_of_week','Home','GamePointDiff','Start(ET)','Overtime','Attend.','InSeasonTournament','Pos1','Pos2','Pos3','Pos4','Pos5','correct_positions','GameID','OpponentTeam','OpponentTeamAbv','InjTeamateCount', 'Starting','Top7InTeam','PhotoLink']]
    existing_dataframe

    new_df = final_df(2025, existing_dataframe, rows_to_add, add_all)
    new_df['Date'] = pd.to_datetime(new_df['Date'], format = 'mixed')
    new_df = add_opponent_info(new_df)
    new_df = add_fantasy_averages_new_new(new_df)
    new_df = add_injury_counts(new_df)
    new_df.to_csv('../data/full_nba_data.csv')
    return new_df
    

In [None]:
existing_dataframe['MPTimeDelta'].dtypes

In [49]:
new_df_2 = update_full_data_new(add_all = True)
new_df_2

Done with game #0
         Date Start (ET)          AwayTeam  AwayPTS        HomeTeam  HomePTS  \
42 2025-02-06      7:30p  Dallas Mavericks    127.0  Boston Celtics    120.0   

    Overtime  Attend.  InSeasonTournament  Year  ... Month#  MonthName  \
42     False  19156.0               False  2025  ...      2   february   

   GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  HomePointDiff  \
42         True          BOS         DAL   False     True           -7.0   

    AwayPointDiff      GameID  
42            7.0  2025062BOS  

[1 rows x 21 columns]
Done with game #1
         Date Start (ET)         AwayTeam  AwayPTS                HomeTeam  \
43 2025-02-06      8:00p  Houston Rockets    114.0  Minnesota Timberwolves   

    HomePTS  Overtime  Attend.  InSeasonTournament  Year  ... Month#  \
43    127.0     False  18978.0               False  2025  ...      2   

    MonthName GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  \
43   february         True          MIN 

Done with game #15
         Date Start (ET)           AwayTeam  AwayPTS       HomeTeam  HomePTS  \
57 2025-02-08      7:00p  San Antonio Spurs    111.0  Orlando Magic    112.0   

    Overtime  Attend.  InSeasonTournament  Year  ... Month#  MonthName  \
57     False  19354.0               False  2025  ...      2   february   

   GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  HomePointDiff  \
57         True          ORL         SAS    True    False            1.0   

    AwayPointDiff      GameID  
57           -1.0  2025082ORL  

[1 rows x 21 columns]
Done with game #16
         Date Start (ET)       AwayTeam  AwayPTS            HomeTeam  HomePTS  \
58 2025-02-08      7:00p  Atlanta Hawks    125.0  Washington Wizards    111.0   

    Overtime  Attend.  InSeasonTournament  Year  ... Month#  MonthName  \
58     False  16835.0               False  2025  ...      2   february   

   GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  HomePointDiff  \
58         True         

Done with game #30
         Date Start (ET)           AwayTeam  AwayPTS       HomeTeam  HomePTS  \
72 2025-02-10      7:30p  Charlotte Hornets     89.0  Brooklyn Nets     97.0   

    Overtime  Attend.  InSeasonTournament  Year  ... Month#  MonthName  \
72     False  16013.0               False  2025  ...      2   february   

   GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  HomePointDiff  \
72         True          BRK         CHO    True    False            8.0   

    AwayPointDiff      GameID  
72           -8.0  2025102BRK  

[1 rows x 21 columns]
Done with game #31
         Date Start (ET)        AwayTeam  AwayPTS    HomeTeam  HomePTS  \
73 2025-02-10      7:30p  Boston Celtics    103.0  Miami Heat     85.0   

    Overtime  Attend.  InSeasonTournament  Year  ... Month#  MonthName  \
73     False  19961.0               False  2025  ...      2   february   

   GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  HomePointDiff  \
73         True          MIA         B

Done with game #45
         Date Start (ET)       AwayTeam  AwayPTS         HomeTeam  HomePTS  \
87 2025-02-12      7:30p  Atlanta Hawks    148.0  New York Knicks    149.0   

    Overtime  Attend.  InSeasonTournament  Year  ... Month#  MonthName  \
87      True  19812.0               False  2025  ...      2   february   

   GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  HomePointDiff  \
87         True          NYK         ATL    True    False            1.0   

    AwayPointDiff      GameID  
87           -1.0  2025122NYK  

[1 rows x 21 columns]
Done with game #46
         Date Start (ET)             AwayTeam  AwayPTS         HomeTeam  \
88 2025-02-12      7:30p  Cleveland Cavaliers    131.0  Toronto Raptors   

    HomePTS  Overtime  Attend.  InSeasonTournament  Year  ... Month#  \
88    108.0     False  16549.0               False  2025  ...      2   

    MonthName GameFinished  HomeTeamAbv AwayTeamAbv HomeWin  AwayWin  \
88   february         True          TOR         

Unnamed: 0,Date,Starters,TeamName,WonGame,Injured,DidNotPlay,FantasyPoints,MPTimeDelta,MP,FG,...,AvgStealsLast10,AvgBlocks,AvgBlocksLast10,AvgTurnovers,AvgTurnoversLast10,AvgThreePointers,AvgThreePointersLast10,PrevInjuredCount,OpponentInjTeamateCount,OpponentPrevInjuredCount
0,2024-10-22,Mikal Bridges,New York Knicks,False,False,False,10.0,0 days 00:34:37,34:37,7.0,...,,,,,,,,,4.0,
1,2024-10-22,OG Anunoby,New York Knicks,False,False,False,12.0,0 days 00:34:10,34:10,1.0,...,,,,,,,,3.0,4.0,
2,2024-10-22,Jalen Brunson,New York Knicks,False,False,False,10.5,0 days 00:24:30,24:30,9.0,...,,,,,,,,3.0,4.0,
3,2024-10-22,Josh Hart,New York Knicks,False,False,False,12.0,0 days 00:24:30,24:30,4.0,...,,,,,,,,3.0,4.0,
4,2024-10-22,Karl-Anthony Towns,New York Knicks,False,False,False,16.5,0 days 00:23:37,23:37,5.0,...,,,,,,,,3.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28424,2025-02-12,Vince Williams Jr.,Memphis Grizzlies,False,True,True,0.0,0 days 00:00:00,0,0,...,0.333333,0.333333,0.333333,1.333333,1.333333,0.500000,0.5,5.0,4.0,4.0
28425,2025-02-12,Drew Eubanks,Los Angeles Clippers,True,True,True,0.0,0 days 00:00:00,0,0,...,0.200000,0.825000,1.200000,0.775000,0.400000,0.075000,0.1,4.0,5.0,5.0
28426,2025-02-12,Trentyn Flowers,Los Angeles Clippers,True,True,True,0.0,0 days 00:00:00,0,0,...,0.000000,0.000000,0.000000,0.500000,0.500000,0.000000,0.0,4.0,5.0,5.0
28427,2025-02-12,Jordan Miller,Los Angeles Clippers,True,True,True,0.0,0 days 00:00:00,0,0,...,0.700000,0.166667,0.200000,0.733333,0.500000,0.266667,0.1,4.0,5.0,5.0


In [None]:
new_df_2.columns

In [None]:
new_df_2.columns

In [None]:
new_df_2.groupby('Starters')['FantasyPoints'].mean().reset_index().sort_values('FantasyPoints', ascending=False).head(10)

In [None]:
new_df_2.sort_values('FantasyPoints', ascending = False)

In [None]:
existing_dataframe = pd.read_csv("../data/full_nba_data.csv")
existing_dataframe

In [None]:
all_games = all_games_season(2025)
all_games

In [None]:
all_games.to_csv('../data/nba_game_list.csv')