# Web Scraping

In [1]:
import pandas as pd
import time
import logging
import gc
from concurrent.futures import ThreadPoolExecutor, as_completed
from nba_api.stats.endpoints import boxscoreplayertrackv2, playerdashptshots, leaguegamefinder, boxscoretraditionalv2, playbyplayv2, defensehub


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def get_games_for_season(season):
    """Return the unique game IDs found for ``season``.

    The lookup goes through ``LeagueGameFinder`` with league id ``'00'`` and
    season type ``'Regular Season'``; the unique values of the ``GAME_ID``
    column of the first returned frame are handed back.

    Any exception raised during the lookup is logged and an empty list is
    returned instead.
    """
    logging.info(f"Fetching games for season {season}")
    query = {
        'season_nullable': season,
        'league_id_nullable': '00',
        'season_type_nullable': 'Regular Season',
    }
    try:
        frames = leaguegamefinder.LeagueGameFinder(**query).get_data_frames()
        unique_ids = frames[0]['GAME_ID'].unique()
    except Exception as e:
        logging.error(f"Error fetching games for season {season}: {e}")
        return []
    return unique_ids


def get_players_for_game(game_id):
    """Return the distinct (PLAYER_ID, TEAM_ID) pairs from a game's box score.

    Parameters:
    - game_id: Identifier of the game passed to ``BoxScoreTraditionalV2``.

    Returns:
    - pandas.DataFrame with the columns ``PLAYER_ID`` and ``TEAM_ID``, one row
      per distinct pair. If the request or the column selection fails, the
      error is logged and an empty DataFrame with those same two columns is
      returned, so callers can always iterate over the result.
    """
    logging.info(f"Fetching player data for game {game_id}")
    try:
        boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
        players_df = boxscore.player_stats.get_data_frame()
        return players_df[['PLAYER_ID', 'TEAM_ID']].drop_duplicates()
    except Exception as e:
        logging.error(f"Error fetching player data for game {game_id}: {e}")
        # Return an empty frame rather than falling through to an implicit
        # None: callers iterate over the result with .iterrows().
        return pd.DataFrame(columns=['PLAYER_ID', 'TEAM_ID'])

def get_player_tracking_data(game_id):
    """Return the player-stats frame of ``BoxScorePlayerTrackV2`` for a game.

    Failures are logged and turned into an empty DataFrame.
    """
    logging.info(f"Fetching tracking data for game {game_id}")
    try:
        endpoint = boxscoreplayertrackv2.BoxScorePlayerTrackV2(game_id=game_id)
        player_stats_df = endpoint.player_stats.get_data_frame()
    except Exception as e:
        logging.error(f"Error fetching tracking data for game {game_id}: {e}")
        player_stats_df = pd.DataFrame()
    return player_stats_df


def get_advanced_shot_data(player_id, team_id, season):
    """Return a player's shot dashboard data for one team and season.

    The ``overall`` and ``shot_clock_shooting`` result sets of
    ``PlayerDashPtShots`` are stacked row-wise (in that order) with a fresh
    index. Failures are logged and turned into an empty DataFrame.
    """
    logging.info(f"Fetching advanced shot data for player {player_id}, team {team_id}, season {season}")
    try:
        endpoint = playerdashptshots.PlayerDashPtShots(player_id=player_id, team_id=team_id, season=season)
        frames = [
            endpoint.overall.get_data_frame(),
            endpoint.shot_clock_shooting.get_data_frame(),
        ]
        return pd.concat(frames, ignore_index=True)
    except Exception as e:
        logging.error(f"Error fetching advanced shot data for player {player_id} and team {team_id}: {e}")
        return pd.DataFrame()


def get_additional_features_for_game(game_id, season):
    """Collect tracking and shot-dashboard features for every player in a game.

    Parameters:
    - game_id: Identifier of the game to fetch tracking data and players for.
    - season: Season string forwarded to the per-player shot data request.

    Returns:
    - pandas.DataFrame: the tracking data inner-joined with the combined shot
      data on ``PLAYER_ID`` when both are available; the combined shot data
      alone when there is no tracking data; an empty DataFrame when no shot
      data could be collected.
    """
    tracking_df = get_player_tracking_data(game_id)
    player_team_df = get_players_for_game(game_id)
    all_shot_data = []

    # The player lookup can come back as None when the request fails; treat
    # that as "no players" instead of crashing on .iterrows().
    if player_team_df is None:
        logging.warning(f"No player data available for game {game_id}; skipping shot data")
        player_team_df = pd.DataFrame(columns=['PLAYER_ID', 'TEAM_ID'])

    for _, row in player_team_df.iterrows():
        shot_df = get_advanced_shot_data(
            player_id=row['PLAYER_ID'],
            team_id=row['TEAM_ID'],
            season=season,
        )
        if not shot_df.empty:
            all_shot_data.append(shot_df)
        time.sleep(1)  # Avoid API rate limits

    if not all_shot_data:
        return pd.DataFrame()

    combined_df = pd.concat(all_shot_data, ignore_index=True)
    if tracking_df.empty:
        return combined_df
    return pd.merge(tracking_df, combined_df, on="PLAYER_ID", how="inner")

def get_play_by_play_data(game_id):
    """Return the first data frame of ``PlayByPlayV2`` for a game.

    Failures are logged and turned into an empty DataFrame.
    """
    logging.info(f"Fetching play-by-play data for game {game_id}")
    try:
        frames = playbyplayv2.PlayByPlayV2(game_id=game_id).get_data_frames()
        first_frame = frames[0]
    except Exception as e:
        logging.error(f"Error fetching play-by-play data for game {game_id}: {e}")
        first_frame = pd.DataFrame()
    return first_frame


def get_defensive_data(season):
    """Return the ``DefenseHub`` stat tables for a season, side by side.

    The result sets ``defense_hub_stat1`` to ``defense_hub_stat7`` and
    ``defense_hub_stat9`` are fetched in that order and concatenated
    column-wise. Failures are logged and turned into an empty DataFrame.
    """
    logging.info(f"Fetching defense data for season {season}")
    # Same selection as before: stats 1-7 and 9 (stat 8 is not requested).
    stat_numbers = (1, 2, 3, 4, 5, 6, 7, 9)
    try:
        hub = defensehub.DefenseHub(season=season)
        stat_frames = [
            getattr(hub, f"defense_hub_stat{number}").get_data_frame()
            for number in stat_numbers
        ]
        return pd.concat(stat_frames, axis=1)
    except Exception as e:
        logging.error(f"Error fetching defense data for season {season}: {e}")
        return pd.DataFrame()

# Function to process a batch of games and save in chunks
def process_and_save_chunk(season, game_ids, defense_data, chunk_idx):
    """Build the merged data for a batch of games and write it to one CSV.

    For each game with play-by-play data and non-empty game features, the
    features are inner-joined with the play-by-play frame on ``GAME_ID`` and,
    when ``defense_data`` is not empty, left-joined with it
    (``TEAM_ID`` against ``DEFENSE_TEAM_ID``). The per-game results are
    concatenated and saved as ``season_{season}_chunk_{chunk_idx}.csv``.
    Nothing is written when no game produced data.
    """
    logging.info(f"Processing chunk {chunk_idx} for season {season}")
    merged_games = []

    for game_id in game_ids:
        pbp_df = get_play_by_play_data(game_id)
        if pbp_df.empty:
            continue

        features_df = get_additional_features_for_game(game_id, season)
        if features_df.empty:
            continue

        game_df = pd.merge(features_df, pbp_df, on="GAME_ID", how="inner")
        if not defense_data.empty:
            game_df = pd.merge(game_df, defense_data, left_on="TEAM_ID", right_on="DEFENSE_TEAM_ID", how="left")
        merged_games.append(game_df)

    if not merged_games:
        return

    chunk_df = pd.concat(merged_games, ignore_index=True)
    chunk_df.to_csv(f'season_{season}_chunk_{chunk_idx}.csv', index=False)
    logging.info(f"Saved chunk {chunk_idx} for season {season} with {chunk_df.shape[0]} rows.")
    # Drop the chunk explicitly and collect before the next chunk is built.
    del chunk_df
    gc.collect()

def process_season(season):
    """Fetch a season's games and defense data, then process them in chunks.

    The game IDs are split into consecutive batches of 20; each batch is
    passed to ``process_and_save_chunk`` together with its zero-based index.
    """
    logging.info(f"Processing season {season}")
    game_ids = get_games_for_season(season)
    defense_data = get_defensive_data(season)
    chunk_size = 20

    chunk_starts = range(0, len(game_ids), chunk_size)
    for chunk_idx, start in enumerate(chunk_starts):
        batch = game_ids[start:start + chunk_size]
        process_and_save_chunk(season, batch, defense_data, chunk_idx)

def run_extraction_multithreaded(seasons):
    """Run ``process_season`` for every season on a pool of four threads.

    An exception raised by a season's task is logged when that task
    completes; it does not stop the remaining seasons from being awaited.
    """
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = [pool.submit(process_season, one_season) for one_season in seasons]
        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                logging.error(f"Error processing season data: {e}")

# Seasons to extract. The list must be bound to the same name that is passed
# to run_extraction_multithreaded below.
seasons = ["2023-24"]

run_extraction_multithreaded(seasons)


2024-10-21 23:14:27,708 - INFO - Processing season 2023-24
2024-10-21 23:14:27,708 - INFO - Fetching games for season 2023-24
2024-10-21 23:14:27,990 - INFO - Fetching defense data for season 2023-24
2024-10-21 23:14:28,176 - ERROR - Error fetching defense data for season 2023-24: Expecting value: line 1 column 1 (char 0)
2024-10-21 23:14:28,178 - INFO - Processing chunk 0 for season 2023-24
2024-10-21 23:14:28,179 - INFO - Fetching play-by-play data for game 0022301192
2024-10-21 23:14:28,470 - INFO - Fetching tracking data for game 0022301192
2024-10-21 23:14:28,712 - INFO - Fetching player data for game 0022301192
2024-10-21 23:14:28,911 - INFO - Fetching advanced shot data for player 1628969, team 1610612751, season 2023-24
2024-10-21 23:14:30,065 - INFO - Fetching advanced shot data for player 1641730, team 1610612751, season 2023-24
2024-10-21 23:14:31,228 - INFO - Fetching advanced shot data for player 1629651, team 1610612751, season 2023-24
2024-10-21 23:14:32,363 - INFO - Fet

In [9]:
import os
def merge_csv_files(output_filename, folder_path='.', file_pattern='season_2023-24_chunk_'):
    """
    Merges multiple CSV files based on a file name prefix and saves the merged file to a specified output file.

    Files are merged in a deterministic order: names whose text between the
    prefix and '.csv' is a number come first, sorted numerically (so chunk 10
    follows chunk 9), followed by any other matching names in alphabetical order.

    Parameters:
    - output_filename (str): The name of the output CSV file to save the merged data.
    - folder_path (str): The folder where the CSV files are located. Default is current directory.
    - file_pattern (str): The prefix that the CSV file names must start with. Default is 'season_2023-24_chunk_'.

    Returns:
    - None: Saves the merged CSV file to the specified output filename. Errors are
      printed rather than raised.
    """
    def _chunk_sort_key(name):
        # Text between the prefix and the '.csv' extension.
        suffix = name[len(file_pattern):-len('.csv')]
        if suffix.isdecimal():
            return (0, int(suffix), '')
        return (1, 0, suffix)

    try:
        # Step 1: List the matching files. os.listdir gives no ordering
        # guarantee, so sort explicitly to make the merged output reproducible.
        file_list = sorted(
            (f for f in os.listdir(folder_path) if f.startswith(file_pattern) and f.endswith('.csv')),
            key=_chunk_sort_key,
        )

        if not file_list:
            print(f"No files matching the pattern '{file_pattern}' were found in the folder '{folder_path}'.")
            return

        print(f"Found {len(file_list)} files to merge.")

        # Step 2: Load each CSV file into a DataFrame and concatenate them
        df_list = [pd.read_csv(os.path.join(folder_path, file)) for file in file_list]
        final_df = pd.concat(df_list, ignore_index=True)

        # Step 3: Save the final merged DataFrame to a single CSV
        final_df.to_csv(output_filename, index=False)

        print(f"Merged data saved as '{output_filename}'. Shape: {final_df.shape}")

    except Exception as e:
        print(f"An error occurred: {e}")
