In [69]:
import requests
import json
import pandas as pd
import os
from dotenv import load_dotenv
load_dotenv()
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler



In [70]:
should_download_data = False

# Get Games
Fetch games and their id's for multiple months to collect data

In [71]:
# Set common variables
api_key = os.environ['SPORTRADAR_API_KEY']
access_level = 'trial'
version = 'v8'
language_code = 'en'
season_year = '2022'

In [72]:
if should_download_data:
    season_schedule = f"https://api.sportradar.com/nba/{access_level}/{version}/{language_code}/games/{season_year}/REG/schedule.json?api_key={api_key}"
    response = requests.get(season_schedule)
    if response.status_code == 200:
        try:
            schedule_data = response.json()
        except ValueError as e:
            print("Error decoding JSON:", e)
    else:
        print("Failed to retrieve data: ", response.status_code)
    schedule_data = response.json()
    print(json.dumps(schedule_data, indent=4, sort_keys=True))

In [73]:
if should_download_data: 
    with (open("data/season_schedule.json", "w")) as f:
        json.dump(schedule_data, f, indent=4)

In [74]:
# Open the file
with open("data/season_schedule.json", "r") as f:
    schedule_data = json.load(f)

flat_data = schedule_data['games']

df = pd.DataFrame(flat_data)
df["target"] = df["home_points"] > df["away_points"]
df.head()

Unnamed: 0,id,status,coverage,scheduled,home_points,away_points,track_on_court,sr_id,reference,time_zones,venue,broadcasts,home,away,title,neutral_site,target
0,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,closed,full,2022-10-18T23:30:00Z,126.0,117.0,True,sr:match:35431579,22200001,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'id': '7d69b080-91ca-53c9-9302-45c1a72c5549',...","[{'network': 'TNT', 'type': 'TV', 'locale': 'N...","{'name': 'Boston Celtics', 'alias': 'BOS', 'id...","{'name': 'Philadelphia 76ers', 'alias': 'PHI',...",,,True
1,c665e441-9f38-48a7-8796-1f7292f97db1,closed,full,2022-10-19T02:00:00Z,123.0,109.0,True,sr:match:35431587,22200002,"{'venue': 'US/Pacific', 'home': 'US/Pacific', ...","{'id': '938016dc-9e1d-4abc-88f5-3a7d772332e6',...","[{'network': 'TNT', 'type': 'TV', 'locale': 'N...","{'name': 'Golden State Warriors', 'alias': 'GS...","{'name': 'Los Angeles Lakers', 'alias': 'LAL',...",,,True
2,1f8105fe-e6ca-475f-aea6-15b455892a9e,closed,full,2022-10-19T23:00:00Z,107.0,114.0,True,sr:match:35431591,22200004,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'id': '24bb478e-eb31-5f8a-8c8d-07f513169ec1',...","[{'network': 'NBCS-DC', 'type': 'TV', 'locale'...","{'name': 'Indiana Pacers', 'alias': 'IND', 'id...","{'name': 'Washington Wizards', 'alias': 'WAS',...",,,False
3,9f64a3b0-46bd-4caa-b5c3-4578e9bab8d8,closed,full,2022-10-19T23:00:00Z,113.0,109.0,True,sr:match:35431589,22200003,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'id': '5a9ddefc-2267-4fd1-8d6e-0f82163ce8bd',...","[{'network': 'BSDET', 'type': 'TV', 'locale': ...","{'name': 'Detroit Pistons', 'alias': 'DET', 'i...","{'name': 'Orlando Magic', 'alias': 'ORL', 'id'...",,,True
4,44b31f4a-bc51-41aa-b1d9-a2bfc95d4e0d,closed,full,2022-10-19T23:30:00Z,108.0,105.0,True,sr:match:35431599,22200008,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'id': '62cc9661-7b13-56e7-bf4a-bba7ad7be8da',...","[{'network': 'BSOH', 'type': 'TV', 'locale': '...","{'name': 'Toronto Raptors', 'alias': 'TOR', 'i...","{'name': 'Cleveland Cavaliers', 'alias': 'CLE'...",,,True


In [75]:

if should_download_data:
    def get_play_by_play(game_id):
        return f"https://api.sportradar.com/nba/{access_level}/{version}/{language_code}/games/{game_id}/pbp.json?api_key={api_key}"


    # Get every game id from the season schedule
    game_ids = []
    for game in schedule_data['games']:
        game_ids.append(game['id'])

    # Get play by play data for each game, add it to a list, and write the list to a json file
    play_by_play_data = []
    for game_id in game_ids:
        response = requests.get(get_play_by_play(game_id))
        if response.status_code == 200:
            try:
                play_by_play_data.append(response.json())
            except ValueError as e:
                print("Error decoding JSON:", e)
        else:
            print("Failed to retrieve data: ", response.status_code)

    with (open("data/play_by_play_data.json", "w")) as f:
        json.dump(play_by_play_data, f, indent=4)

In [76]:
# Get data from the json file and put it into a dataframe

with open('data/play_by_play_data.json') as f:
    data = json.load(f)


In [77]:
# Remove duplicates from data
# TODO: Run the data download again, and remove this line
data = [i for n, i in enumerate(data) if i not in data[n + 1:]]

game_level_data_copy = df.copy()

df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,status,coverage,scheduled,duration,attendance,lead_changes,times_tied,clock,quarter,track_on_court,reference,entry_mode,sr_id,clock_decimal,time_zones,home,away,periods,deleted_events,title,neutral_site
0,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,closed,full,2022-10-18T23:30:00+00:00,2:31,19156.0,8.0,7.0,00:00,4.0,True,22200001,WEBSOCKET,sr:match:35431579,00:00,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'name': 'Celtics', 'alias': 'BOS', 'market': ...","{'name': '76ers', 'alias': 'PHI', 'market': 'P...","[{'type': 'quarter', 'id': 'b2055122-2475-4178...",[{'id': '3a7522fd-dd18-4644-aa0f-fb8294b65c84'...,,
1,c665e441-9f38-48a7-8796-1f7292f97db1,closed,full,2022-10-19T02:00:00+00:00,2:33,18064.0,1.0,2.0,00:00,4.0,True,22200002,WEBSOCKET,sr:match:35431587,00:00,"{'venue': 'US/Pacific', 'home': 'US/Pacific', ...","{'name': 'Warriors', 'alias': 'GSW', 'market':...","{'name': 'Lakers', 'alias': 'LAL', 'market': '...","[{'type': 'quarter', 'id': '30829330-c8e3-4ff6...",[{'id': '625bb81a-0ede-4a8e-8f16-4cc5cc2704ad'...,,
2,1f8105fe-e6ca-475f-aea6-15b455892a9e,closed,full,2022-10-19T23:00:00+00:00,2:22,15027.0,0.0,0.0,00:00,4.0,True,22200004,WEBSOCKET,sr:match:35431591,00:00,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'name': 'Pacers', 'alias': 'IND', 'market': '...","{'name': 'Wizards', 'alias': 'WAS', 'market': ...","[{'type': 'quarter', 'id': 'e5c8501a-729a-4746...",[{'id': 'c36853e2-f892-42f7-a482-2bf4209d139a'...,,
3,9f64a3b0-46bd-4caa-b5c3-4578e9bab8d8,closed,full,2022-10-19T23:00:00+00:00,2:16,20190.0,7.0,4.0,00:00,4.0,True,22200003,WEBSOCKET,sr:match:35431589,00:00,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'name': 'Pistons', 'alias': 'DET', 'market': ...","{'name': 'Magic', 'alias': 'ORL', 'market': 'O...","[{'type': 'quarter', 'id': 'f2bf467f-85da-4a05...",[{'id': '50061ec0-25f9-4ab2-af46-8867d9565cc0'...,,
4,44b31f4a-bc51-41aa-b1d9-a2bfc95d4e0d,closed,full,2022-10-19T23:30:00+00:00,2:19,19800.0,16.0,17.0,00:00,4.0,True,22200008,WEBSOCKET,sr:match:35431599,00:00,"{'venue': 'US/Eastern', 'home': 'US/Eastern', ...","{'name': 'Raptors', 'alias': 'TOR', 'market': ...","{'name': 'Cavaliers', 'alias': 'CLE', 'market'...","[{'type': 'quarter', 'id': '08bea6a0-f82a-41a9...",[{'id': '16d692db-a7a7-43fc-9535-a3187cd349ab'...,,


In [78]:
def process_data(df):
    df_copy = df.copy()

    # List of columns to drop initially
    columns_to_drop = ["lead_changes", "times_tied", "track_on_court", "deleted_events", "title", "neutral_site", 'coverage', 'scheduled', "status", "attendance", "clock", "duration", "reference", "entry_mode", "sr_id", "clock_decimal", "time_zones", "quarter"]

    # List of team features to drop
    team_features_to_drop = ["name", "alias", "market", "id", "sr_id", "bonus", "remaining_timeouts", "reference", "points"]

    # Process home and away data
    for team in ['home', 'away']:
        team_data = pd.json_normalize(df_copy[team]).drop(team_features_to_drop, axis=1)
        team_data.columns = [f'{team}_{col}' for col in team_data.columns]
        df_copy = df_copy.join(team_data)

    # Create features for home win percentage and away win percentage based on the team's record
    for team in ['home', 'away']:
        df_copy[f'{team}_win_pct'] = df_copy[f"{team}_record.wins"] / (df_copy[f"{team}_record.wins"] + df_copy[f"{team}_record.losses"])
        columns_to_drop.extend([f"{team}_record.wins", f"{team}_record.losses"])

    # Drop the home, away and record columns from the original dataframe
    columns_to_drop.extend(["home", "away"])
    df_copy.drop(columns_to_drop, axis=1, inplace=True)
    df_copy["target"] = game_level_data_copy["target"]


    return df_copy

df_processed = process_data(df)
df_processed.head()

Unnamed: 0,id,periods,home_win_pct,away_win_pct,target
0,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,"[{'type': 'quarter', 'id': 'b2055122-2475-4178...",0.758621,0.555556,True
1,c665e441-9f38-48a7-8796-1f7292f97db1,"[{'type': 'quarter', 'id': '30829330-c8e3-4ff6...",0.482759,0.407407,True
2,1f8105fe-e6ca-475f-aea6-15b455892a9e,"[{'type': 'quarter', 'id': 'e5c8501a-729a-4746...",0.43038,0.435897,False
3,9f64a3b0-46bd-4caa-b5c3-4578e9bab8d8,"[{'type': 'quarter', 'id': 'f2bf467f-85da-4a05...",0.266667,0.310345,True
4,44b31f4a-bc51-41aa-b1d9-a2bfc95d4e0d,"[{'type': 'quarter', 'id': '08bea6a0-f82a-41a9...",0.464286,0.62069,True


In [79]:
import pandas as pd

def extract_periods(df):
    # Create an empty DataFrame to store the extracted data
    extracted_data = pd.DataFrame()

    features_to_keep = ['game_id','type', 'id', 'number', 'sequence', 'times_tied',
       'lead_changes', "events"]

    for i in range(len(df)):
        # Iterate through each game entry
        game_id = df['id'].iloc[i]
        periods = df['periods'].iloc[i]

        for period in periods:
            # For each period, flatten the period data and the nested 'scoring' data
            period_data = pd.json_normalize(period)
            scoring_data = pd.json_normalize(period['scoring'])

            # Combine period data with scoring data
            combined_data = pd.concat([period_data, scoring_data], axis=1)

            # Add game-level information (e.g., game ID)
            combined_data['game_id'] = game_id

            # Append to the extracted_data DataFrame
            extracted_data = pd.concat([extracted_data, combined_data], ignore_index=True)

    # Keep only the columns we want
    extracted_data = extracted_data[features_to_keep]

    return extracted_data

# Use the function to extract period data
period_data = extract_periods(df_processed)
# Display the first few rows
period_data.head()

Unnamed: 0,game_id,type,id,number,sequence,times_tied,lead_changes,events
0,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,quarter,b2055122-2475-4178-8643-87dc7c940e07,1,1,3.0,3,[{'id': '03c554f3-62e6-4746-82f5-d0b8f1733c20'...
1,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,quarter,54baa13c-4695-4c15-9a44-86e46a547d58,2,2,3.0,4,[{'id': 'd4e131e5-05b8-4ba4-9a19-f7032abfd7e1'...
2,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,quarter,dbb8fe7d-c9c4-4a30-a1a4-63ce1dc24dea,3,3,1.0,1,[{'id': '49815faa-6f74-474a-acbc-6c4379014ea1'...
3,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,quarter,6a988dec-8692-4717-996a-e468b8eba2f0,4,4,0.0,0,[{'id': 'dfc4f1b1-fb06-4ac7-ab39-2ebcaf1b8c3d'...
4,c665e441-9f38-48a7-8796-1f7292f97db1,quarter,30829330-c8e3-4ff6-8e23-811d3a33e1c1,1,1,2.0,1,[{'id': '9183a93e-e763-4db8-a508-c80be8545c35'...


In [80]:
from enum import Enum

class Team(Enum):
    HOME = 1
    AWAY = 0
    TIED = -1

In [102]:
from tqdm import tqdm
from collections import deque


def convert_to_timedelta(time_str):
    if '.' in time_str:
        # Format '00:ss.ms'
        minutes = '00'
        seconds, milliseconds = time_str.split('.')
        seconds = seconds.zfill(2)  # Ensure two digits in seconds
        time_formatted = f'{minutes}:{seconds}.{milliseconds}'
    else:
        # Format 'mm:ss'
        minutes, seconds = time_str.split(':')
        minutes = minutes.zfill(2)  # Ensure two digits in minutes
        time_formatted = f'00:{minutes}:{seconds}'

    return pd.to_timedelta(time_formatted)

def calculate_game_time_remaining(quarter_number, clock_decimal):
    try:
        if quarter_number <= 4:
            total_time_per_quarter = pd.to_timedelta('00:12:00')
            game_time_remaining = (4 - quarter_number) * total_time_per_quarter + clock_decimal
        else:
            total_time_per_quarter = pd.to_timedelta('00:05:00')
            game_time_remaining = (5 - quarter_number) * total_time_per_quarter + clock_decimal

        return game_time_remaining.total_seconds()
    except OverflowError as e:
        print(f"Error processing sequence {quarter_number}, clock_decimal {clock_decimal}: {e}")
        return pd.NaT
    
def calculate_scoring_run(recent_scoring, max_window_size=25):
    current_window_size = min(len(recent_scoring), max_window_size)
    if current_window_size > 0:  # Ensure at least one event is in the window
        recent_list = list(recent_scoring)  # Convert deque to list for slicing
        return max(recent_list[-current_window_size:]) - min(recent_list[-current_window_size:])
    return 0  # Default value if no events yet



def encode_leading_team(row):
    if pd.isna(row['current_leading_team_id']):
        return Team.TIED.value  # Tie
    elif row['current_leading_team_id'] == row['on_court.away.id']:
        return Team.AWAY.value  # Away team leads
    else:
        return Team.HOME.value  # Home team leads

def extract_events(df): 
    extracted_data = pd.DataFrame()
    one_hot_columns = set()  # Set to store unique one-hot encoded column names

    CLOSE_GAME_THRESHOLD = 5
    CRITICAL_TIME_THRESHOLD = 300
    RECENT_EVENTS_WINDOW = 25 # Number of events to keep in the recent scoring deque

    last_known_leading_team = None
    previous_score_difference = None
    lead_change_count = 0
    times_tied_count = 0
    recent_scoring = deque(maxlen=RECENT_EVENTS_WINDOW)


    # Add tqdm() around the range to create a progress bar
    for i in tqdm(range(len(df))):
        game_id = df['game_id'].iloc[i]
        events = df['events'].iloc[i]
        quarter_number = df['sequence'].iloc[i]
        # Remove events with event_type 'endquarter' and 'lineupchange'
        events = [event for event in events if event['event_type'] != 'endperiod']
        events = [event for event in events if event['event_type'] not in ['lineupchange', 'timeout', 'endtimeout', "teamtimeout", "flagrantone", "flagranttwo", "stoppage"]]


        for event in events:            
            event_data = pd.json_normalize(event)
            event_data['game_id'] = game_id
            event_data["quarter_number"] = quarter_number
            if 'possession.id' not in event_data.columns:
                event_data['possession.id'] = None


            # Time remaining in game feature
            event_data['clock_decimal'] = event_data['clock_decimal'].apply(convert_to_timedelta)
            event_data['game_time_remaining'] = event_data.apply(lambda x: calculate_game_time_remaining(x['quarter_number'], x['clock_decimal']), axis=1)
            event_data["current_score_difference"] = event_data["home_points"] - event_data["away_points"];
            event_data["current_leading_team_id"] = event_data.apply(lambda x: x["on_court.home.id"] if x["current_score_difference"] > 0 else x["on_court.away.id"] if x["current_score_difference"] < 0 else None, axis=1)
            
            current_score_difference = event_data["current_score_difference"].iloc[0]

            # Determine the current leading team
            if current_score_difference > 0:
                current_leading_team = Team.HOME.value
            elif current_score_difference < 0:
                current_leading_team = Team.AWAY.value
            else:
                current_leading_team = Team.TIED.value

            # Update lead change count
            if last_known_leading_team is not None and last_known_leading_team != Team.TIED and current_leading_team != Team.TIED and current_leading_team != last_known_leading_team:
                lead_change_count += 1

            if current_leading_team != Team.TIED:
                last_known_leading_team = current_leading_team
            # Add lead change count and times tied count to the DataFrame
            event_data['lead_changes'] = lead_change_count
            
            # Update times tied
            if previous_score_difference is not None and previous_score_difference != 0 and current_score_difference == 0:
                times_tied_count += 1
            previous_score_difference = current_score_difference
            event_data['times_tied'] = times_tied_count

            
            # Recent scoring
            recent_scoring.append(event_data["current_score_difference"].iloc[0])
            event_data['scoring_run'] = calculate_scoring_run(recent_scoring)
                        
            event_data['is_close_game'] = event_data['current_score_difference'].abs() <= CLOSE_GAME_THRESHOLD
            
            try:
                event_data['critical_possession'] = event_data['is_close_game'] & (event_data['game_time_remaining'] <= CRITICAL_TIME_THRESHOLD) & (event_data['possession.id'] == event_data['current_leading_team_id'])
            except KeyError: 
                event_data['critical_possession'] = False

            event_data['home_win_pct'] = df_processed[df_processed['id'] == game_id]['home_win_pct'].iloc[0]
            event_data['away_win_pct'] = df_processed[df_processed['id'] == game_id]['away_win_pct'].iloc[0]

            event_data["target"] = df_processed[df_processed["id"] == game_id]["target"].iloc[0]
            
            event_data['home_possession'] = (event_data['possession.id'] == event_data['on_court.home.id']).astype(int)
            event_data['away_possession'] = (event_data['possession.id'] == event_data['on_court.away.id']).astype(int)
            event_data['no_possession'] = event_data['possession.id'].isna().astype(int)

            event_data['encoded_leading_team'] = event_data.apply(encode_leading_team, axis=1)

            event_data.drop(['clock_decimal'], axis=1)
            extracted_data = pd.concat([extracted_data, event_data], ignore_index=True)

    extracted_data = pd.get_dummies(extracted_data, columns=['event_type'], dummy_na=False)


    # Apply normalization and standardization
    # Initialize scalers
    minmax_scaler = MinMaxScaler()
    standard_scaler = StandardScaler()

    # Normalize and standardize
    features_to_normalize = ['game_time_remaining', 'home_win_pct', 'away_win_pct']
    features_to_standardize = ['home_points', 'away_points', 'current_score_difference', 'lead_changes', 'times_tied', 'scoring_run']

    # Apply normalization
    extracted_data[features_to_normalize] = minmax_scaler.fit_transform(extracted_data[features_to_normalize])
    # Apply standardization
    extracted_data[features_to_standardize] = standard_scaler.fit_transform(extracted_data[features_to_standardize])

    features_to_keep = ['id', 'number', 'home_points', 'away_points','on_court.home.players',
        'on_court.away.players', 'game_id', 'statistics', 'attempt', "quarter_number",
         "game_time_remaining", "current_score_difference", "is_close_game", "critical_possession",
         'lead_changes', 'times_tied', "scoring_run", "home_win_pct", "away_win_pct", "target", "home_possession", "away_possession", "no_possession", "encoded_leading_team" ] + [col for col in extracted_data.columns if col.startswith('event_type_')]
    features_to_add_later = ["on_court.home.players", "on_court.away.players", "statistics", "attempt"]
    extracted_data = extracted_data[features_to_keep]
    return extracted_data.drop(features_to_add_later + "event_type_opentip", axis=1)

event_data = extract_events(period_data[:20])


100%|██████████| 20/20 [00:08<00:00,  2.40it/s]


In [103]:
# Display head for the where event_type_delay is true 
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

event_data.head(30)




Unnamed: 0,id,number,home_points,away_points,game_id,quarter_number,game_time_remaining,current_score_difference,is_close_game,critical_possession,lead_changes,times_tied,scoring_run,home_win_pct,away_win_pct,target,home_possession,away_possession,no_possession,encoded_leading_team,event_type_defensivegoaltending,event_type_delay,event_type_freethrowmade,event_type_freethrowmiss,event_type_jumpball,event_type_offensivefoul,event_type_opentip,event_type_personalfoul,event_type_rebound,event_type_review,event_type_shootingfoul,event_type_technicalfoul,event_type_threepointmade,event_type_threepointmiss,event_type_turnover,event_type_twopointmade,event_type_twopointmiss
0,e613ea8b-0023-45c6-b22f-932821d0b31f,4.0,-1.575423,-1.68012,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,1.0,-0.072472,True,False,-1.713418,-1.668059,-1.80404,1.0,0.790123,True,0,1,0,-1,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,6fe0abb4-7cd5-4676-8f98-5a98a790048c,7.0,-1.575423,-1.68012,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.992361,-0.072472,True,False,-1.713418,-1.668059,-1.80404,1.0,0.790123,True,0,0,1,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,3eedb0e6-f4ce-4752-8ca7-c0bcbefdb61f,9.0,-1.575423,-1.68012,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.991319,-0.072472,True,False,-1.713418,-1.668059,-1.80404,1.0,0.790123,True,0,1,0,-1,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,20f74c6a-f1eb-4283-b765-a65c8551f7f6,10.0,-1.575423,-1.68012,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.991319,-0.072472,True,False,-1.713418,-1.668059,-1.80404,1.0,0.790123,True,1,0,0,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,836add5b-8258-4f7e-a6b6-353c5a389e2a,11.0,-1.518337,-1.68012,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.984374,0.156847,True,False,-1.664676,-1.668059,-1.1909,1.0,0.790123,True,0,1,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
5,0bd9b470-1ed7-4d19-a0c1-0e78823ff6df,12.0,-1.518337,-1.68012,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.980902,0.156847,True,False,-1.664676,-1.668059,-1.1909,1.0,0.790123,True,0,0,1,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
6,b18cf88d-1e45-4b52-aee2-b266d3d87547,13.0,-1.518337,-1.68012,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.980902,0.156847,True,False,-1.664676,-1.668059,-1.1909,1.0,0.790123,True,0,1,0,1,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
7,e3628c17-03f8-4bc3-9b8a-021b70d9affe,14.0,-1.518337,-1.618535,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.980208,-0.072472,True,False,-1.615933,-1.544884,-1.1909,1.0,0.790123,True,1,0,0,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
8,7b266649-5da5-4f4d-8a87-0e00ba13534f,15.0,-1.432709,-1.618535,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.974305,0.271506,True,False,-1.567191,-1.544884,-0.88433,1.0,0.790123,True,0,1,0,1,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
9,eb29fe19-3d72-4141-a91f-a6e85346dcb6,17.0,-1.432709,-1.618535,9f01b268-29c5-4f0e-bf67-21e3dbcf3005,1,0.969791,0.271506,True,False,-1.567191,-1.544884,-0.88433,1.0,0.790123,True,0,0,1,1,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [299]:
# # Create a function to extract the player data
# def extract_player_data(df):
#     extracted_data = pd.DataFrame()
#     # The df passed in is the event_data DataFrame
#     for i in tqdm(range(len(df))):
#         game_id = df['game_id'].iloc[i]
#         quarter_number = df['quarter_number'].iloc[i]
        
#         for event in df:
#             # For each event, flatten the player data
#             player_data = pd.json_normalize(event['statistics'])
#             # Add game-level information (e.g., game ID)
#             player_data['game_id'] = game_id
#             player_data["quarter_number"] = quarter_number
#             # Append to the extracted_data DataFrame
#             extracted_data = pd.concat([extracted_data, player_data], ignore_index=True)


# # Use the function to extract player data
# player_data = extract_player_data(event_data)
# # Display the first few rows
# player_data.head()

# Data Preprocessing

In [None]:
df = event_data.copy()

features_to_drop = ["game_id", "id", "number"]

df.drop()