# Extract players in a game
The aim of this notebook is to identify the players who played in a specific game. This information is not directly available in either of the two data sources (the Kaggle and CMU datasets), but Kaggle provides play-by-play data for each game, which records every point scored and every block or steal made by a player. We parse these records and identify the unique players in each game.

In [2]:
import pandas as pd

In [3]:
def combinePlayerTeam(row, playerCol, teamCol):
    """Build "<player>_<team>" labels from two index-aligned sequences of a row.

    Args:
        row: Object indexable by column name (e.g. a DataFrame row) whose
            entries under ``playerCol`` and ``teamCol`` are sequences.
        playerCol: Key of the sequence holding the player identifiers.
        teamCol: Key of the sequence holding the team identifiers; its length
            determines how many labels are produced.

    Returns:
        A list with one ``str(player) + "_" + str(team)`` label per position
        of the team sequence.
    """
    players = row[playerCol]
    teams = row[teamCol]
    return [
        "_".join((str(players[position]), str(team)))
        for position, team in enumerate(teams)
    ]

def getPlayerInGame(pbp):
    """Return the unique (player, team) pairs that appear in each game.

    Every play-by-play row can reference up to three players
    (``player1``..``player3``), each with an id column and a team-abbreviation
    column. The three slots are stacked, pairs with a missing game, player or
    team are discarded, and the remaining pairs are de-duplicated per game.

    Args:
        pbp: Play-by-play DataFrame containing ``game_id`` plus the columns
            ``playerN_id`` and ``playerN_team_abbreviation`` for N in 1..3.

    Returns:
        DataFrame indexed by ``game_id`` with the string columns
        ``player_id`` and ``team_id``, sorted by game, team and player so the
        output is reproducible from run to run. An empty input yields an
        empty frame with the same columns.
    """
    slotFrames = []
    for slot in ("player1", "player2", "player3"):
        slotFrames.append(
            pd.DataFrame(
                {
                    "game_id": pbp["game_id"],
                    # Cast to object so stacking the slots never upcasts one
                    # slot's integer ids to floats because another slot has
                    # missing values.
                    "player_id": pbp[slot + "_id"].astype(object),
                    "team_id": pbp[slot + "_team_abbreviation"].astype(object),
                }
            )
        )
    pairs = pd.concat(slotFrames, ignore_index=True)

    # A slot with no player has a missing team abbreviation; stringifying it
    # would create phantom entries such as "0_nan", so drop those rows before
    # converting the identifiers to text.
    pairs = pairs.dropna(subset=["game_id", "player_id", "team_id"]).assign(
        player_id=lambda frame: frame["player_id"].map(str),
        team_id=lambda frame: frame["team_id"].map(str),
    )

    pairs = pairs.drop_duplicates().sort_values(["game_id", "team_id", "player_id"])
    return pairs.set_index("game_id")[["player_id", "team_id"]]

gameToPlayerChunks = []

# Process the play-by-play file in chunks of 10000 rows; our machines can't
# process this amount of data at once.
for count, pbpChunk in enumerate(
    pd.read_csv('Kaggle Data/play_by_play.csv', chunksize=10000), start=1
):
    gameToPlayerChunks.append(getPlayerInGame(pbpChunk))
    # Every 100 chunks, save a cumulative snapshot of everything processed so
    # far, so a long run leaves partial results on disk.
    if count % 100 == 0:
        snapshot = pd.concat(gameToPlayerChunks)
        # Integer division keeps the file name as chunk_1.csv rather than
        # chunk_1.0.csv.
        snapshot.to_csv("Chunks/chunk_" + str(count // 100) + ".csv")

gameToPlayer = pd.concat(gameToPlayerChunks)
# A game whose rows straddle a chunk boundary is reported by both chunks, so
# drop the repeated (game, player, team) rows while keeping game_id as index.
gameToPlayer = gameToPlayer.reset_index().drop_duplicates().set_index("game_id")
gameToPlayer.head()

In [6]:
# Persist the final game-to-player table to gameToPlayer.csv in the working
# directory. to_csv writes the index as the first column by default.
gameToPlayer.to_csv("gameToPlayer.csv")