https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html?highlight=group

https://stackoverflow.com/questions/36392735/how-to-combine-multiple-rows-into-a-single-row-with-pandas

In [1]:
import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import freeman

## Carregando os datasets

In [2]:
# Read the three raw CSV tables (at-bats, games, player names) from the
# working directory, in that order.
atbats_csv, games_csv, player_names_csv = (
    pd.read_csv(csv_path)
    for csv_path in ('atbats.csv', 'games.csv', 'player_names.csv')
)

In [None]:
# Attach the game-level columns to every at-bat row; the inner join keeps
# only at-bats whose g_id also appears in games.csv.
merged = pd.merge(games_csv, atbats_csv, how="inner", on=["g_id"])
# Group by the scalar label rather than the one-element list ['g_id']: with a
# list, recent pandas versions treat group keys as 1-tuples (and deprecate
# scalar get_group lookups), while the cells below look groups up by plain id.
games = merged.groupby('g_id')

Criando Rede

In [None]:
# Per-game rosters: games_teams[g_id]["home_team" | "away_team"] holds the
# team name plus the ids of its batters and pitchers in that game.
games_teams = {}
for gameId in games.groups:
    game = games.get_group(gameId)
    home = {"name": game.home_team.values[0], "batters": [], "pitchers": []}
    away = {"name": game.away_team.values[0], "batters": [], "pitchers": []}
    games_teams[gameId] = {"home_team": home, "away_team": away}

    # A batter with any at-bat flagged `top` is filed under the away team,
    # otherwise under the home team. The explicit `== True` comparison is
    # kept on purpose so non-boolean values behave as before.
    for batterId, batter in game.groupby(by="batter_id"):
        batting_side = away if batter.top.max() == True else home
        batting_side["batters"].append(batterId)

    # Pitchers are the mirror image: pitching in a `top` at-bat means the
    # pitcher is filed under the home team.
    for pitcherId, pitcher in game.groupby(by="pitcher_id"):
        pitching_side = home if pitcher.top.max() == True else away
        pitching_side["pitchers"].append(pitcherId)

In [None]:
# Flat per-game rosters: game_data[g_id][team_name] is one list mixing the
# ids of that team's batters and pitchers (duplicates possible at this stage).
game_data = {}
for gameId in games.groups:
    game = games.get_group(gameId)
    home_name = game.home_team.values[0]
    away_name = game.away_team.values[0]
    rosters = {home_name: [], away_name: []}
    game_data[gameId] = rosters

    # Batters seen in a `top` at-bat go to the away team, the rest to the
    # home team. `== True` is kept explicit so non-boolean values behave as
    # before.
    for batterId, batter in game.groupby(by="batter_id"):
        batting_team = away_name if batter.top.max() == True else home_name
        rosters[batting_team].append(batterId)

    # Pitchers seen in a `top` at-bat go to the home team, the rest to the
    # away team.
    for pitcherId, pitcher in game.groupby(by="pitcher_id"):
        pitching_team = home_name if pitcher.top.max() == True else away_name
        rosters[pitching_team].append(pitcherId)

In [None]:
# Every id in the "id" column of player_names.csv, as an array; used below so
# that players without any teammate link still get a node in the network.
player_ids = player_names_csv["id"].values

In [None]:

# Build the undirected teammate network as an adjacency mapping:
# connections[player] is the set of partners for which `player` is the stored
# source of the edge. Each pair is stored in one direction only, so the GML
# writer emits every edge once.

# Drop duplicate ids within each roster (a player can be listed both as a
# batter and as a pitcher for the same team).
for teams in game_data.values():
    for team, players in teams.items():
        teams[team] = list(set(players))

connections = {}
for teams in game_data.values():
    for players in teams.values():
        # Visit every player, including the last ones. The previous bound
        # (len(players) - 2) never linked the final two roster members and
        # left trailing players without a node entry of their own.
        for index, player in enumerate(players):
            player_connections = connections.setdefault(player, set())
            for partner in players[index + 1:]:
                # Skip the pair if it is already stored as partner -> player.
                if player not in connections.get(partner, ()):
                    player_connections.add(partner)

# Players who never shared a roster still become isolated nodes. An empty set
# is used so every value in `connections` has the same type.
for player in player_ids:
    connections.setdefault(player, set())

        

In [None]:
# Serialise the adjacency mapping as an undirected GML graph: one `node`
# entry per key of `connections`, then one `edge` entry per stored partner,
# and write the result to network.gml.
node_entries = [
    f"\n\tnode [\n\t\tid {player}\n\t]"
    for player in connections
]
edge_entries = [
    f'\n\tedge [\n\t\tsource {player}\n\t\ttarget {connection}\n\t]'
    for player, player_connections in connections.items()
    for connection in player_connections
]
gml = ''.join(
    ['graph [\n\tdirected 0\n', *node_entries, '\n', *edge_entries, '\n]']
)
with open('network.gml', 'w') as f:
    f.write(gml)

In [None]:
# Read network.gml back in through freeman. `g` holds whatever freeman.load
# returns (presumably the graph object; confirm against the freeman docs).
g = freeman.load('network.gml')

Extraindo dados de WHIP e ERA

In [13]:
# Show the distinct at-bat outcome labels in the `event` column; the pitcher
# tally further down matches on these exact strings.
atbats_csv.event.unique()

array(['Groundout', 'Double', 'Single', 'Strikeout', 'Walk', 'Runner Out',
       'Flyout', 'Forceout', 'Pop Out', 'Intent Walk', 'Lineout',
       'Home Run', 'Triple', 'Hit By Pitch', 'Grounded Into DP',
       'Sac Bunt', 'Fielders Choice', 'Bunt Groundout', 'Field Error',
       'Double Play', 'Sac Fly', 'Fielders Choice Out', 'Bunt Pop Out',
       'Catcher Interference', 'Strikeout - DP', 'Batter Interference',
       'Sac Fly DP', 'Bunt Lineout', 'Sacrifice Bunt DP', 'Triple Play'],
      dtype=object)

In [14]:
# Tally, per pitcher, the at-bats faced, walks and hits allowed, and innings
# pitched (IP), derived from the outcome label of every at-bat.

# NOTE(review): 'Hit By Pitch' is tallied under 'walks', as before. The usual
# WHIP definition does not count hit batters; confirm this is intended.
WALK_EVENTS = {'Walk', 'Intent Walk', 'Hit By Pitch'}
HIT_EVENTS = {'Double', 'Single', 'Triple', 'Home Run'}

# Outs credited to the pitcher for each outcome label. 'Pop Out' is spelt as
# it appears in the data; the earlier 'Pop out' spelling never matched, so
# pop outs were missing from IP.
OUTS_BY_EVENT = {
    **dict.fromkeys(
        ['Groundout', 'Strikeout', 'Runner Out', 'Flyout', 'Forceout',
         'Pop Out', 'Lineout', 'Sac Bunt', 'Bunt Groundout', 'Sac Fly',
         'Fielders Choice Out', 'Bunt Pop Out', 'Bunt Lineout'],
        1,
    ),
    **dict.fromkeys(
        ['Double Play', 'Grounded Into DP', 'Strikeout - DP', 'Sac Fly DP',
         'Sacrifice Bunt DP'],
        2,
    ),
    'Triple Play': 3,
}

pitcher_data = {}
# Walk the two needed columns in lockstep instead of materialising a Series
# per row with iterrows().
for pitcher_id, at_bat_result in zip(atbats_csv['pitcher_id'],
                                     atbats_csv['event']):
    stats = pitcher_data.get(pitcher_id)
    if stats is None:
        stats = {'walks': 0, 'hits': 0, 'at-bats': 0, 'IP': 0}
        pitcher_data[pitcher_id] = stats

    stats['at-bats'] += 1
    if at_bat_result in WALK_EVENTS:
        stats['walks'] += 1
    elif at_bat_result in HIT_EVENTS:
        stats['hits'] += 1
    else:
        # Labels outside all three tables (e.g. 'Field Error',
        # 'Fielders Choice', the interference events) only count as an
        # at-bat faced.
        outs = OUTS_BY_EVENT.get(at_bat_result)
        if outs:
            stats['IP'] += outs / 3
    

In [18]:
# WHIP = (walks + hits) / innings pitched, stored on each pitcher's record.
# A pitcher with no recorded outs has IP == 0, so WHIP is undefined and set
# to None. Checking that case explicitly replaces the bare `except:`, which
# would also have hidden unrelated errors such as a missing key.
for stats in pitcher_data.values():
    innings_pitched = stats['IP']
    if innings_pitched:
        stats['WHIP'] = (stats['walks'] + stats['hits']) / innings_pitched
    else:
        stats['WHIP'] = None

In [20]:
# Flatten pitcher_data into a table with one row per pitcher: the id first,
# then one column per tallied statistic, in a fixed order.
stat_columns = ('walks', 'hits', 'at-bats', 'IP', 'WHIP')
table = {'pitcher_id': list(pitcher_data)}
for column in stat_columns:
    table[column] = [stats[column] for stats in pitcher_data.values()]
data = pd.DataFrame(table)

In [21]:
# Write the pitcher table to pitcher_data.csv in the working directory.
# NOTE(review): by default to_csv also writes the row index as an unnamed
# first column; pass index=False if that column is not wanted.
data.to_csv('pitcher_data.csv')