In [40]:
import pandas as pd
import os

from formatacao_sumulas import FootballDataProcessor
from algorithm import FootballMatchRating
import numpy as np
import warnings
warnings.filterwarnings("ignore")


id_players = pd.read_excel('id_players.xlsx', index_col=0)


data_folder = '../Data/'


In [41]:
fix_id_players = pd.read_csv('players_ratings.xlsx')

In [42]:
fix_id_players = fix_id_players[['player', 'rating']]

In [43]:
import pandas as pd
import numpy as np
from scipy.stats import skellam
from scipy.optimize import fsolve

def calculate_team_rating(sub_df):
    """Calculate the weighted average rating of players on the field during a subgame."""
    return sub_df['rating'].mean()

def calculate_skellam_parameters(rating_home, rating_away, p_alvo=0.75):
    """Calculate the parameters (mu1, mu2) for the Skellam distribution based on team ratings."""
    soma = 3  # Define a fixed sum of expected goals (adjustable if needed)
    p_alvo = 1/(1+10**((rating_away-rating_home)/400))

    def equation(mu2, p_alvo):
        mu1 = soma - mu2
        p = 1 - skellam.cdf(0, mu1, mu2) + skellam.pmf(0, mu1, mu2) / 2
        return p - p_alvo

    mu2_inicial = 0.9  # Initial guess for mu2
    mu2 = fsolve(equation, mu2_inicial, args=(p_alvo))[0]
    mu1 = soma - mu2
    return mu1, mu2

def simulate_goals(E_home, E_away, duration_subgame):
    """Simulate goals for home and away teams using Skellam trials based on expected scores."""
    goals_home = np.random.poisson(E_home * duration_subgame/90, 1) 
    goals_away = np.random.poisson(E_away * duration_subgame/90, 1) 

    return goals_home, goals_away

def process_game(df):
    """Process an entire game to calculate goals for and against for each player based on subgames."""
    
    # Step 1: Identify subgames based on substitutions
    home_df = df[df['status'] == 'Home']
    away_df = df[df['status'] == 'Away']

    # Create a list of all substitution times for each team (including the start and end of the match)
    sub_times_home = sorted(set(home_df['Minute Entered'].tolist() + home_df['Minute Exited'].tolist() + [0, 90]))
    sub_times_away = sorted(set(away_df['Minute Entered'].tolist() + away_df['Minute Exited'].tolist() + [0, 90]))
    
    # We will look at every interval (subgame) from one substitution to the next for both teams
    subgame_intervals = sorted(set(sub_times_home + sub_times_away))

    # Initialize columns for tracking goals for and against for each player.
    df['Goals For'] = 0
    df['Goals Against'] = 0

    # Loop over each subgame interval
    for i in range(1, len(subgame_intervals)):
        start_minute = subgame_intervals[i - 1]
        end_minute = subgame_intervals[i]

        # Calculate duration of the subgame (in minutes)
        duration_subgame = end_minute - start_minute

        # Filter players who were on the field during this interval for both teams
        home_on_field = home_df[(home_df['Minute Entered'] <= start_minute) & (home_df['Minute Exited'] >= end_minute)]
        away_on_field = away_df[(away_df['Minute Entered'] <= start_minute) & (away_df['Minute Exited'] >= end_minute)]

        # Calculate team ratings for this subgame
        home_rating_subgame = calculate_team_rating(home_on_field)
        away_rating_subgame = calculate_team_rating(away_on_field)

        # Calculate Skellam parameters for this subgame
        mu1, mu2 = calculate_skellam_parameters(home_rating_subgame, away_rating_subgame)

        # Simulate goals for this subgame, adjusted by the duration of the subgame
        goals_home_subgame, goals_away_subgame = simulate_goals(mu1, mu2, duration_subgame)

        # Step 3: Assign goals to players based on their presence in the subgame
        # Update 'Goals For' and 'Goals Against' for players in Home team
        df.loc[df['player'].isin(home_on_field['player']), 'Goals For'] += goals_home_subgame
        df.loc[df['player'].isin(home_on_field['player']), 'Goals Against'] += goals_away_subgame

        # Update 'Goals For' and 'Goals Against' for players in Away team
        df.loc[df['player'].isin(away_on_field['player']), 'Goals For'] += goals_away_subgame
        df.loc[df['player'].isin(away_on_field['player']), 'Goals Against'] += goals_home_subgame

    return df


In [44]:
import pandas as pd
import numpy as np
from scipy.stats import skellam
from scipy.optimize import fsolve

def calculate_team_rating(sub_df):
    """Calculate the weighted average rating of players on the field during a subgame."""
    return sub_df['rating'].mean()

def calculate_skellam_parameters(rating_home, rating_away, p_alvo=0.75):
    """Calculate the parameters (mu1, mu2) for the Skellam distribution based on team ratings."""
    soma = 3  # Define a fixed sum of expected goals (adjustable if needed)
    p_alvo = 1/(1+10**((rating_away-rating_home)/400))

    def equation(mu2, p_alvo):
        mu1 = soma - mu2
        p = 1 - skellam.cdf(0, mu1, mu2) + skellam.pmf(0, mu1, mu2) / 2
        return p - p_alvo

    mu2_inicial = 0.9  # Initial guess for mu2
    mu2 = fsolve(equation, mu2_inicial, args=(p_alvo))[0]
    mu1 = soma - mu2
    return mu1, mu2

def simulate_goals(E_home, E_away, duration_subgame, taxa_at_home, taxa_at_away):
    E_home_adjusted = E_home * taxa_at_home
    E_away_adjusted = E_away * taxa_at_away
    # Simula os gols usando a distribuição Poisson com as taxas ajustadas
    goals_home = np.random.poisson(E_home_adjusted, 1)
    goals_away = np.random.poisson(E_away_adjusted, 1)

    return goals_home, goals_away

def process_game(df):
    """Process an entire game to calculate goals for and against for each player based on subgames."""
    
    # Step 1: Identify subgames based on substitutions
    home_df = df[df['status'] == 'Home']
    away_df = df[df['status'] == 'Away']
    
    # Step 2: Calculate team ratings for the entire game (no subgame division)
    home_rating_game = calculate_team_rating(home_df)
    away_rating_game = calculate_team_rating(away_df)   
    mu1, mu2 = calculate_skellam_parameters(home_rating_game, away_rating_game) 

    # Create a list of all substitution times for each team (including the start and end of the match)
    sub_times_home = sorted(set(home_df['Minute Entered'].tolist() + home_df['Minute Exited'].tolist() + [0, 90]))
    sub_times_away = sorted(set(away_df['Minute Entered'].tolist() + away_df['Minute Exited'].tolist() + [0, 90]))
    
    # We will look at every interval (subgame) from one substitution to the next for both teams
    subgame_intervals = sorted(set(sub_times_home + sub_times_away))

    # Initialize columns for tracking goals for and against for each player.
    df['Goals For'] = 0
    df['Goals Against'] = 0

    # Loop over each subgame interval
    for i in range(1, len(subgame_intervals)):
        start_minute = subgame_intervals[i - 1]
        end_minute = subgame_intervals[i]

        # Calculate duration of the subgame (in minutes)
        duration_subgame = end_minute - start_minute

        # Filter players who were on the field during this interval for both teams
        home_on_field = home_df[(home_df['Minute Entered'] <= start_minute) & (home_df['Minute Exited'] >= end_minute)]
        away_on_field = away_df[(away_df['Minute Entered'] <= start_minute) & (away_df['Minute Exited'] >= end_minute)]
        
         # Filter players who were on the field during this interval for both teams
        home_on_field = home_df[(home_df['Minute Entered'] <= start_minute) & (home_df['Minute Exited'] >= end_minute)]
        away_on_field = away_df[(away_df['Minute Entered'] <= start_minute) & (away_df['Minute Exited'] >= end_minute)]
        
        taxa_at_home = home_on_field['rating'].mean()/home_rating_game
        taxa_at_away = away_on_field['rating'].mean()/away_rating_game


        # Simulate goals for this subgame, adjusted by the duration of the subgame
        goals_home_subgame, goals_away_subgame = simulate_goals(mu1, mu2, duration_subgame, taxa_at_home, taxa_at_away)

        # Step 3: Assign goals to players based on their presence in the subgame
        # Update 'Goals For' and 'Goals Against' for players in Home team
        df.loc[df['player'].isin(home_on_field['player']), 'Goals For'] += goals_home_subgame
        df.loc[df['player'].isin(home_on_field['player']), 'Goals Against'] += goals_away_subgame

        # Update 'Goals For' and 'Goals Against' for players in Away team
        df.loc[df['player'].isin(away_on_field['player']), 'Goals For'] += goals_away_subgame
        df.loc[df['player'].isin(away_on_field['player']), 'Goals Against'] += goals_home_subgame

    return df


In [45]:
years = range(2013, 2015)
campeonatos = ['Serie_A']

for year in years:
    for campeonato in campeonatos:
        print(year, campeonato)
        df_jogos_2014 = pd.read_json(f'../Data/{campeonato}_{year}_games.json')
        df_jogos_2014_ = df_jogos_2014.T
        df_jogos_2014_['id_game'] = range(1, len(df_jogos_2014_) + 1)

        i=0
        for index, row in df_jogos_2014_.iterrows():
            if index-1 < len(df_jogos_2014_):
                processor = FootballDataProcessor(df_jogos_2014_, index-1)
                final_df = processor.process(data_folder, campeonato, year)
                final_df.rename(columns={'player_id': 'player'}, inplace=True)
                final_df['id_game'] = i+1
                final_df['player'] = final_df['player'].astype('string')
                fix_id_players['player'] = fix_id_players['player'].astype('string')
                final_df = pd.merge(final_df, fix_id_players, on='player', how='left')
                final_df = process_game(final_df)
                
                final_df = final_df.drop(columns=['rating'])
            
                
                id_players['player'] = id_players['player'].astype('string')

                final_df_ = pd.merge(final_df, id_players, on='player', how='left')
                


                team_1_name = df_jogos_2014_.iloc[index-1, 0]
                team_2_name = df_jogos_2014_.iloc[index-1, 1]

                # Criar uma instância de FootballMatchRating
                rating_calculator = FootballMatchRating(final_df_, team_1_name, team_2_name)

                updated_match_data = rating_calculator.update_ratings()

                teams_over_9_players = updated_match_data[updated_match_data['Minutes Played'] == 90] \
                                        .groupby('team').filter(lambda x: len(x) > 10)

                updated_match_data['player'] = updated_match_data['player'].astype('string')

                updated_match_data.rename(columns={'player_name_x': 'player_name'}, inplace=True)
                updated_match_data.rename(columns={'time_jogador_x': 'time_jogador'}, inplace=True)

                updated_values = id_players[['player']].merge(
                    updated_match_data[['player', 'rating', 'games_played', 'player_name', 'time_jogador']],
                    on='player',
                    how='left'
                )

                # Atualizar apenas as linhas corespondentes em id_players

                id_players.update(updated_values)
            else:
                print(f"Índice {index-1} fora dos limites para o ano {year}")
                continue


id_players


2013 Serie_A
2014 Serie_A


Unnamed: 0,player,rating,age,games_played,player_name,time_jogador
0,293092,1476.71312,25,54,21Mailson Mailson Francisco de ...,Chapecoense / SC
1,554797,1500.00000,25,50,,
2,413816,1500.00000,25,50,,
3,508147,1500.00000,25,50,,
4,584599,1500.00000,25,50,,
...,...,...,...,...,...,...
3082,346555,1500.00000,25,50,,
3083,299156,1500.00000,25,50,,
3084,319981,1500.00000,25,50,,
3085,693536,1500.00000,25,50,,


In [46]:
final_df


Unnamed: 0,nome_jogador,team,player,player_name,Minutes Played,Minute Entered,Minute Exited,status,Goals For,Goals Against,time_jogador,id_game
0,1Renan Renan Brito Soares T(g)P155703,Goiás / GO,155703,1Renan Renan Brito Soares,90,0,90,Home,7,10,Goiás / GO,1
1,2Tiago Real Tiago Real do Prado TP176512,Goiás / GO,176512,2Tiago Real,90,0,90,Home,7,10,Goiás / GO,1
2,3Felipe Felipe Francisco Macedo TP303256,Goiás / GO,303256,3Felipe Felipe Francisco Macedo,90,0,90,Home,7,10,Goiás / GO,1
3,4Pedro Pedro Henrique Perei ... TP338777,Goiás / GO,338777,4Pedro Pedro Henrique Perei ...,90,0,90,Home,7,10,Goiás / GO,1
4,5Rodrigo Rodrigo Vasconcelos ... TP302774,Goiás / GO,302774,5Rodrigo Rodrigo Vasconcelos ...,90,0,90,Home,7,10,Goiás / GO,1
5,6Felipe Felipe Saturnino Gomes TP347608,Goiás / GO,347608,6Felipe Felipe Saturnino Gomes,18,72,90,Home,4,4,Goiás / GO,1
6,7Ramon Ramon Rodrigo de Freitas TP152825,Goiás / GO,152825,7Ramon Ramon Rodrigo de Freitas,26,64,90,Home,5,8,Goiás / GO,1
7,8David França David Franca Oliveir ... TP150731,Goiás / GO,150731,8David França David Franca Oliveir ...,90,0,90,Home,7,10,Goiás / GO,1
8,9Welinton Welinton Junior Ferr ... TP321011,Goiás / GO,321011,9Welinton Welinton Junior Ferr ...,9,81,90,Home,3,3,Goiás / GO,1
9,10Esquerdinha Rubens Raimundo da Silva TP298531,Goiás / GO,298531,10Esquerdinha Rubens Raimundo da Silva,90,0,90,Home,7,10,Goiás / GO,1


In [47]:
#ordem
x = id_players.sort_values(by='rating', ascending=False)
x.head(20)

Unnamed: 0,player,rating,age,games_played,player_name,time_jogador
154,183840,1618.427385,25,83,6Marlon Marlon Farias Castel...,Criciúma / SC
1531,335342,1609.62545,25,84,22Willie Willie Hortencio Barbosa,Vitória / BA
2534,176375,1608.540751,25,85,7Lins Lins Lima de Brito,Criciúma / SC
568,153784,1605.786342,25,69,1Galatto Rodrigo Jose Galatto,Criciúma / SC
558,141516,1604.224477,25,105,4Durval Severino dos Ramos D ...,Sport / PE
262,311439,1599.759542,25,65,20Mike Mike dos Santos Nena ...,Sport / PE
1067,189538,1594.240328,25,83,90Felipe Felipe de Oliveira Silva,Figueirense / SC
1393,156045,1589.4315,25,75,21Ygor Ygor Maciel Santiago,Internacional / RS
656,175475,1586.972588,25,82,3Matheus Matheus Ferraz Pereira,Criciúma / SC
1924,175445,1585.036581,25,65,87Giovanni Giovanni Aparecido A...,Atlético Mineiro / MG


In [48]:
fix_id_players.sort_values(by='rating', ascending=False)

Unnamed: 0,player,rating
0,129292,1666.369708
1,163518,1666.050405
2,191616,1649.769421
3,166647,1649.430858
4,177749,1646.468489
...,...,...
3082,166491,1365.447827
3083,173742,1365.355759
3084,135462,1342.602490
3085,177998,1336.815655


In [49]:
# ver quantos do 20 jogadores de maior rating em x estão em fix_id_players
x.head(100)['player'].isin(fix_id_players['player'].head(100)).value_counts()

player
False    92
True      8
Name: count, dtype: int64

In [50]:
# unir os dados por player e calcular o coeficiente de correlação de spearman
import numpy as np
from scipy.stats import spearmanr

df_ = pd.merge(fix_id_players.sort_values(by='rating', ascending=False), id_players.sort_values(by='rating', ascending=False), on='player', how='left')

# remover os valores nulos
df_ = df_.dropna()

coeficiente, p_valor = spearmanr(df_['rating_x'], df_['rating_y'])


print(f"Coeficiente de Correlação de Spearman: {coeficiente}")
print(f"Valor-p: {p_valor}")

Coeficiente de Correlação de Spearman: -0.0186859786900387
Valor-p: 0.5447193127261576


In [51]:
# unir os dados por player e calcular o coeficiente de correlação de spearman
import numpy as np
from scipy.stats import spearmanr

df_ = pd.merge(fix_id_players.sort_values(by='rating', ascending=False).head(500), id_players.sort_values(by='rating', ascending=False).head(500), on='player', how='left')

# remover os valores nulos
df_ = df_.dropna()

coeficiente, p_valor = spearmanr(df_['rating_x'], df_['rating_y'])

print(f"Coeficiente de Correlação de Spearman: {coeficiente}")
print(f"Valor-p: {p_valor}")

Coeficiente de Correlação de Spearman: 0.2086310256320101
Valor-p: 0.0011777197889312277


In [52]:
import numpy as np
from scipy.optimize import fsolve
from scipy.stats import skellam

def equation(mu2, p_alvo):
    soma = 3.0
    mu1 = soma - mu2
    # p é a probabilidade do time 1 ganhar mais metade da probabilidade de empate.
    p = 1-skellam.cdf(0, mu1, mu2) + skellam.pmf(0, mu1, mu2) / 2
    return p - p_alvo

p_alvo = 0.1
mu2_inicial = 0.8

# Resolvendo a equação para encontrar o parâmetro mu2
mu2 = fsolve(equation, mu2_inicial, args=(p_alvo))
# Gerando uma amostra da distribuição Skellam só para ver como fica.
sample = skellam.rvs(3-mu2, mu2, size=10)

print("Mu2:", mu2)
print("Sample:", sample)
print("prob obtido: ", 1-skellam.cdf(0, 3-mu2, mu2) + skellam.pmf(0, 3-mu2, mu2) / 2, '\n')

# para produzir um placar que respeite a probabilidade alvo
mu1 = 3 - mu2
print("Mu1:", mu1)

gols1 = np.random.poisson(mu1, 10)

gols2 = np.random.poisson(mu2, 10)
print("Gols1:", gols1)
print("Gols2:", gols2, '\n')

# Testando se os placares correspondem às probabilidades alvo
gols1 = np.random.poisson(mu1, 10000)
gols2 = np.random.poisson(mu2, 10000)
print("Probabilidade de time 1 ganhar ou empatar:", np.mean(gols1 > gols2)+np.mean(gols1 == gols2)/2)


Mu2: [2.58699163]
Sample: [ 1 -3 -3 -5  0 -3 -1  2 -3 -3]
prob obtido:  [0.1] 

Mu1: [0.41300837]
Gols1: [1 3 0 1 0 0 0 0 0 0]
Gols2: [2 4 4 1 2 6 3 2 6 2] 

Probabilidade de time 1 ganhar ou empatar: 0.0993
