# [CDAF] Atividade 4

## Nome
Nome: Rodrigo Felipe Lima Braz


## Referências
- [1] https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- [2] https://socceraction.readthedocs.io/en/latest/api/generated/socceraction.xthreat.ExpectedThreat.html#socceraction.xthreat.ExpectedThreat
- [3] https://socceraction.readthedocs.io/en/latest/api/generated/socceraction.xthreat.get_successful_move_actions.html#socceraction.xthreat.get_successful_move_actions
- [4] https://socceraction.readthedocs.io/en/latest/documentation/valuing_actions/xT.html

In [2]:
# Importando bibliotecas
from tqdm import tqdm
import numpy as np
import pandas as pd
import socceraction.spadl as spd
from socceraction import xthreat as xt

### LaLiga  p/ SPADL com pré-processamentos

In [3]:
# carregando os eventos
path = r"C:\Users\rodri\Desktop\Atividades\Ciencia_de_dados_futebol\atv4\events_Spain.json"
events = pd.read_json(path_or_buf=path)
events.head(4)

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],3542,"[{'y': 61, 'x': 37}, {'y': 50, 'x': 50}]",2565548,Pass,682,1H,2.994582,85,180864419
1,8,Simple pass,[{'id': 1801}],274435,"[{'y': 50, 'x': 50}, {'y': 30, 'x': 45}]",2565548,Pass,682,1H,3.13702,85,180864418
2,8,Simple pass,[{'id': 1801}],364860,"[{'y': 30, 'x': 45}, {'y': 12, 'x': 38}]",2565548,Pass,682,1H,6.709668,85,180864420
3,8,Simple pass,[{'id': 1801}],3534,"[{'y': 12, 'x': 38}, {'y': 69, 'x': 32}]",2565548,Pass,682,1H,8.805497,85,180864421


In [4]:
# pré processamento em colunas da tabela de eventos para facilitar a conversão p/ SPADL
events = events.rename(columns={'id': 'event_id', 'eventId': 'type_id', 'subEventId': 'subtype_id',
                                'teamId': 'team_id', 'playerId': 'player_id', 'matchId': 'game_id'})
events['milliseconds'] = events['eventSec'] * 1000
events['period_id'] = events['matchPeriod'].replace({'1H': 1, '2H': 2})

In [5]:
# carregando as partidas, pois vamos saber quais times jogam em casa e fora p/ usar como parametro do SPADL
path = r"C:\Users\rodri\Desktop\Atividades\Ciencia_de_dados_futebol\atv4\matches_Spain.json"
matches = pd.read_json(path_or_buf=path)
matches.head(4)

Unnamed: 0,status,roundId,gameweek,teamsData,seasonId,dateutc,winner,venue,wyId,label,date,referees,duration,competitionId
0,Played,4406122,38,"{'676': {'scoreET': 0, 'coachId': 92894, 'side...",181144,2018-05-20 18:45:00,676,Camp Nou,2565922,"Barcelona - Real Sociedad, 1 - 0","May 20, 2018 at 8:45:00 PM GMT+2","[{'refereeId': 398931, 'role': 'referee'}, {'r...",Regular,795
1,Played,4406122,38,"{'679': {'scoreET': 0, 'coachId': 3427, 'side'...",181144,2018-05-20 16:30:00,0,Estadio Wanda Metropolitano,2565925,"Atl\u00e9tico Madrid - Eibar, 2 - 2","May 20, 2018 at 6:30:00 PM GMT+2","[{'refereeId': 395056, 'role': 'referee'}, {'r...",Regular,795
2,Played,4406122,38,"{'691': {'scoreET': 0, 'coachId': 444778, 'sid...",181144,2018-05-20 14:15:00,691,San Mam\u00e9s Barria,2565919,"Athletic Club - Espanyol, 0 - 1","May 20, 2018 at 4:15:00 PM GMT+2","[{'refereeId': 384957, 'role': 'referee'}, {'r...",Regular,795
3,Played,4406122,38,"{'674': {'scoreET': 0, 'coachId': 210074, 'sid...",181144,2018-05-20 10:00:00,674,Estadio de Mestalla,2565924,"Valencia - Deportivo La Coru\u00f1a, 2 - 1","May 20, 2018 at 12:00:00 PM GMT+2","[{'refereeId': 398913, 'role': 'referee'}, {'r...",Regular,795


In [6]:
# as informações dos times de cada partida estão em um dicionário dentro da coluna 'teamsData', então vamos separar essas informações
team_matches = []
for i in tqdm(range(len(matches))):
    match = pd.DataFrame(matches.loc[i, 'teamsData']).T
    match['matchId'] = matches.loc[i, 'wyId']
    team_matches.append(match)
team_matches = pd.concat(team_matches).reset_index(drop=True)

100%|██████████| 380/380 [00:00<00:00, 1384.22it/s]


In [7]:
# fazendo a conversão p/ SPADL, padronizando a direção de jogo da esquerda p/ a direita e adicionando os nomes dos tipos de ações
spadl = []
game_ids = events.game_id.unique().tolist()
for g in tqdm(game_ids):
    match_events = events.loc[events.game_id == g]
    match_home_id = team_matches.loc[(team_matches.matchId == g) & (team_matches.side == 'home'), 'teamId'].values[0]
    match_actions = spd.wyscout.convert_to_actions(events=match_events, home_team_id=match_home_id)
    match_actions = spd.play_left_to_right(actions=match_actions, home_team_id=match_home_id)
    match_actions = spd.add_names(match_actions)
    spadl.append(match_actions)
spadl = pd.concat(spadl).reset_index(drop=True)

100%|██████████| 380/380 [02:16<00:00,  2.79it/s]


In [8]:
# adicionando o nome dos jogadores
path = r"C:\Users\rodri\Desktop\Atividades\Ciencia_de_dados_futebol\atv4\players.json"
players = pd.read_json(path_or_buf=path)
players['player_name'] = players['firstName'] + ' ' + players['lastName']
players = players[['wyId', 'player_name']].rename(columns={'wyId': 'player_id'})
spadl = spadl.merge(players, on='player_id', how='left')
spadl.head(4)

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,original_event_id,bodypart_id,type_id,result_id,action_id,type_name,result_name,bodypart_name,player_name
0,2565548,1,2.994582,682,3542,38.85,26.52,52.5,34.0,180864419,0,0,1,0,pass,success,foot,Manuel Trigueros Mu\u00f1oz
1,2565548,1,3.13702,682,274435,52.5,34.0,47.25,47.6,180864418,0,0,1,1,pass,success,foot,Enes \u00dcnal
2,2565548,1,6.709668,682,364860,47.25,47.6,39.9,59.84,180864420,0,0,1,2,pass,success,foot,Rodrigo Hern\u00e1ndez Cascante
3,2565548,1,8.805497,682,3534,39.9,59.84,33.6,21.08,180864421,0,0,1,3,pass,success,foot,Jaume Vicent Costa Jord\u00e1


## Questão 1
- Crei um dataframe "shots" à partir do dataframe "spadl", contendo apenas os chutes.
- Crie 4 colunas no dataframe "shots" a serem usadas como features de um modelo de xG.
- Justifique a escolha das features.

In [24]:
spadl['type_name'].unique()
shots = spadl[(spadl['type_name'] == 'shot') | (spadl['type_name'] == 'shot_freekick') | (spadl['type_name'] == 'shot_penalty')]

#Usarei o tipo da ação, ou seja, um penalti tem um xG muito elevado, já uma falta, um pouco menor
shots['xG_type'] = shots['type_id']

#Usarei localização de onde começou a ação, para medir o quão perto ou longe do gol o jogador está, também para medir o ângulo
shots['xG_startx'] = shots['start_x']
shots['xG_starty'] = shots['start_y']

#Usarei a parte do corpo, para saber dado o tipo da ação, e a localização, qual parte do corpo é mais efetivo de se usar.
factorized, uniques = pd.factorize(shots['bodypart_name'])
shots['xG_bodypart'] = factorized
shots.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots['xG_type'] = shots['type_id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots['xG_startx'] = shots['start_x']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots['xG_starty'] = shots['start_y']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,original_event_id,...,result_id,action_id,type_name,result_name,bodypart_name,player_name,xG_type,xG_startx,xG_starty,xG_bodypart
20,2565548,1,57.771186,695,225089,97.65,44.88,105.0,34.0,180865315,...,0,20,shot,fail,foot,Jos\u00e9 Luis Morales Nogales,11,97.65,44.88,0
22,2565548,1,60.727239,695,255738,84.0,27.88,84.0,27.88,180864547,...,0,22,shot,fail,foot,Jefferson Andr\u00e9s Lerma Sol\u00eds,11,84.0,27.88,0
93,2565548,1,446.986112,682,37831,92.4,29.24,92.4,29.24,180864486,...,0,93,shot,fail,foot,Carlos Arturo Bacca Ahumada,11,92.4,29.24,0
96,2565548,1,488.929113,682,15214,91.35,23.12,105.0,27.2,180864491,...,0,96,shot,fail,foot,Antonio Rukavina,11,91.35,23.12,0
178,2565548,1,948.872079,695,225089,78.75,40.8,105.0,34.0,180864792,...,0,178,shot,fail,foot,Jos\u00e9 Luis Morales Nogales,11,78.75,40.8,0


## Questão 2
- Crie uma coluna numérica binária "goal" no dataframe "shots" indicando se o chute resultou em gol ou não.
- Use regressão logística [1] p/ treinar (.fit(X_train, y_train)) um modelo de xG usando as features criadas na questão 1.
- Use 70% dos dados para treino e 30% para teste.
- Reporte a acurácia do modelo para os conjuntos de treino (.score(X_train, y_train)) e teste (.score(X_test, y_test)).

In [26]:
shots['result_name'].unique()
shots['goal'] = shots['result_name'].map(lambda x: 1 if x == 'success' else 0)

array(['fail', 'success'], dtype=object)

In [30]:
from sklearn.linear_model import LinearRegression


In [45]:
treino = shots[['xG_type', 'xG_startx', 'xG_starty', 'xG_bodypart', 'goal']]

treino = treino.sample(frac=0.7, random_state=42)
result_treino = treino['goal']

teste = shots.drop(treino.index)
result_teste = teste['goal']

treino.drop(columns = ['goal'], inplace= True, axis = 1)
teste.drop(columns = ['goal'], inplace= True, axis = 1)

teste = teste[['xG_type', 'xG_startx', 'xG_starty', 'xG_bodypart']]


Unnamed: 0,xG_type,xG_startx,xG_starty,xG_bodypart
22,11,84.00,27.88,0
93,11,92.40,29.24,0
96,11,91.35,23.12,0
178,11,78.75,40.80,0
196,11,81.90,21.08,0
...,...,...,...,...
473214,13,82.95,52.36,0
473284,11,70.35,36.72,0
473298,11,94.50,12.24,0
473828,13,96.60,11.56,0


In [46]:
model = LinearRegression()
model.fit(treino, result_treino)


In [47]:
model.score(treino, result_treino)

0.05683148587372733

In [49]:
model.score(teste, result_teste)

0.06709681583896376

## Questão 3
- Use o modelo treinado na questão 2 p/ prever a probabilidade de gol de todos os chutes do dataframe "shots". Reporte essas probabilidades no dataframe "shots" em uma coluna "xG".
- Agrupe o dataframe "shots" por "player_name" e reporte a soma dos "goal" e "xG".
- Reporte os 10 jogadores com maior xG.
- Reporte os 10 jogadores com maior diferença de Gols e xG.

In [57]:
y = model.predict(shots[['xG_type', 'xG_startx', 'xG_starty', 'xG_bodypart']])
y

In [62]:
shots['xG'] = y
shots

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots['xG'] = y


Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,original_event_id,...,type_name,result_name,bodypart_name,player_name,xG_type,xG_startx,xG_starty,xG_bodypart,goal,xG
20,2565548,1,57.771186,695,225089,97.65,44.88,105.0,34.00,180865315,...,shot,fail,foot,Jos\u00e9 Luis Morales Nogales,11,97.65,44.88,0,0,0.201301
22,2565548,1,60.727239,695,255738,84.00,27.88,84.0,27.88,180864547,...,shot,fail,foot,Jefferson Andr\u00e9s Lerma Sol\u00eds,11,84.00,27.88,0,0,0.072689
93,2565548,1,446.986112,682,37831,92.40,29.24,92.4,29.24,180864486,...,shot,fail,foot,Carlos Arturo Bacca Ahumada,11,92.40,29.24,0,0,0.153823
96,2565548,1,488.929113,682,15214,91.35,23.12,105.0,27.20,180864491,...,shot,fail,foot,Antonio Rukavina,11,91.35,23.12,0,0,0.144981
178,2565548,1,948.872079,695,225089,78.75,40.80,105.0,34.00,180864792,...,shot,fail,foot,Jos\u00e9 Luis Morales Nogales,11,78.75,40.80,0,0,0.018972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473673,2565927,2,1944.188119,682,267134,94.50,46.24,94.5,46.24,253302329,...,shot,fail,foot,Roger Beyker Mart\u00ednez Tobinson,11,94.50,46.24,0,0,0.170467
473806,2565927,2,2385.837008,682,134174,96.60,51.00,105.0,37.40,253302547,...,shot,success,foot,Samuel Castillejo Azuaga,11,96.60,51.00,0,1,0.189785
473828,2565927,2,2672.823612,682,134174,96.60,11.56,96.6,11.56,253302585,...,shot_freekick,fail,foot,Samuel Castillejo Azuaga,13,96.60,11.56,0,0,0.294652
473851,2565927,2,2722.835144,675,3321,86.10,47.60,105.0,30.60,253302642,...,shot,fail,foot,Karim Benzema,11,86.10,47.60,0,0,0.088739


In [66]:
player_xG = shots.groupby('player_name')[['goal', 'xG']].sum()
player_xG = player_xG.sort_values(by='xG', ascending=False)
#10 jogadores com maior xD
player_xG.head(10)

Unnamed: 0_level_0,goal,xG
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Lionel Andr\u00e9s Messi Cuccittini,34,26.356843
Cristiano Ronaldo dos Santos Aveiro,26,25.668567
Luis Alberto Su\u00e1rez D\u00edaz,25,20.31133
Gerard Moreno Balaguero,16,14.540626
Iago Aspas Juncal,22,13.607845
I\u00f1aki Williams Arthuer,7,11.548885
Maximiliano G\u00f3mez Gonz\u00e1lez,18,11.461453
Jonathan Calleri,9,11.283987
Rodrigo Moreno Machado,16,10.949209
Ra\u00fal Garc\u00eda Escudero,10,10.339416


In [67]:
player_xG['dif'] = player_xG['goal'] - player_xG['xG']


Unnamed: 0_level_0,goal,xG,dif
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lionel Andr\u00e9s Messi Cuccittini,34,26.356843,7.643157
Cristiano Ronaldo dos Santos Aveiro,26,25.668567,0.331433
Luis Alberto Su\u00e1rez D\u00edaz,25,20.311330,4.688670
Gerard Moreno Balaguero,16,14.540626,1.459374
Iago Aspas Juncal,22,13.607845,8.392155
...,...,...,...
Emanuel Cecchini,0,-0.020407,0.020407
Alin Dorinel To\u0219ca,0,-0.024567,0.024567
Gabriel Mart\u00edn Pe\u00f1alba,0,-0.041854,0.041854
Rub\u00e9n Pardo Guti\u00e9rrez,0,-0.052875,0.052875


In [68]:
#10 jogadores com maior dif de gol / xG
player_xG = player_xG.sort_values(by='dif', ascending=False)
player_xG.head(10)

Unnamed: 0_level_0,goal,xG,dif
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cristhian Ricardo Stuani Curbelo,21,9.271841,11.728159
Antoine Griezmann,19,9.364934,9.635066
Iago Aspas Juncal,22,13.607845,8.392155
Lionel Andr\u00e9s Messi Cuccittini,34,26.356843,7.643157
Maximiliano G\u00f3mez Gonz\u00e1lez,18,11.461453,6.538547
Gareth Frank Bale,16,9.559291,6.440709
Santiago Mina Lorenzo,12,6.577333,5.422667
Willian Jos\u00e9 da Silva,15,9.743507,5.256493
Enis Bardhi,9,3.856347,5.143653
Rodrigo Moreno Machado,16,10.949209,5.050791


## Questão 4 [4]
- Instancie um objeto ExpectedThreat [2] com parâmetros l=25 e w=16.
- Faça o fit do modelo ExpectedThreat com o dataframe "spadl".

# iterations:  1


<socceraction.xthreat.ExpectedThreat at 0x14c318c0c40>

## Questão 5
- Crie um dataframe "prog_actions" à partir do dataframe "spadl", contendo apenas as ações de progressão e que são bem-sucedidas [3].
- Use o método rate do objeto ExpectedThreat p/ calcular o valor de cada ação de progressão do dataframe "prog_actions", em uma coluna chamada "action_value".
- Agrupe o dataframe "prog_actions" por "player_name" e reporte a soma dos "action_value".
- Reporte os 10 jogadores com maior "action_value".

In [81]:
prog_actions = xt.get_successful_move_actions(spadl)
prog_actions

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,original_event_id,bodypart_id,type_id,result_id,action_id,type_name,result_name,bodypart_name,player_name
0,2565548,1,2.994582,682,3542,38.85,26.52,52.50,34.00,180864419,0,0,1,0,pass,success,foot,Manuel Trigueros Mu\u00f1oz
1,2565548,1,3.137020,682,274435,52.50,34.00,47.25,47.60,180864418,0,0,1,1,pass,success,foot,Enes \u00dcnal
2,2565548,1,6.709668,682,364860,47.25,47.60,39.90,59.84,180864420,0,0,1,2,pass,success,foot,Rodrigo Hern\u00e1ndez Cascante
3,2565548,1,8.805497,682,3534,39.90,59.84,33.60,21.08,180864421,0,0,1,3,pass,success,foot,Jaume Vicent Costa Jord\u00e1
4,2565548,1,14.047492,682,3695,33.60,21.08,32.55,42.84,180864422,0,0,1,4,pass,success,foot,\u00c1lvaro Gonz\u00e1lez Sober\u00f3n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473887,2565927,2,2931.782904,682,3695,37.80,44.88,47.25,32.64,253302665,0,0,1,1480,pass,success,foot,\u00c1lvaro Gonz\u00e1lez Sober\u00f3n
473888,2565927,2,2932.188168,682,20623,47.25,32.64,69.30,51.00,253302667,0,21,1,1481,dribble,success,foot,Roberto Soriano
473889,2565927,2,2939.077491,682,20623,69.30,51.00,92.40,66.64,253302671,0,0,1,1482,pass,success,foot,Roberto Soriano
473890,2565927,2,2940.515560,682,122832,92.40,66.64,101.85,53.72,253302673,0,21,1,1483,dribble,success,foot,Salem Mohammed Al Dawsari


In [109]:
def correct_error(a, b):
    return np.nan_to_num(a / b)

xt._safe_divide = correct_error


In [110]:
#prog_actions['action_value']
xts = xt.ExpectedThreat(l = 25, w =16)




In [111]:
xts.fit(spadl)


  return np.nan_to_num(a / b)


# iterations:  30


<socceraction.xthreat.ExpectedThreat at 0x14c2e722dc0>

In [114]:
actions =  xts.rate(prog_actions)
prog_actions['action_value'] = actions
prog_actions.head(6)

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,original_event_id,bodypart_id,type_id,result_id,action_id,type_name,result_name,bodypart_name,player_name,action_value
0,2565548,1,2.994582,682,3542,38.85,26.52,52.5,34.0,180864419,0,0,1,0,pass,success,foot,Manuel Trigueros Mu\u00f1oz,0.001583
1,2565548,1,3.13702,682,274435,52.5,34.0,47.25,47.6,180864418,0,0,1,1,pass,success,foot,Enes \u00dcnal,-0.000136
2,2565548,1,6.709668,682,364860,47.25,47.6,39.9,59.84,180864420,0,0,1,2,pass,success,foot,Rodrigo Hern\u00e1ndez Cascante,-0.00324
3,2565548,1,8.805497,682,3534,39.9,59.84,33.6,21.08,180864421,0,0,1,3,pass,success,foot,Jaume Vicent Costa Jord\u00e1,-6.9e-05
4,2565548,1,14.047492,682,3695,33.6,21.08,32.55,42.84,180864422,0,0,1,4,pass,success,foot,\u00c1lvaro Gonz\u00e1lez Sober\u00f3n,0.000376
5,2565548,1,18.480831,682,3277,32.55,42.84,38.85,11.56,180864423,0,0,1,5,pass,success,foot,V\u00edctor Ru\u00edz Torre,0.000483


In [116]:
player_actions = prog_actions.groupby('player_name')[['action_value']].sum()
player_actions.head(5)

Unnamed: 0_level_0,action_value
player_name,Unnamed: 1_level_1
A\u00efssa Mandi,3.398481
Aar\u00f3n Mart\u00edn Caricol,5.578629
Achraf Hakimi Mouh,1.369938
Adalberto Pe\u00f1aranda Maestre,0.25166
Adnan Januzaj,4.14267


In [117]:
#10 Jogadores com mais action values, que engraçado, Messi dnv...rs
player_actions = player_actions.sort_values(by='action_value', ascending=False)
player_actions.head(10)


Unnamed: 0_level_0,action_value
player_name,Unnamed: 1_level_1
Lionel Andr\u00e9s Messi Cuccittini,10.650189
Marcelo Vieira da Silva J\u00fanior,10.264535
\u00c1lvaro Odriozola Arzallus,8.708854
Jos\u00e9 Luis Morales Nogales,7.81904
Hugo Mallo Novegil,7.431915
Juan Francisco Moreno Fuertes,7.281309
\u00c9ver Maximiliano David Banega,7.01516
Lucas V\u00e1zquez Iglesias,6.908507
Jordi Alba Ramos,6.824937
Jos\u00e9 Luis Gay\u00e1 Pe\u00f1a,6.81135
