In [93]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [94]:
df =  pd.read_csv("results.csv")

In [95]:
df["date"] = pd.to_datetime(df["date"])

In [96]:
df.isna().sum()

date          0
home_team     0
away_team     0
home_score    0
away_score    0
tournament    0
city          0
country       0
neutral       0
dtype: int64

In [97]:
df.sort_values("date").tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
44336,2022-12-04,England,Senegal,3,0,FIFA World Cup,Al Khor,Qatar,True
44337,2022-12-05,Japan,Croatia,1,1,FIFA World Cup,Al Wakrah,Qatar,True
44338,2022-12-05,Brazil,South Korea,4,1,FIFA World Cup,Doha,Qatar,True
44339,2022-12-06,Morocco,Spain,0,0,FIFA World Cup,Al Rayyan,Qatar,True
44340,2022-12-06,Portugal,Switzerland,6,1,FIFA World Cup,Lusail,Qatar,True


In [98]:
df = df[(df["date"] >= "2010-8-1")].reset_index(drop=True)

In [99]:
df.home_team.value_counts()

Qatar            128
Mexico           126
United States    126
Japan            114
Bahrain          106
                ... 
Saint Helena       1
Rhodes             1
Romani people      1
Aymara             1
Kabylia            1
Name: home_team, Length: 290, dtype: int64

In [100]:
rankings = pd.read_csv("fifa_ranking-2022-10-06.csv")
rankings.head()

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,1,Germany,GER,57.0,0.0,0,UEFA,1992-12-31
1,96,Syria,SYR,11.0,0.0,0,AFC,1992-12-31
2,97,Burkina Faso,BFA,11.0,0.0,0,CAF,1992-12-31
3,99,Latvia,LVA,10.0,0.0,0,UEFA,1992-12-31
4,100,Burundi,BDI,10.0,0.0,0,CAF,1992-12-31


In [101]:
rankings["rank_date"] = pd.to_datetime(rankings["rank_date"])
rankings = rankings[(rankings["rank_date"] >= "2010-8-1")].reset_index(drop=True)

In [102]:
rankings["country_full"] = rankings["country_full"].str.replace("IR Iran", "Iran").str.replace("Korea Republic", "South Korea").str.replace("USA", "United States")

In [103]:
df_wc_ranked = df.merge(rankings[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]], left_on=["date", "home_team"], right_on=["rank_date", "country_full"]).drop(["rank_date", "country_full"], axis=1)
df_wc_ranked.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,total_points,previous_points,rank,rank_change
0,2010-08-11,Albania,Uzbekistan,1,0,Friendly,Durrës,Albania,False,455.0,455.0,70,-1
1,2010-08-11,Algeria,Gabon,1,2,Friendly,Algiers,Algeria,False,759.0,759.0,33,0
2,2010-08-11,Angola,Uruguay,0,2,Friendly,Lisbon,Portugal,True,357.0,356.0,87,1
3,2010-08-11,Armenia,Iran,1,3,Friendly,Yerevan,Armenia,False,318.0,318.0,96,0
4,2010-08-11,Austria,Switzerland,0,1,Friendly,Klagenfurt,Austria,False,536.0,536.0,60,0


In [104]:
df_wc_ranked = df_wc_ranked.merge(rankings[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]], left_on=["date", "away_team"], right_on=["rank_date", "country_full"], suffixes=("_home", "_away")).drop(["rank_date", "country_full"], axis=1)
df_wc_ranked.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,total_points_home,previous_points_home,rank_home,rank_change_home,total_points_away,previous_points_away,rank_away,rank_change_away
0,2010-08-11,Albania,Uzbekistan,1,0,Friendly,Durrës,Albania,False,455.0,455.0,70,-1,351.0,352.0,88,1
1,2010-08-11,Algeria,Gabon,1,2,Friendly,Algiers,Algeria,False,759.0,759.0,33,0,755.0,755.0,34,0
2,2010-08-11,Angola,Uruguay,0,2,Friendly,Lisbon,Portugal,True,357.0,356.0,87,1,1152.0,1152.0,6,0
3,2010-08-11,Armenia,Iran,1,3,Friendly,Yerevan,Armenia,False,318.0,318.0,96,0,517.0,522.0,65,1
4,2010-08-11,Austria,Switzerland,0,1,Friendly,Klagenfurt,Austria,False,536.0,536.0,60,0,940.0,940.0,17,-1


In [108]:
def result_finder(home, away):
    if home > away:
        return pd.Series([0, 3, 0])
    if home < away:
        return pd.Series([1, 0, 3])
    else:
        return pd.Series([2, 1, 1])

results = df.apply(lambda x: result_finder(x["home_score"], x["away_score"]), axis=1)
df[["result", "home_team_points", "away_team_points"]] = results

In [109]:
df = df_wc_ranked
df["rank_dif"] = df["rank_home"] - df["rank_away"]
df["sg"] = df["home_score"] - df["away_score"]
df["points_home_by_rank"] = df["home_team_points"]/df["rank_away"]
df["points_away_by_rank"] = df["away_team_points"]/df["rank_home"]

In [110]:
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,total_points_home,...,previous_points_away,rank_away,rank_change_away,rank_dif,sg,result,home_team_points,away_team_points,points_home_by_rank,points_away_by_rank
0,2010-08-11,Albania,Uzbekistan,1,0,Friendly,Durrës,Albania,False,455.0,...,352.0,88,1,-18,1,0,3,0,0.034091,0.0
1,2010-08-11,Algeria,Gabon,1,2,Friendly,Algiers,Algeria,False,759.0,...,755.0,34,0,-1,-1,1,0,3,0.0,0.090909
2,2010-08-11,Angola,Uruguay,0,2,Friendly,Lisbon,Portugal,True,357.0,...,1152.0,6,0,81,-2,1,0,3,0.0,0.034483
3,2010-08-11,Armenia,Iran,1,3,Friendly,Yerevan,Armenia,False,318.0,...,522.0,65,1,31,-2,1,0,3,0.0,0.03125
4,2010-08-11,Austria,Switzerland,0,1,Friendly,Klagenfurt,Austria,False,536.0,...,940.0,17,-1,43,-1,1,0,3,0.0,0.05


In [111]:
home_team = df[["date", "home_team", "home_score", "away_score", "rank_home", "rank_away","rank_change_home", "total_points_home", "result", "rank_dif", "points_home_by_rank", "home_team_points"]]

away_team = df[["date", "away_team", "away_score", "home_score", "rank_away", "rank_home","rank_change_away", "total_points_away", "result", "rank_dif", "points_away_by_rank", "away_team_points"]]

In [112]:
home_team.head()

Unnamed: 0,date,home_team,home_score,away_score,rank_home,rank_away,rank_change_home,total_points_home,result,rank_dif,points_home_by_rank,home_team_points
0,2010-08-11,Albania,1,0,70,88,-1,455.0,0,-18,0.034091,3
1,2010-08-11,Algeria,1,2,33,34,0,759.0,1,-1,0.0,0
2,2010-08-11,Angola,0,2,87,6,1,357.0,1,81,0.0,0
3,2010-08-11,Armenia,1,3,96,65,0,318.0,1,31,0.0,0
4,2010-08-11,Austria,0,1,60,17,0,536.0,1,43,0.0,0


In [113]:
away_team.head()

Unnamed: 0,date,away_team,away_score,home_score,rank_away,rank_home,rank_change_away,total_points_away,result,rank_dif,points_away_by_rank,away_team_points
0,2010-08-11,Uzbekistan,0,1,88,70,1,351.0,0,-18,0.0,0
1,2010-08-11,Gabon,2,1,34,33,0,755.0,1,-1,0.090909,3
2,2010-08-11,Uruguay,2,0,6,87,0,1152.0,1,81,0.034483,3
3,2010-08-11,Iran,3,1,65,96,1,517.0,1,31,0.03125,3
4,2010-08-11,Switzerland,1,0,17,60,-1,940.0,1,43,0.05,3


In [114]:
home_team.columns = [h.replace("home_", "").replace("_home", "").replace("away_", "suf_").replace("_away", "_suf") for h in home_team.columns]

away_team.columns = [a.replace("away_", "").replace("_away", "").replace("home_", "suf_").replace("_home", "_suf") for a in away_team.columns]

In [115]:
home_team.head()

Unnamed: 0,date,team,score,suf_score,rank,rank_suf,rank_change,total_points,result,rank_dif,points_by_rank,team_points
0,2010-08-11,Albania,1,0,70,88,-1,455.0,0,-18,0.034091,3
1,2010-08-11,Algeria,1,2,33,34,0,759.0,1,-1,0.0,0
2,2010-08-11,Angola,0,2,87,6,1,357.0,1,81,0.0,0
3,2010-08-11,Armenia,1,3,96,65,0,318.0,1,31,0.0,0
4,2010-08-11,Austria,0,1,60,17,0,536.0,1,43,0.0,0


In [32]:
away_team['team']=='Argentina')

187

In [34]:
#team_stats = home_team.append(away_team)
sum(team_stats['team']=='Argentina')

188

In [35]:
team_stats.shape

(374, 12)

In [36]:
team_stats_raw = team_stats.copy()

Maintenant, nous avons une base de données prête à créer des "predictive features". Il s’agira de :
- la moyenne des buts de l'équipe dans l'histoire du coupe du monde.
- la moyenne des buts de l'équipe dans les cinqs derniére matches.
- la moyenne des buts rentrer à l'équipe dans l'histoire du coupe du monde.
- la moyenne des buts rentrer à l'équipe dans cinqs derniére matches.
- la moyenne de FIFA Rank de l'équipe dans l'histoire du coupe du monde.
- la moyenne de FIFA Rank de l'équipe lors des cinqs derniére matches.
- Points FIFA gagnés lors de la monde coupe du monde.
- Points FIFA gagnés lors cinqs derniére matches.
- Points de jeu moyens au cycle.
- Moyenne des points de jeu lors des 5 derniers matchs.
- Moyenne des points de jeu par rang affrontés au cycle.
- Moyenne des points de jeu par rang affrontés lors des 5 derniers matchs.

In [41]:
team_stats.loc[(team_stats["team"] =='Argentina') & (team_stats["date"] < '2016-12-12')].sort_values(by=['date'], ascending=False)


Unnamed: 0,date,team,score,suf_score,rank,rank_suf,rank_change,total_points,result,rank_dif,points_by_rank,team_points
164,2016-06-02,Argentina,0,5,181,164,3,93.0,0,-17,0.000000,0
163,2016-06-02,Argentina,2,0,32,180,-1,786.0,1,148,0.016667,3
162,2016-06-02,Argentina,2,0,178,174,1,102.0,1,-4,0.017241,3
161,2016-06-02,Argentina,0,3,177,173,2,106.0,0,-4,0.000000,0
160,2016-06-02,Argentina,1,0,163,174,1,151.0,1,11,0.017241,3
...,...,...,...,...,...,...,...,...,...,...,...,...
29,2010-08-11,Argentina,1,2,21,22,0,890.0,0,1,0.000000,0
28,2010-08-11,Argentina,1,2,145,83,-2,110.0,0,-62,0.000000,0
27,2010-08-11,Argentina,0,2,59,73,0,540.0,0,14,0.000000,0
26,2010-08-11,Argentina,0,0,110,89,-1,237.0,2,-21,0.011236,1


In [44]:
stats_val = []

for index, row in team_stats.iterrows():
    team = row["team"]
    date = row["date"]
    past_games = team_stats.loc[(team_stats["team"] == team) & (team_stats["date"] < date)].sort_values(by=['date'], ascending=False)
    last5 = past_games.head(5)
    
    goals = past_games["score"].mean()
    goals_l5 = last5["score"].mean()
    
    goals_suf = past_games["suf_score"].mean()
    goals_suf_l5 = last5["suf_score"].mean()
    
    rank = past_games["rank_suf"].mean()
    rank_l5 = last5["rank_suf"].mean()
    
    if len(last5) > 0:
        points = past_games["total_points"].values[0] - past_games["total_points"].values[-1]#qtd de pontos ganhos
        points_l5 = last5["total_points"].values[0] - last5["total_points"].values[-1] 
    else:
        points = 0
        points_l5 = 0
        
    gp = past_games["team_points"].mean()
    gp_l5 = last5["team_points"].mean()
    
    gp_rank = past_games["points_by_rank"].mean()
    gp_rank_l5 = last5["points_by_rank"].mean()
    
    stats_val.append([goals, goals_l5, goals_suf, goals_suf_l5, rank, rank_l5, points, points_l5, gp, gp_l5, gp_rank, gp_rank_l5])

In [45]:
stats_cols = ["goals_mean", "goals_mean_l5", "goals_suf_mean", "goals_suf_mean_l5", "rank_mean", "rank_mean_l5", "points_mean", "points_mean_l5", "game_points_mean", "game_points_mean_l5", "game_points_rank_mean", "game_points_rank_mean_l5"]

stats_df = pd.DataFrame(stats_val, columns=stats_cols)

full_df = pd.concat([team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False)
#full_df['tournament']
stats_df

Unnamed: 0,goals_mean,goals_mean_l5,goals_suf_mean,goals_suf_mean_l5,rank_mean,rank_mean_l5,points_mean,points_mean_l5,game_points_mean,game_points_mean_l5,game_points_rank_mean,game_points_rank_mean_l5
0,,,,,,,0.00,0.00,,,,
1,,,,,,,0.00,0.00,,,,
2,,,,,,,0.00,0.00,,,,
3,,,,,,,0.00,0.00,,,,
4,,,,,,,0.00,0.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
369,0.819672,0.8,1.721311,2.0,78.158470,56.6,-501.00,902.00,0.950820,1.0,0.027368,0.026934
370,0.819672,0.8,1.721311,2.0,78.158470,56.6,-501.00,902.00,0.950820,1.0,0.027368,0.026934
371,0.819672,0.8,1.721311,2.0,78.158470,56.6,-501.00,902.00,0.950820,1.0,0.027368,0.026934
372,0.827957,1.0,1.720430,2.2,77.768817,57.4,1156.00,-649.00,0.956989,1.0,0.027182,0.026144


In [46]:
full_df.isna().sum()

date                          0
team                          0
score                         0
suf_score                     0
rank                          0
rank_suf                      0
rank_change                   0
total_points                  0
result                        0
rank_dif                      0
points_by_rank                0
team_points                   0
goals_mean                  164
goals_mean_l5               164
goals_suf_mean              164
goals_suf_mean_l5           164
rank_mean                   164
rank_mean_l5                164
points_mean                   0
points_mean_l5                0
game_points_mean            164
game_points_mean_l5         164
game_points_rank_mean       164
game_points_rank_mean_l5    164
dtype: int64

In [47]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(full_df)

          date                  team  score  suf_score  rank  rank_suf  \
0   2010-08-11               Albania      1          0    70        88   
1   2010-08-11               Algeria      1          2    33        34   
2   2010-08-11                Angola      0          2    87         6   
3   2010-08-11               Armenia      1          3    96        65   
4   2010-08-11               Austria      0          1    60        17   
5   2010-08-11            Azerbaijan      1          1   105        85   
6   2010-08-11                 Benin      0          0    61       145   
7   2010-08-11               Bolivia      1          1    50        36   
8   2010-08-11          Burkina Faso      3          0    45       106   
9   2010-08-11                  Chad      1          3   124        64   
10  2010-08-11              China PR      1          1    77        68   
11  2010-08-11                Cyprus      1          0    63       201   
12  2010-08-11        Czech Republic  

NameError: name 'base_df' is not defined

In [49]:
home_team_stats = full_df.iloc[:int(full_df.shape[0]/2),:]
away_team_stats = full_df.iloc[int(full_df.shape[0]/2):,:]

In [50]:
home_team_stats = home_team_stats[home_team_stats.columns[-12:]]
away_team_stats = away_team_stats[away_team_stats.columns[-12:]]

In [51]:
home_team_stats.columns = ['home_'+str(col) for col in home_team_stats.columns]
away_team_stats.columns = ['away_'+str(col) for col in away_team_stats.columns]

In [52]:
match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
full_df = pd.concat([df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False)
full_df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,total_points_home,...,away_goals_suf_mean,away_goals_suf_mean_l5,away_rank_mean,away_rank_mean_l5,away_points_mean,away_points_mean_l5,away_game_points_mean,away_game_points_mean_l5,away_game_points_rank_mean,away_game_points_rank_mean_l5
0,2010-08-11,Albania,Uzbekistan,1,0,Friendly,Durrës,Albania,False,455.0,...,,,,,0.0,0.0,,,,
1,2010-08-11,Algeria,Gabon,1,2,Friendly,Algiers,Algeria,False,759.0,...,,,,,0.0,0.0,,,,
2,2010-08-11,Angola,Uruguay,0,2,Friendly,Lisbon,Portugal,True,357.0,...,,,,,0.0,0.0,,,,
3,2010-08-11,Armenia,Iran,1,3,Friendly,Yerevan,Armenia,False,318.0,...,,,,,0.0,0.0,,,,
4,2010-08-11,Austria,Switzerland,0,1,Friendly,Klagenfurt,Austria,False,536.0,...,,,,,0.0,0.0,,,,


In [53]:
full_df["tournament"]

0                   Friendly
1                   Friendly
2                   Friendly
3                   Friendly
4                   Friendly
               ...          
182                 Friendly
183                 Friendly
184             Copa América
185                 Friendly
186    Mahinda Rajapaksa Cup
Name: tournament, Length: 187, dtype: object

In [54]:
def find_friendly(x):
    if x == "Friendly":
        return 1
    else: return 0

full_df["is_friendly"] = full_df["tournament"].apply(lambda x: find_friendly(x))
full_df = pd.get_dummies(full_df, columns=["is_friendly"])

In [55]:
base_df = full_df[["date", "home_team", "away_team", "rank_home", "rank_away","home_score", "away_score","result", "rank_dif", "rank_change_home", "rank_change_away", 'home_goals_mean',
       'home_goals_mean_l5', 'home_goals_suf_mean', 'home_goals_suf_mean_l5',
       'home_rank_mean', 'home_rank_mean_l5', 'home_points_mean',
       'home_points_mean_l5', 'away_goals_mean', 'away_goals_mean_l5',
       'away_goals_suf_mean', 'away_goals_suf_mean_l5', 'away_rank_mean',
       'away_rank_mean_l5', 'away_points_mean', 'away_points_mean_l5','home_game_points_mean', 'home_game_points_mean_l5','home_game_points_rank_mean', 'home_game_points_rank_mean_l5','away_game_points_mean',
       'away_game_points_mean_l5', 'away_game_points_rank_mean',
       'away_game_points_rank_mean_l5',
       'is_friendly_0', 'is_friendly_1']]

base_df.tail()

Unnamed: 0,date,home_team,away_team,rank_home,rank_away,home_score,away_score,result,rank_dif,rank_change_home,...,home_game_points_mean,home_game_points_mean_l5,home_game_points_rank_mean,home_game_points_rank_mean_l5,away_game_points_mean,away_game_points_mean_l5,away_game_points_rank_mean,away_game_points_rank_mean_l5,is_friendly_0,is_friendly_1
182,2019-06-14,Cameroon,Mali,51,62,1,1,2,-11,-3,...,,,,,0.95082,1.0,0.027368,0.026934,0,1
183,2019-06-14,Madagascar,Mauritania,108,103,1,3,1,5,1,...,,,,,0.95082,1.0,0.027368,0.026934,0,1
184,2019-06-14,Brazil,Bolivia,3,62,3,0,0,-59,0,...,,,,,0.95082,1.0,0.027368,0.026934,1,0
185,2021-05-27,Turkey,Azerbaijan,29,110,2,1,0,-81,0,...,2.0,2.0,0.04127,0.04127,0.956989,1.0,0.027182,0.026144,0,1
186,2021-11-19,Sri Lanka,Seychelles,204,197,3,3,2,7,0,...,1.0,1.0,0.006289,0.006289,0.951872,0.8,0.027036,0.009477,1,0


In [56]:
df = base_df.copy()

In [66]:
def no_draw(x):
    if x == 2:
        return 1
    else:
        return x
    
df["target"] = df["result"].apply(lambda x: no_draw(x))
df.head()

Unnamed: 0,date,home_team,away_team,rank_home,rank_away,home_score,away_score,result,rank_dif,rank_change_home,...,home_game_points_mean_l5,home_game_points_rank_mean,home_game_points_rank_mean_l5,away_game_points_mean,away_game_points_mean_l5,away_game_points_rank_mean,away_game_points_rank_mean_l5,is_friendly_0,is_friendly_1,target
0,2010-08-11,Albania,Uzbekistan,70,88,1,0,0,-18,-1,...,,,,,,,,0,1,0
1,2010-08-11,Algeria,Gabon,33,34,1,2,1,-1,0,...,,,,,,,,0,1,1
2,2010-08-11,Angola,Uruguay,87,6,0,2,1,81,1,...,,,,,,,,0,1,1
3,2010-08-11,Armenia,Iran,96,65,1,3,1,31,0,...,,,,,,,,0,1,1
4,2010-08-11,Austria,Switzerland,60,17,0,1,1,43,0,...,,,,,,,,0,1,1


In [67]:
dif = df.copy()
dif.loc[:, "goals_dif"] = dif["home_goals_mean"] - dif["away_goals_mean"]
dif.loc[:, "goals_dif_l5"] = dif["home_goals_mean_l5"] - dif["away_goals_mean_l5"]
dif.loc[:, "goals_suf_dif"] = dif["home_goals_suf_mean"] - dif["away_goals_suf_mean"]
dif.loc[:, "goals_suf_dif_l5"] = dif["home_goals_suf_mean_l5"] - dif["away_goals_suf_mean_l5"]
dif.loc[:, "goals_made_suf_dif"] = dif["home_goals_mean"] - dif["away_goals_suf_mean"]
dif.loc[:, "goals_made_suf_dif_l5"] = dif["home_goals_mean_l5"] - dif["away_goals_suf_mean_l5"]
dif.loc[:, "goals_suf_made_dif"] = dif["home_goals_suf_mean"] - dif["away_goals_mean"]
dif.loc[:, "goals_suf_made_dif_l5"] = dif["home_goals_suf_mean_l5"] - dif["away_goals_mean_l5"]
dif.loc[:, "dif_points"] = dif["home_game_points_mean"] - dif["away_game_points_mean"]
dif.loc[:, "dif_points_l5"] = dif["home_game_points_mean_l5"] - dif["away_game_points_mean_l5"]
dif.loc[:, "dif_points_rank"] = dif["home_game_points_rank_mean"] - dif["away_game_points_rank_mean"]
dif.loc[:, "dif_points_rank_l5"] = dif["home_game_points_rank_mean_l5"] - dif["away_game_points_rank_mean_l5"]
dif.loc[:, "dif_rank_agst"] = dif["home_rank_mean"] - dif["away_rank_mean"]
dif.loc[:, "dif_rank_agst_l5"] = dif["home_rank_mean_l5"] - dif["away_rank_mean_l5"]
dif.loc[:, "goals_per_ranking_dif"] = (dif["home_goals_mean"] / dif["home_rank_mean"]) - (dif["away_goals_mean"] / dif["away_rank_mean"])
dif.loc[:, "goals_per_ranking_suf_dif"] = (dif["home_goals_suf_mean"] / dif["home_rank_mean"]) - (dif["away_goals_suf_mean"] / dif["away_rank_mean"])
dif.loc[:, "goals_per_ranking_dif_l5"] = (dif["home_goals_mean_l5"] / dif["home_rank_mean"]) - (dif["away_goals_mean_l5"] / dif["away_rank_mean"])
dif.loc[:, "goals_per_ranking_suf_dif_l5"] = (dif["home_goals_suf_mean_l5"] / dif["home_rank_mean"]) - (dif["away_goals_suf_mean_l5"] / dif["away_rank_mean"])

In [68]:
def create_db(df):
    columns = ["home_team", "away_team", "target", "rank_dif", "home_goals_mean", "home_rank_mean", "away_goals_mean", "away_rank_mean", "home_rank_mean_l5", "away_rank_mean_l5", "home_goals_suf_mean", "away_goals_suf_mean", "home_goals_mean_l5", "away_goals_mean_l5", "home_goals_suf_mean_l5", "away_goals_suf_mean_l5", "home_game_points_rank_mean", "home_game_points_rank_mean_l5", "away_game_points_rank_mean", "away_game_points_rank_mean_l5","is_friendly_0", "is_friendly_1"]
    
    base = df.loc[:, columns]
    base.loc[:, "goals_dif"] = base["home_goals_mean"] - base["away_goals_mean"]
    base.loc[:, "goals_dif_l5"] = base["home_goals_mean_l5"] - base["away_goals_mean_l5"]
    base.loc[:, "goals_suf_dif"] = base["home_goals_suf_mean"] - base["away_goals_suf_mean"]
    base.loc[:, "goals_suf_dif_l5"] = base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"]
    base.loc[:, "goals_per_ranking_dif"] = (base["home_goals_mean"] / base["home_rank_mean"]) - (base["away_goals_mean"] / base["away_rank_mean"])
    base.loc[:, "dif_rank_agst"] = base["home_rank_mean"] - base["away_rank_mean"]
    base.loc[:, "dif_rank_agst_l5"] = base["home_rank_mean_l5"] - base["away_rank_mean_l5"]
    base.loc[:, "dif_points_rank"] = base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"]
    base.loc[:, "dif_points_rank_l5"] = base["home_game_points_rank_mean_l5"] - base["away_game_points_rank_mean_l5"]
    model_df = base[["home_team", "away_team", "target", "rank_dif", "goals_dif", "goals_dif_l5", "goals_suf_dif", "goals_suf_dif_l5", "goals_per_ranking_dif", "dif_rank_agst", "dif_rank_agst_l5", "dif_points_rank", "dif_points_rank_l5", "is_friendly_0", "is_friendly_1"]]
    return model_df

In [65]:
model_db = create_db(df)
model_db

Unnamed: 0,home_team,away_team,target,rank_dif,goals_dif,goals_dif_l5,goals_suf_dif,goals_suf_dif_l5,goals_per_ranking_dif,dif_rank_agst,dif_rank_agst_l5,dif_points_rank,dif_points_rank_l5,is_friendly_0,is_friendly_1
0,Albania,Uzbekistan,0,-18,,,,,,,,,,0,1
1,Algeria,Gabon,1,-1,,,,,,,,,,0,1
2,Angola,Uruguay,1,81,,,,,,,,,,0,1
3,Armenia,Iran,1,31,,,,,,,,,,0,1
4,Austria,Switzerland,1,43,,,,,,,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,Cameroon,Mali,1,-11,,,,,,,,,,0,1
183,Madagascar,Mauritania,1,5,,,,,,,,,,0,1
184,Brazil,Bolivia,0,-59,,,,,,,,,,1,0
185,Turkey,Azerbaijan,0,-81,0.672043,0.5,-1.220430,-1.7,0.012081,-11.768817,8.6,0.014088,0.015126,0,1


In [69]:
model_db= model_db.dropna()

# MODEL GRADIENT BOOSTING

Le modèle gradient boosting est une technique de machine learning utilisée pour construire des modèles prédictifs, généralement des arbres de décision. C'est un type de méthode d'ensemble, ce qui signifie qu'il combine les prédictions de plusieurs modèles pour améliorer les performances prédictives globales. Dans le gradient boosting, chaque modèle est entraîné pour faire des prédictions légèrement meilleures que le hasard, et les modèles sont ensuite combinés de manière à minimiser l'erreur de prédiction globale. Cela se fait en entraînant séquentiellement de nouveaux modèles pour corriger les erreurs des modèles précédents. Le résultat est un modèle qui fait des prédictions très précises.

In [70]:
X = model_db.iloc[:, 3:]
y = model_db[["target"]]

In [71]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=1)
gb = GradientBoostingClassifier(random_state=5)

params = {"learning_rate": [0.01, 0.1, 0.5],
            "min_samples_split": [5, 10],
            "min_samples_leaf": [3, 5],
            "max_depth":[3,5,10],
            "max_features":["sqrt"],
            "n_estimators":[100, 200]
         } 

gb_cv = GridSearchCV(gb, params, cv = 3, n_jobs = -1, verbose = False)

gb_cv.fit(X_train.values, np.ravel(y_train))

GridSearchCV(cv=3, estimator=GradientBoostingClassifier(random_state=5),
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 0.5],
                         'max_depth': [3, 5, 10], 'max_features': ['sqrt'],
                         'min_samples_leaf': [3, 5],
                         'min_samples_split': [5, 10],
                         'n_estimators': [100, 200]},
             verbose=False)

In [73]:
gb = gb_cv.best_estimator_
gb

GradientBoostingClassifier(learning_rate=0.01, max_features='sqrt',
                           min_samples_leaf=3, min_samples_split=10,
                           random_state=5)

In [75]:
y_pred = gb.predict(X_test)



### ACCURACY du modèle est égale a 0.63 c'est 63% donc on peut dire que ce modèle est bien approprié

In [76]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5

In [149]:
def find_features(team_1, team_2):
    rank_dif = team_1[0] - team_2[0]
    goals_dif = team_1[1] - team_2[1]
    goals_dif_l5 = team_1[2] - team_2[2]
    goals_suf_dif = team_1[3] - team_2[3]
    goals_suf_dif_l5 = team_1[4] - team_2[4]
    goals_per_ranking_dif = (team_1[1]/team_1[5]) - (team_2[1]/team_2[5])
    dif_rank_agst = team_1[5] - team_2[5]
    dif_rank_agst_l5 = team_1[6] - team_2[6]
    dif_gp_rank = team_1[7] - team_2[7]
    dif_gp_rank_l5 = team_1[8] - team_2[8]
    
    return [rank_dif, goals_dif, goals_dif_l5, goals_suf_dif, goals_suf_dif_l5, goals_per_ranking_dif, dif_rank_agst, dif_rank_agst_l5, dif_gp_rank, dif_gp_rank_l5, 1, 0]

In [150]:
def find_stats(team_1):
#team_1 = "Qatar"
    past_games = team_stats_raw[(team_stats_raw["team"] == team_1)].sort_values("date")
    last5 = team_stats_raw[(team_stats_raw["team"] == team_1)].sort_values("date").tail(5)

    team_1_rank = past_games["rank"].values[-1]
    team_1_goals = past_games.score.mean()
    team_1_goals_l5 = last5.score.mean()
    team_1_goals_suf = past_games.suf_score.mean()
    team_1_goals_suf_l5 = last5.suf_score.mean()
    team_1_rank_suf = past_games.rank_suf.mean()
    team_1_rank_suf_l5 = last5.rank_suf.mean()
    team_1_gp_rank = past_games.points_by_rank.mean()
    team_1_gp_rank_l5 = last5.points_by_rank.mean()

    return [team_1_rank, team_1_goals, team_1_goals_l5, team_1_goals_suf, team_1_goals_suf_l5, team_1_rank_suf, team_1_rank_suf_l5, team_1_gp_rank, team_1_gp_rank_l5]

In [151]:
matches = [('Argentina','Croatia'),('France','Morocco')]
final=[]
for teams in matches:
    draw = False
    team_1 = find_stats(teams[0])
    team_2 = find_stats(teams[1])

    

    features_g1 = find_features(team_1, team_2)
    features_g2 = find_features(team_2, team_1)

    probs_g1 = gb.predict_proba([features_g1])
    probs_g2 = gb.predict_proba([features_g2])
    
    team_1_prob_g1 = probs_g1[0][0]
    team_1_prob_g2 = probs_g2[0][1]
    team_2_prob_g1 = probs_g1[0][1]
    team_2_prob_g2 = probs_g2[0][0]

    team_1_prob = (probs_g1[0][0] + probs_g2[0][1])/2
    team_2_prob = (probs_g2[0][0] + probs_g1[0][1])/2
    
    
                
    if team_1_prob > team_2_prob:
        winner = teams[0]
        winner_proba = team_1_prob
        final.append(winner)
        print('The Winner is '+teams[0]+'with probability equal to '+str(winner_proba)+', and it has qualified to THE FINAL')
        print("                                                                                                            ")        
    elif team_2_prob > team_1_prob:  
        winner = teams[2]
        winner_proba = team_2_prob
        final.append(winner)
        print('The Winner is '+teams[1]+'with probability equal to '+str(winner_proba)+', and it has qualified to THE FINAL')
        
    



ValueError: X has 12 features, but GradientBoostingClassifier is expecting 14 features as input.