In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Load the historical match results and preview the first ten rows.
RESULTS_CSV_PATH = "/kaggle/input/international-football-results-from-1872-to-2017/results.csv"
results = pd.read_csv(RESULTS_CSV_PATH)
results.head(10)

In [5]:
# Check for types of columns
# `dtypes` lists, per column, the dtype pandas assigned when the CSV was read.
results.dtypes

### Convert the date column to datetime

In [6]:
# Overwrite the 'date' column with its datetime-parsed version so that the
# year / month / day components can be read from it afterwards.
results['date'] = pd.to_datetime(results['date'])

### Create columns for year, month and day

In [7]:
# Split the parsed date into calendar components with the vectorised `.dt`
# accessor rather than calling a Python lambda once per row.
results['year'] = results['date'].dt.year
results['month'] = results['date'].dt.month
results['day'] = results['date'].dt.day

### Create the home_team_wins, away_team_wins and draw columns

In [8]:
# Derive the three boolean outcome flags from the sign of the goal difference
# (home score minus away score), computed once and reused.
goal_difference = results['home_score'] - results['away_score']
results['home_team_wins'] = goal_difference.gt(0)
results['away_team_wins'] = goal_difference.lt(0)
results['draw'] = goal_difference.eq(0)

### Store the unique home teams and away teams in arrays


In [9]:
# Distinct team names appearing as the home side and as the away side.
home_teams, away_teams = (
    results[side].unique() for side in ('home_team', 'away_team')
)

### tournaments, cities and countries

In [10]:
# Distinct values of the tournament, city and country columns.
tournaments, cities, countries = (
    results[column].unique() for column in ('tournament', 'city', 'country')
)

### See results of Morocco


In [11]:
# Keep every match in which Morocco appears as either the home or the away team.
results_morocco = results.loc[
    results[['home_team', 'away_team']].eq('Morocco').any(axis=1), :
]
results_morocco

### Get the results of the matches between two specific countries

In [12]:
def get_results_of_two_countries(results_, country1, country2):
    """Return the rows of ``results_`` for matches between the two countries.

    A row is kept when one of the countries is its ``home_team`` and the other
    its ``away_team``, whichever of the two was at home. All columns are
    returned and the original row order and index are preserved.
    """
    home_side = results_.home_team
    away_side = results_.away_team
    first_at_home = (home_side == country1) & (away_side == country2)
    second_at_home = (home_side == country2) & (away_side == country1)
    return results_.loc[first_at_home | second_at_home, :]

### Get results for Morocco and Brazil

In [13]:
# Display every match between Morocco and Brazil, whichever side was the home team.
get_results_of_two_countries(results, 'Morocco', 'Brazil')

### Define a function that returns the historical probabilities of a win, a loss and a draw for two countries

In [14]:
def get_hist_proba_of_two_countries(results_, country1, country2):
    """Return the historical head-to-head outcome ratios of two countries.

    Parameters
    ----------
    results_ : pandas.DataFrame
        Match results containing the columns 'home_team' and 'away_team' and
        the boolean flag columns 'home_team_wins', 'away_team_wins' and 'draw'.
    country1, country2
        Team names as they appear in the team columns. All ratios are
        expressed from ``country1``'s point of view.

    Returns
    -------
    dict
        ``{(country1, country2): {'Win': w, 'Loose': l, 'Draw': d, 'Games': n}}``
        where ``n`` is the number of counted games (rows with one of the three
        outcome flags set) and ``w``, ``l``, ``d`` are the fractions of those
        games won, lost and drawn by ``country1``. Every value is 0 when no
        game is counted. The key 'Loose' is spelled this way on purpose:
        callers index the result with it.
    """
    home_team = results_['home_team']
    away_team = results_['away_team']

    # Matches hosted by country1, and matches hosted by country2.
    # `& ~country1_at_home` only matters when both names are equal: it keeps
    # such rows from being counted twice.
    country1_at_home = (home_team == country1) & (away_team == country2)
    country1_away = (home_team == country2) & (away_team == country1) & ~country1_at_home

    hosted = results_.loc[country1_at_home]
    visited = results_.loc[country1_away]

    # A home win in a match country1 hosted, or an away win in a match it
    # visited, is a win for country1; losses are the mirror image.
    wins = int(hosted['home_team_wins'].sum() + visited['away_team_wins'].sum())
    losses = int(hosted['away_team_wins'].sum() + visited['home_team_wins'].sum())
    draws = int(hosted['draw'].sum() + visited['draw'].sum())
    n_games = wins + losses + draws

    outcome = {'Win': 0, 'Loose': 0, 'Draw': 0, 'Games': 0}
    if n_games > 0:
        outcome = {
            'Win': wins / n_games,
            'Loose': losses / n_games,
            'Draw': draws / n_games,
            'Games': n_games,
        }

    return {(country1, country2): outcome}
    

### Historical probabilities for Argentina against Brazil

In [15]:
# Head-to-head ratios from Argentina's point of view (Argentina is passed as country1):
# 'Win' is the share of games Argentina won against Brazil, 'Loose' the share it lost.
argentina_brazil = get_hist_proba_of_two_countries(results,  'Argentina', 'Brazil')
argentina_brazil

### Define a function that returns historical probabilities for each pair of two countries
### from a given set of results

In [16]:
def get_hist_proba(results_): # not using results to avoid ambiguity !
    """Return historical win / loss / draw ratios for every pair of teams.

    Parameters
    ----------
    results_ : pandas.DataFrame
        Match results with the columns required by
        ``get_hist_proba_of_two_countries`` ('home_team', 'away_team',
        'home_team_wins', 'away_team_wins', 'draw').

    Returns
    -------
    pandas.DataFrame
        Indexed by ('country1', 'country2') with the columns 'games', 'wins',
        'looses' and 'draws'. Each pair that played at least one match
        contributes two rows, one per orientation, so the frame can be looked
        up with either (A, B) or (B, A); 'wins' and 'looses' are swapped
        between the two rows.
    """
    # Every (home_team, away_team) combination that occurs, in sorted order.
    # `size()` is only used to obtain the group index; no score is aggregated.
    fixtures = results_.groupby(['home_team', 'away_team']).size().index

    # Keep a single orientation per pair: drop (country2, country1) when
    # (country1, country2) was already kept. A set makes the lookup constant
    # time while the list preserves the sorted order.
    kept = set()
    unique_pairs = []
    for home, away in fixtures:
        if (away, home) in kept:
            continue
        kept.add((home, away))
        unique_pairs.append((home, away))

    rows = []
    for first, second in unique_pairs:
        stats = get_hist_proba_of_two_countries(results_, first, second)[(first, second)]
        # Two rows per pair: one from each country's point of view.
        rows.append({
            'country1': first,
            'country2': second,
            'games': stats['Games'],
            'wins': stats['Win'],
            'looses': stats['Loose'],
            'draws': stats['Draw'],
        })
        rows.append({
            'country1': second,
            'country2': first,
            'games': stats['Games'],   # number of games is the same
            'wins': stats['Loose'],    # wins and losses are swapped
            'looses': stats['Win'],
            'draws': stats['Draw'],    # draws are the same
        })

    # Passing `columns` keeps the expected layout even when there are no rows.
    historical_ratios = pd.DataFrame(
        rows, columns=['country1', 'country2', 'games', 'wins', 'looses', 'draws']
    )
    return historical_ratios.set_index(['country1', 'country2'])
    

In [17]:
# Load the semicolon-separated list of Qatar 2022 teams and display it.
QATAR_2022_TEAMS_CSV = '/kaggle/input/qatar2022worldcupschudule/Qatar2022-teams.csv'
qatar_2022_teams = pd.read_csv(QATAR_2022_TEAMS_CSV, sep=';')
qatar_2022_teams

### get results for these teams

In [18]:
# Restrict the results to matches in which both the home and the away team
# appear in the Qatar 2022 team list.
qatar_2022_teams_list = list(qatar_2022_teams['Team'])
qatar_2022_results = results.loc[
    results[['home_team', 'away_team']].isin(qatar_2022_teams_list).all(axis=1), :
]

qatar_2022_results

## Get historical probabilities of teams participating in 2022 World Cup

In [19]:
# Compute the pairwise historical ratios for the Qatar 2022 teams and export them.
# The (country1, country2) index is turned back into columns before writing;
# `index=False` (the documented boolean) keeps the positional index out of the file.
hist_proba_qatar_teams = get_hist_proba(qatar_2022_results)
hist_proba_qatar_teams.reset_index().to_csv('/kaggle/working/historical_win-loose-draw_ratios_qatar2022_teams.csv', index=False)
hist_proba_qatar_teams

In [20]:
# Sanity check on Argentina / Brazil: the frame holds one row per orientation,
# so looking up ('Argentina', 'Brazil') or ('Brazil', 'Argentina') should give
# the same probabilities once wins and losses are swapped.
argentina_vs_brazil = hist_proba_qatar_teams.loc[('Argentina', 'Brazil')]
brazil_vs_argentina = hist_proba_qatar_teams.loc[('Brazil', 'Argentina')]

for label, value in (
    ('Probabiliy Argentina wins : ', argentina_vs_brazil['wins']),
    ('Probabiliy Brazil wins : ', argentina_vs_brazil['looses']),
    ('Probabiliy of draw : ', argentina_vs_brazil['draws']),
):
    print(label, value)
print("-----------------------------------------------------------------------------------------------------------")
for label, value in (
    ('Probabiliy Argentina wins : ', brazil_vs_argentina['looses']),
    ('Probabiliy Brazil wins : ', brazil_vs_argentina['wins']),
    ('Probabiliy of draw : ', brazil_vs_argentina['draws']),
):
    print(label, value)


In [21]:
# Write the pairwise ratios to the working directory, with the
# (country1, country2) index flattened into ordinary columns.
# `index=False` is the documented way to omit the positional index.
hist_proba_qatar_teams.reset_index().to_csv('/kaggle/working/hist_proba_qatar_teams.csv', index=False)