In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import datetime
from datetime import datetime
from datetime import timedelta
from datetime import date
import json
from google.cloud import storage
from google.oauth2 import service_account

# Create table_results_current_year

In [3]:
# Ignoring the warning related to Setting with copy.
pd.options.mode.chained_assignment = None

start_year = 2005

# Extract the current year
URL = 'https://www.resultados-futbol.com/ligue_1'
r = requests.get(URL)
page = r.content
soup = BeautifulSoup(page, 'html5lib')

current_year = int(soup.find('div', class_ = "titular-data").text.strip()[0:4])+1

# Define the countries and the years
Years = range(start_year,current_year + 1)

Countries = ['Portugal','Spain','England','Italy','Germany','France']

In [4]:
%%time

# Create a dataframe with the extracted results of all teams for the 6 countries,
# for the current year


# Creation of an empty data frame for all teams in all countries
table_results_current_year = pd.DataFrame()

# Replace the name of the country by the name of the country/league in the
# resultados-futbol website
for Country in Countries:
    if Country == 'Portugal':
        country = 'portugal'
    elif Country == 'Spain':
        country = 'primera'
    elif Country == 'England':
        country = 'premier'
    elif Country == 'Italy':
        country = 'serie_a'
    elif Country == 'Germany':
        country = 'bundesliga'
    elif Country == 'France':
        country = 'ligue_1'
    else:
        print('Country not available, please choose between Portugal,\
    Spain, England, Italy, Germany or France')

    print(f'   Starting {Country} league of {current_year}...')


    # Per each country:


    # Get the URLs of the teams
    URL = f'https://www.resultados-futbol.com/{country}'
    r = requests.get(URL)
    page = r.content
    soup = BeautifulSoup(page, 'html5lib')

    teams_URL_in_national_league = []

    for li in soup.find_all('li', class_ = "shield"):
        a = li.find('a', href=True)
        teams_URL_in_national_league.append(a['href'])


    # Creation of an empty data frame for all teams in one country, one year
    table_all_teams = pd.DataFrame()


    for team in teams_URL_in_national_league:

        URL = f'https://www.resultados-futbol.com/partidos/{team}'
        r = requests.get(URL)
        page = r.content
        soup = BeautifulSoup(page, 'html5lib')


        #  Get the name of the competitions of each team
        list_of_competitions = []

        for div in soup.find_all('div', class_ = "title"):
            img = div.find('img', alt=True)
            list_of_competitions.append(img['alt'])

        # Get the name of the team (not team URL)
        div_class_team = soup.find('div', class_ = "name")
        div_class_team_a = div_class_team.find('a')
        Team = div_class_team_a.get_text()

        # Some teams have a space in the end of the name, we need to
        # delete it
        if Team[-1] == ' ':
            Team = Team[:-1]
        else:
            Team = Team


        # Creation of an empty data frame for all results of one team
        table_team = pd.DataFrame()

        # Creation of data frame all the results of the team,
        # including a column with the name of the competition
        page = pd.read_html(URL)

        # The table with the first competition results is the table
        # that appears in position 5:
        for competition_i in range(5,5+len(list_of_competitions)):
            table_competition_i = pd.DataFrame(page[competition_i])
            table_competition_i['Competition_original_name_URL'] = list_of_competitions[competition_i-5]
            table_team = pd.concat(
                [table_team,table_competition_i],
             ignore_index = True, axis = 0)


        # Concat the table created for one team with the empty dataframe
        table_all_teams = pd.concat(
                [table_all_teams,
                 table_team],
            ignore_index = True, axis = 0)

        print(f'      {Team} concluded')


    print(f'   {Country} league concluded')


    table_all_teams['Year'] = current_year


    # Concat the table created for one country with the empty dataframe
    table_results_current_year = pd.concat(
            [table_results_current_year,
             table_all_teams],
         ignore_index = True, axis = 0)


print(f'{current_year} concluded')
print('\n')

table_results_current_year

   Starting Portugal league of 2022...
      Porto concluded
      Sporting CP concluded
      Benfica concluded
      Sporting Braga concluded
      Gil Vicente concluded
      Vitória Guimarães concluded
      Portimonense concluded
      Estoril concluded
      Marítimo concluded
      Vizela concluded
      CD Santa Clara concluded
      Boavista concluded
      Paços de Ferreira concluded
      Tondela concluded
      Arouca concluded
      Moreirense concluded
      Famalicão concluded
      Belenenses SAD concluded
   Portugal league concluded
   Starting Spain league of 2022...
      Real Madrid concluded
      Sevilla concluded
      Real Betis concluded
      Atlético concluded
      Barcelona concluded
      R. Sociedad concluded
      Villarreal concluded
      Rayo Vallecano concluded
      Athletic concluded
      Valencia concluded
      Osasuna concluded
      Celta concluded
      Espanyol concluded
      Granada concluded
      Elche concluded
      Getafe concluded
 

Unnamed: 0,0,1,2,3,4,5,6,7,8,Competition_original_name_URL,Year
0,,07 Dic 21,Finalizado,Porto,1 - 3,Atlético,,,1625,Champions League Grupo 2,2022
1,,24 Nov 21,Finalizado,Liverpool,2 - 0,Porto,,,556,Champions League Grupo 2,2022
2,,03 Nov 21,Finalizado,Milan,1 - 1,Porto,,,1417,Champions League Grupo 2,2022
3,,19 Oct 21,Finalizado,Porto,1 - 0,Milan,,,765,Champions League Grupo 2,2022
4,,28 Sep 21,Finalizado,Porto,1 - 5,Liverpool,,,261,Champions League Grupo 2,2022
...,...,...,...,...,...,...,...,...,...,...,...
5662,,24 Jul 21,Finalizado,Saint-Étienne,2 - 3,Clermont,,,0,Partidos Amistosos,2022
5663,,21 Jul 21,Aplazado,Saint-Étienne,-,Nice,,,0,Partidos Amistosos,2022
5664,,17 Jul 21,Finalizado,Saint-Étienne,2 - 1,Grenoble,,,10,Partidos Amistosos,2022
5665,,16 Jul 21,Finalizado,Saint-Étienne,1 - 1,Bourg-Péronnas,,,8,Partidos Amistosos,2022


In [5]:
# Create an excel file with the table
table_results_current_year.to_excel(
    'Table_results_current_year_original.xlsx', index = False)

table_results_current_year = pd.read_excel('Table_results_current_year_original.xlsx')

# Create functions to manage the table

In [6]:
def edit_raw_table(raw_table):


    # Drop duplicated games (the original table has all the games of all teams,
    # and as teams play against each other, many games are repeated).

    raw_table.drop_duplicates(inplace=True)

# Clean data, add columns, rename columns, reorder columns
    
    # Exclude all lines with NAs in column 2
    raw_table = raw_table[
        raw_table[2].notna()]
    
    # Exclude all columns with NA
    raw_table = raw_table.dropna(axis=1)
    
    # Rename columns
    raw_table.rename(columns = {1:'Date', 2:'Status', 3:'Home_team',
        4:'Result', 5:'Away_team'}, inplace = True)
    
    # Change the values of the match status
    raw_table['Status'] = raw_table['Status'].apply(
        lambda x: "Postponed" if "Aplazado" in x else \
            "Finalized" if "Finalizado" in x else \
            "Not played yet" if "en " in x else \
            "Still playing" if "'" in x else "Unknown")
    
    # Select only the columns we need
    raw_table = raw_table[[
        "Year","Date","Competition_original_name_URL",
        "Home_team","Away_team","Status","Result"]]

    
    # Create 2 columns with the goals of the home_team and away_team
    list_results = []
    
    for score in list(raw_table['Result']):
        if '-' in score:
            list_results.append(score)
        else:
            list_results.append('-')
    
    raw_table['Result'] = list_results
            
    list_home_score = []
    list_away_score = []
    
    for score in list(raw_table['Result']):
        if score == '-':
            list_home_score.append('-')
            list_away_score.append('-')
        elif '-' in score:
            list_home_score.append(score[:score.index('-')-1])
            list_away_score.append(score[score.index('-')+2:])
        else:
            list_home_score.append('-')
            list_away_score.append('-')
    
    raw_table['Home_score'] = list_home_score
    raw_table['Away_score'] = list_away_score
    
# Create a new column with the name of the country
    # First, we create lists with the names of the competitions per country
    raw_table['Country'] = list(raw_table['Competition_original_name_URL'])

    international_competition_list = ['Champions League', 'Europa League',
                                  'Copa Intercontinental', 'Copa Intertoto',
                                 'Mundial de Clubes', 'Supercopa Europa',
                                      'Previa Champions'
                                 ]

    portuguese_competitions = ['Liga Portuguesa', 
                               'Taça de Portugal', 
                               'Supercopa Portugal', 
                               'Copa de la Liga Portugal',
                              'Liga Portuguesa - Play Offs Ascenso']
    spanish_competitions = ['Primera División', 
                            'Copa del Rey', 
                            'Supercopa de España']
    english_competitions = ['Premier League', 
                            'FA Cup', 
                            'Community Shield', 
                            'EFL Cup']
    italian_competitions = ['Serie A', 
                            'Coppa Italia', 
                            'Supercopa de Italia']
    german_competitions = ['Bundesliga', 
                           'DFB Pokal', 
                           'Supercopa de Alemania', 
                           'Liga Pokal']
    french_competitions = ['Ligue 1', 
                           'Copa de Francia',  
                           'Supercopa Francia', 
                           'Copa de la Liga',
                          'Ligue 1 - Play Offs Ascenso']
    
    # Reset index before applying the following loop
    raw_table.reset_index(inplace = True)

    for i in range(len(list(raw_table['Country']))):
        if raw_table['Country'][i] in international_competition_list:
            raw_table['Country'][i] = 'International'
        elif raw_table['Country'][i] in portuguese_competitions:
            raw_table['Country'][i] = 'Portugal'
        elif raw_table['Country'][i] in spanish_competitions:
            raw_table['Country'][i] = 'Spain'
        elif raw_table['Country'][i] in english_competitions:
            raw_table['Country'][i] = 'England'
        elif raw_table['Country'][i] in italian_competitions:
            raw_table['Country'][i] = 'Italy'
        elif raw_table['Country'][i] in german_competitions:
            raw_table['Country'][i] = 'Germany'
        elif raw_table['Country'][i] in french_competitions:
            raw_table['Country'][i] = 'France'
        else:
            raw_table['Country'][i] = 'Friendly match'


    # Create dictionary to rename the competitions column

    competitions_initial_list = raw_table[
        'Competition_original_name_URL'].unique().tolist()

    dict_competitions = {}

    for initial_competition in competitions_initial_list:
        dict_competitions[initial_competition] = []

    for international_competition in international_competition_list:
        for initial_competition in competitions_initial_list:
            if international_competition in initial_competition:
                dict_competitions[initial_competition] = international_competition

    national_leagues_list = ['Bundesliga', 'Liga Portuguesa', 'Ligue 1',
                            'Premier League', 'Primera División', 'Serie A']

    for national_league in national_leagues_list:
        for initial_competition in competitions_initial_list:
            if national_league in initial_competition:
                dict_competitions[initial_competition] = 'National League'

    national_cups_list = ['Copa de Francia', 'Copa del Rey', 'Coppa Italia',
                            'FA Cup', 'DFB Pokal', 'Taça de Portugal']

    for national_cup in national_cups_list:
        for initial_competition in competitions_initial_list:
            if national_cup in initial_competition:
                dict_competitions[initial_competition] = 'National Cup'

    national_super_cup_list = ['Community Shield', 'Supercopa Francia', 'Supercopa Portugal',
                            'Supercopa de Alemania', 'Supercopa de Italia', 'Supercopa de España']

    for national_super_cup in national_super_cup_list:
        for initial_competition in competitions_initial_list:
            if national_super_cup in initial_competition:
                dict_competitions[initial_competition] = 'National Super Cup'

    national_leagues_cup_list = ['Copa de la Liga', 'EFL Cup', 'Liga Pokal']

    for national_leagues_cup in national_leagues_cup_list:
        for initial_competition in competitions_initial_list:
            if national_leagues_cup in initial_competition:
                dict_competitions[initial_competition] = 'National League Cup'
    
    # Convert the names of the remaining competitions in English
    
    dict_competitions['Trofeo Premier League Asia'] = 'Friendly match'            
    dict_competitions['Copa Intercontinental'] = 'Intercontinental Cup'
    dict_competitions['Copa Intertoto'] = 'Intertoto Cup'
    dict_competitions['Supercopa Europa'] = 'European Supercup'
    dict_competitions['Mundial de Clubes'] = 'Club World Cup'
    dict_competitions['Bundesliga - Play Offs Ascenso'] = 'Bundesliga Play Offs'
    dict_competitions['Liga Portuguesa - Play Offs Ascenso'] ='Liga Portuguesa Play Offs'
    dict_competitions['Ligue 1 - Play Offs Ascenso'] ='Ligue 1 Play Offs'

    # Still, there are competitions that have not been added to the dictionary:
    for competition in list(dict_competitions.keys()):
        if dict_competitions[competition] == 'Previa Champions':
            dict_competitions[competition] = 'Champions League'
    
    # For the ones that have not been added yet, we consider them as Friendly match
    for competition in list(dict_competitions.keys()):
        if dict_competitions[competition] == []:
            dict_competitions[competition] = 'Friendly match'


    # Add a column with the competition "standardized" name,
    # created with the dictionary above

    list_competition_standardized_name = []

    list_competition_original_name_URL = list(
        raw_table['Competition_original_name_URL'])

    for competition_original_name_URL in list_competition_original_name_URL:
        list_competition_standardized_name.append(
            dict_competitions[competition_original_name_URL])

    raw_table['Competition'] = list_competition_standardized_name

    raw_table = raw_table[
        ['Year', 'Country', 'Date',
            'Competition_original_name_URL', 'Competition', 'Home_team', 'Away_team', 
            'Status','Result', 'Home_score', 'Away_score']]
    
    
    # convert months in Spanish to months in English

    dict_months = {
        'Ene': 'Jan','Feb': 'Feb','Mar': 'Mar','Abr': 'Apr',
        'May': 'May','Jun': 'Jun','Jul': 'Jul','Ago': 'Aug',
        'Sep': 'Sep','Oct': 'Oct','Nov': 'Nov','Dic': 'Dec',
    }


    for i in range(len(list(raw_table['Date']))):
        month_es = raw_table['Date'][i][3:6]
        raw_table['Date'][i] = raw_table['Date'][i][0:3] +\
        dict_months[month_es] + raw_table['Date'][i][6:]
    

    # Create a new column with the date in the datetime format
    datetime_date_list = []

    for date in raw_table['Date']:
        date_object = datetime.strptime(date, "%d %b %y")
        datetime_date_list.append(date_object)
    
    raw_table.insert(list(raw_table.columns).index('Date')+1,'Datetime_date_list', datetime_date_list)
    
    
    # Drop results of non played games from previous seasons and reset index
    
    No_result = raw_table[(raw_table['Result'] == '-') &\
                          (raw_table['Year'] < current_year)
                         ].index
    raw_table.drop(No_result, inplace = True)
    raw_table.reset_index(inplace = True)


    # Add columns with 1x2 result and points for home and away team
    
    raw_table.loc[
        raw_table[
            'Home_score'] == raw_table['Away_score'], '1x2'] = 'x'
    raw_table.loc[
        raw_table[
            'Home_score'] > raw_table['Away_score'], '1x2'] = '1'
    raw_table.loc[
        raw_table[
            'Home_score'] < raw_table['Away_score'], '1x2'] = '2'
    
    raw_table['1x2'] = raw_table.apply(
    lambda row: row['1x2'] if row['Result'] != '-' else '-', axis=1)
    
    raw_table.loc[
        raw_table[
            '1x2'] == 'x', 'Points_Home_Team'] = int('1')
    raw_table.loc[
        raw_table[
            '1x2'] == '1', 'Points_Home_Team'] = int('3')
    raw_table.loc[
        raw_table[
            '1x2'] == '2', 'Points_Home_Team'] = int('0')
    raw_table.loc[
        raw_table[
            '1x2'] == 'x', 'Points_Away_Team'] = int('1')
    raw_table.loc[
        raw_table[
            '1x2'] == '1', 'Points_Away_Team'] = int('0')
    raw_table.loc[
        raw_table[
            '1x2'] == '2', 'Points_Away_Team'] = int('3')



    # Drop index column
    raw_table.drop(['index'], axis=1, inplace=True)
    
    return raw_table

In [7]:
def fatigue(raw_table):

    # Calculate the number of games played by each team in the last days.
    last_days = 21


    # Create a new column with the number of games played by the home_team
    # in the last days, excluding friendly matches.
    number_of_games_last_days_home_team_list = []

    for i in range(raw_table.shape[0]):
        Team = raw_table['Home_team'][i]
        match_day = raw_table['Datetime_date_list'][i]
        match_day_minus_last_days = match_day - timedelta(days = last_days)

        number_of_games_last_days_home_team_list.append(
            sum(
        (raw_table.Datetime_date_list >= match_day_minus_last_days) & 
        (raw_table.Datetime_date_list < match_day) & 
        ((raw_table.Home_team == Team) | (raw_table.Away_team == Team)) & 
        (raw_table.Competition != 'Friendly match')
        )
        )

    raw_table['number_of_games_last_days_home_team'] = number_of_games_last_days_home_team_list

    # Create a new column with the number of games played by the away_team
    # in the last days, excluding friendly matches.
    number_of_games_last_days_away_team_list = []

    for i in range(raw_table.shape[0]):
        Team = raw_table['Away_team'][i]
        match_day = raw_table['Datetime_date_list'][i]
        match_day_minus_last_days = match_day - timedelta(days = last_days)

        number_of_games_last_days_away_team_list.append(
            sum(
        (raw_table.Datetime_date_list >= match_day_minus_last_days) & 
        (raw_table.Datetime_date_list < match_day) & 
        ((raw_table.Away_team == Team) | (raw_table.Home_team == Team)) & 
        (raw_table.Competition != 'Friendly match')
        )
        )

    raw_table['number_of_games_last_days_away_team'] = number_of_games_last_days_away_team_list

    # Note: teams outside the 6 national leagues will appear in international
    # games with 0 or very little games in the last days. However this is not
    # a problem, as we are not predicting game results of their national leagues,
    # nor international competitions.
    
    # Exclude all competitions which are not national leagues, which is the only type
    # of competition we will make predictions
    raw_table = raw_table[raw_table['Competition'] == 'National League']
    
    # Re-set index after having dropped many rows
    raw_table.reset_index(drop=True,inplace=True)
    
    return raw_table

In [8]:
def points_respective_year_and_last_games(raw_table):
    
    # Calculate the number of points that each team has in the respective
    # season and also in the last games.
    
    n_last_games = 5
    
    # Create new columns that show the number of points of the home_team
    # and away_team
    
    number_of_points_respective_year_home_team_list = []
    number_of_points_last_games_home_team_list = []
    
    number_of_points_respective_year_away_team_list = []
    number_of_points_last_games_away_team_list = []
    
    
    for i in range(raw_table.shape[0]):
        
        # First, let's start with the points accumulated by the home_team
        # in the respective year and in the last games.

        home_team = raw_table.loc[i, 'Home_team']

        df_aux_respective_year_home_team = raw_table[((raw_table['Home_team'] == home_team) |\
            (raw_table['Away_team'] == home_team)) &\
            (raw_table['Year'] == raw_table.loc[i, 'Year']) &\
            (raw_table['Datetime_date_list'] < raw_table.loc[i, 'Datetime_date_list']) &\
            (raw_table['Competition'] == 'National League')
                          ]
        
        df_aux_last_games_home_team = df_aux_respective_year_home_team.sort_values(
            'Datetime_date_list')[-n_last_games:]
          
        if df_aux_last_games_home_team.shape[0] > 0:
            
            Points_respective_year_home_team = \
            (df_aux_respective_year_home_team[df_aux_respective_year_home_team['Home_team'] == home_team]\
             ['Points_Home_Team'].sum() +\
            df_aux_respective_year_home_team[df_aux_respective_year_home_team['Away_team'] == home_team]\
             ['Points_Away_Team'].sum())        
            
            Points_last_games_home_team = \
            (df_aux_last_games_home_team[df_aux_last_games_home_team['Home_team'] == home_team]\
             ['Points_Home_Team'].sum() +\
            df_aux_last_games_home_team[df_aux_last_games_home_team['Away_team'] == home_team]\
             ['Points_Away_Team'].sum())
        
        else:
            Points_respective_year_home_team = 0
            Points_last_games_home_team = 0
           
        number_of_points_respective_year_home_team_list.append(Points_respective_year_home_team)
        number_of_points_last_games_home_team_list.append(Points_last_games_home_team)
        
        
        # Now, let's calculate how many points the away_team has accumulated
        # in the respective year and in the last games
        
        away_team = raw_table.loc[i, 'Away_team']
        
        df_aux_respective_year_away_team = raw_table[((raw_table['Home_team'] == away_team) |\
            (raw_table['Away_team'] == away_team)) &\
            (raw_table['Year'] == raw_table.loc[i, 'Year']) &\
            (raw_table['Datetime_date_list'] < raw_table.loc[i, 'Datetime_date_list']) &\
            (raw_table['Competition'] == 'National League')
                                                 ]
                                                  
        df_aux_last_games_away_team = df_aux_respective_year_away_team.sort_values(
            'Datetime_date_list')[-n_last_games:]                             
                                                  
        if df_aux_last_games_away_team.shape[0] > 0:
            
            Points_respective_year_away_team = \
            (df_aux_respective_year_away_team[df_aux_respective_year_away_team['Home_team'] == away_team]\
             ['Points_Home_Team'].sum() +\
            df_aux_respective_year_away_team[df_aux_respective_year_away_team['Away_team'] == away_team]\
             ['Points_Away_Team'].sum())
            
            Points_last_games_away_team = \
            (df_aux_last_games_away_team[df_aux_last_games_away_team['Home_team'] == away_team]\
             ['Points_Home_Team'].sum() +\
            df_aux_last_games_away_team[df_aux_last_games_away_team['Away_team'] == away_team]\
             ['Points_Away_Team'].sum())
        
        else:
            Points_respective_year_away_team = 0
            Points_last_games_away_team = 0

        number_of_points_respective_year_away_team_list.append(Points_respective_year_away_team)
        number_of_points_last_games_away_team_list.append(Points_last_games_away_team)
        
    raw_table['points_respective_year_home_team'] = number_of_points_respective_year_home_team_list
    raw_table['points_respective_year_away_team'] = number_of_points_respective_year_away_team_list
    
    raw_table['points_last_games_home_team'] = number_of_points_last_games_home_team_list
    raw_table['points_last_games_away_team'] = number_of_points_last_games_away_team_list


    # We are only counting the points for the National League competitions,
    # as we are not predicting game results of other competitions.

    return raw_table

In [9]:
def points_between_teams(raw_table):

    # Calculate the number of points that each team has got in the
    # same match in the previous years.

    
    # Create new columns that show the number of points of the home_team
    # and away_team for the same match in the previous years
    
    number_of_points_between_teams_home_team_list = []
    
    number_of_points_between_teams_away_team_list = []
    
    for i in range(raw_table.shape[0]):
        
        home_team = raw_table.loc[i, 'Home_team']
        away_team = raw_table.loc[i, 'Away_team']

        df_aux = raw_table[((raw_table['Home_team'] == home_team) &\
            (raw_table['Away_team'] == away_team)) &\
            (raw_table['Datetime_date_list'] < raw_table.loc[i, 'Datetime_date_list']) &\
            (raw_table['Competition'] == 'National League')
                          ]
        
        if df_aux.shape[0] > 0:
            
            Points_home_team = \
            df_aux[df_aux['Home_team'] == home_team]['Points_Home_Team'].sum()
            
            Points_away_team = \
            df_aux[df_aux['Away_team'] == away_team]['Points_Away_Team'].sum()
        
        else:
            Points_home_team = 0
            
            Points_away_team = 0
            
        number_of_points_between_teams_home_team_list.append(Points_home_team)
            
        number_of_points_between_teams_away_team_list.append(Points_away_team)
        
        
        
    raw_table['points_between_teams_home_team'] = number_of_points_between_teams_home_team_list
        
    raw_table['points_between_teams_away_team'] = number_of_points_between_teams_away_team_list


    # We are only counting the points for the National League competitions,
    # as we are not predicting game results of other competitions.

    return raw_table

# Apply functions to table_results_current_year

In [10]:
%%time
table_results_current_year = edit_raw_table(table_results_current_year)
table_results_current_year

CPU times: user 2.22 s, sys: 20.6 ms, total: 2.24 s
Wall time: 2.26 s


Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,Home_score,Away_score,1x2,Points_Home_Team,Points_Away_Team
0,2022,Friendly match,07 Dec 21,2021-12-07,Champions League Grupo 2,Champions League,Porto,Atlético,Finalized,1 - 3,1,3,2,0.0,3.0
1,2022,Friendly match,24 Nov 21,2021-11-24,Champions League Grupo 2,Champions League,Liverpool,Porto,Finalized,2 - 0,2,0,1,3.0,0.0
2,2022,Friendly match,03 Nov 21,2021-11-03,Champions League Grupo 2,Champions League,Milan,Porto,Finalized,1 - 1,1,1,x,1.0,1.0
3,2022,Friendly match,19 Oct 21,2021-10-19,Champions League Grupo 2,Champions League,Porto,Milan,Finalized,1 - 0,1,0,1,3.0,0.0
4,2022,Friendly match,28 Sep 21,2021-09-28,Champions League Grupo 2,Champions League,Porto,Liverpool,Finalized,1 - 5,1,5,2,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3235,2022,France,02 Jan 22,2022-01-02,Copa de Francia,National Cup,Jura Sud,Saint-Étienne,Finalized,1 - 4,1,4,2,0.0,3.0
3236,2022,France,19 Dec 21,2021-12-19,Copa de Francia,National Cup,Lyon-Duchère,Saint-Étienne,Finalized,0 - 1,0,1,2,0.0,3.0
3237,2022,Friendly match,17 Jul 21,2021-07-17,Partidos Amistosos,Friendly match,Saint-Étienne,Grenoble,Finalized,2 - 1,2,1,1,3.0,0.0
3238,2022,Friendly match,16 Jul 21,2021-07-16,Partidos Amistosos,Friendly match,Saint-Étienne,Bourg-Péronnas,Finalized,1 - 1,1,1,x,1.0,1.0


In [11]:
%%time
table_results_current_year = fatigue(table_results_current_year)
table_results_current_year

CPU times: user 14 s, sys: 86.3 ms, total: 14.1 s
Wall time: 14.2 s


Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,Home_score,Away_score,1x2,Points_Home_Team,Points_Away_Team,number_of_games_last_days_home_team,number_of_games_last_days_away_team
0,2022,Portugal,15 May 22,2022-05-15,Liga Portuguesa,National League,Porto,Estoril,Not played yet,-,-,-,-,,,3,3
1,2022,Portugal,08 May 22,2022-05-08,Liga Portuguesa,National League,Benfica,Porto,Not played yet,-,-,-,-,,,2,3
2,2022,Portugal,30 Apr 22,2022-04-30,Liga Portuguesa,National League,Porto,Vizela,Not played yet,-,-,-,-,,,4,3
3,2022,Portugal,24 Apr 22,2022-04-24,Liga Portuguesa,National League,Sporting Braga,Porto,Not played yet,-,-,-,-,,,3,4
4,2022,Portugal,16 Apr 22,2022-04-16,Liga Portuguesa,National League,Porto,Portimonense,Not played yet,-,-,-,-,,,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2127,2022,France,06 Mar 22,2022-03-06,Ligue 1,National League,Saint-Étienne,Metz,Not played yet,-,-,-,-,,,3,3
2128,2022,France,12 Dec 21,2021-12-12,Ligue 1,National League,Metz,Lorient,Finalized,4 - 1,4,1,1,3.0,0.0,4,4
2129,2022,France,30 Oct 21,2021-10-30,Ligue 1,National League,Metz,Saint-Étienne,Finalized,1 - 1,1,1,x,1.0,1.0,2,2
2130,2022,France,10 Apr 22,2022-04-10,Ligue 1,National League,Lorient,Saint-Étienne,Not played yet,-,-,-,-,,,2,2


In [12]:
%%time
table_results_current_year = points_respective_year_and_last_games(table_results_current_year)
table_results_current_year

CPU times: user 19.7 s, sys: 86.9 ms, total: 19.8 s
Wall time: 19.9 s


Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,...,Away_score,1x2,Points_Home_Team,Points_Away_Team,number_of_games_last_days_home_team,number_of_games_last_days_away_team,points_respective_year_home_team,points_respective_year_away_team,points_last_games_home_team,points_last_games_away_team
0,2022,Portugal,15 May 22,2022-05-15,Liga Portuguesa,National League,Porto,Estoril,Not played yet,-,...,-,-,,,3,3,53.0,25.0,0.0,0.0
1,2022,Portugal,08 May 22,2022-05-08,Liga Portuguesa,National League,Benfica,Porto,Not played yet,-,...,-,-,,,2,3,44.0,53.0,0.0,0.0
2,2022,Portugal,30 Apr 22,2022-04-30,Liga Portuguesa,National League,Porto,Vizela,Not played yet,-,...,-,-,,,4,3,53.0,20.0,0.0,0.0
3,2022,Portugal,24 Apr 22,2022-04-24,Liga Portuguesa,National League,Sporting Braga,Porto,Not played yet,-,...,-,-,,,3,4,35.0,53.0,0.0,0.0
4,2022,Portugal,16 Apr 22,2022-04-16,Liga Portuguesa,National League,Porto,Portimonense,Not played yet,-,...,-,-,,,2,2,53.0,26.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2127,2022,France,06 Mar 22,2022-03-06,Ligue 1,National League,Saint-Étienne,Metz,Not played yet,-,...,-,-,,,3,3,15.0,19.0,3.0,0.0
2128,2022,France,12 Dec 21,2021-12-12,Ligue 1,National League,Metz,Lorient,Finalized,4 - 1,...,1,1,3.0,0.0,4,4,12.0,15.0,5.0,0.0
2129,2022,France,30 Oct 21,2021-10-30,Ligue 1,National League,Metz,Saint-Étienne,Finalized,1 - 1,...,1,x,1.0,1.0,2,2,6.0,5.0,3.0,2.0
2130,2022,France,10 Apr 22,2022-04-10,Ligue 1,National League,Lorient,Saint-Étienne,Not played yet,-,...,-,-,,,2,2,17.0,15.0,0.0,0.0


# Correct Team names of table_results_current_year

In [13]:
table_results_previous_years = pd.read_excel('Table_results_previous_years_edited.xlsx')

In [14]:
# Create 2 lists (one for previous years and another for the current year)
# with the unique names of the teams that have played the national leagues.

list_teams_previous_years = []
list1 = list_teams_previous_years

list_teams_current_year = []
list2 = list_teams_current_year


list1 = list(set(list(table_results_previous_years['Home_team']) +\
                 list(table_results_previous_years['Away_team'])))

list2 = list(set(list(table_results_current_year['Home_team']) +\
                 list(table_results_current_year['Away_team'])))

df_match = pd.DataFrame({'Name current year': list2})

df_match

Unnamed: 0,Name current year
0,Girondins Bordeaux
1,Wolfsburg
2,Lazio
3,Levante
4,Real Betis
...,...
111,Sassuolo
112,Burnley
113,Eintracht
114,Moreirense


In [15]:
# Use the library fuzzywuzzy to match the names in both lists. The library
# gives us a score from 0 to 100: the higher the score, the more similar the
# names are. First I create a column with the name and the score of the 2
# top teams most similar, and then a second column only with the score of the
# most similar name.

df_match['Suggested from previous years'] = df_match['Name current year'].apply(
        (lambda x: process.extract(x, list1)[0:2])
    )

df_match['Suggested from previous years Score'] = df_match['Name current year'].apply(
        (lambda x: process.extractOne(x, list1)[1])
    )

df_match.sort_values(by = 'Suggested from previous years Score').head(20)

Unnamed: 0,Name current year,Suggested from previous years,Suggested from previous years Score
82,Brentford,"[(Watford, 62), (Benevento, 56)]",62
74,Clermont,"[(Palermo FC, 64), (Levante, 53)]",64
102,Vizela,"[(Aston Villa, 66), (Sevilla, 62)]",66
14,Venezia,"[(Valencia, 67), (Spezia, 62)]",67
58,Salernitana,"[(SPAL, 68), (Lens, 68)]",68
85,Brighton Hove Alb.,"[(Brighton & Hove Albion, 87), (Torino, 60)]",87
113,Eintracht,"[(Eintracht Frankfurt, 90), (Inter, 72)]",90
80,Olympique,"[(Olympique Marseille, 90), (Olympique Lyonnai...",90
32,Vitória,"[(Vitória Guimarães, 90), (Vitória Setúbal, 90)]",90
61,Paços Ferreira,"[(Paços de Ferreira, 95), (SC Freiburg, 58)]",95


In [16]:
# The first 5 teams (Brentford, Clermont, Vizela, Venezia, Salernitana)
# are suggested names from other teams as they appear in the table for
# the first time in 2022 (it is their first appearence in the most important
# national league since the start year of this project). However, the
# following 6 teams are the ones with similar names in previous years.
# Those (and only those) are the names we need to correct.

# For Vitória and Olympique we get the same score for the 1st and 2nd suggestion.
# I've checked manually what is the right one.

table_results_current_year.loc[(
    table_results_current_year['Home_team'] == 'Brighton Hove Alb.'), 'Home_team'] = 'Brighton & Hove Albion'
table_results_current_year.loc[(
    table_results_current_year['Away_team'] == 'Brighton Hove Alb.'), 'Away_team'] = 'Brighton & Hove Albion'

table_results_current_year.loc[(
    table_results_current_year['Home_team'] == 'Vitória'), 'Home_team'] = 'Vitória Guimarães'
table_results_current_year.loc[(
    table_results_current_year['Away_team'] == 'Vitória'), 'Away_team'] = 'Vitória Guimarães'

table_results_current_year.loc[(
    table_results_current_year['Home_team'] == 'Olympique'), 'Home_team'] = 'Olympique Marseille'
table_results_current_year.loc[(
    table_results_current_year['Away_team'] == 'Olympique'), 'Away_team'] = 'Olympique Marseille'

table_results_current_year.loc[(
    table_results_current_year['Home_team'] == 'Eintracht'), 'Home_team'] = 'Eintracht Frankfurt'
table_results_current_year.loc[(
    table_results_current_year['Away_team'] == 'Eintracht'), 'Away_team'] = 'Eintracht Frankfurt'

table_results_current_year.loc[(
    table_results_current_year['Home_team'] == 'Mönchengladbach'), 'Home_team'] = 'B. Mönchengladbach'
table_results_current_year.loc[(
    table_results_current_year['Away_team'] == 'Mönchengladbach'), 'Away_team'] = 'B. Mönchengladbach'

table_results_current_year.loc[(
    table_results_current_year['Home_team'] == 'Paços Ferreira'), 'Home_team'] = 'Paços de Ferreira'
table_results_current_year.loc[(
    table_results_current_year['Away_team'] == 'Paços Ferreira'), 'Away_team'] = 'Paços de Ferreira'

In [17]:
table_results_current_year.to_excel(
    'Table_results_current_year_edited.xlsx', index = False)

# Join Tables previous years and current year

In [18]:
table_results_previous_years = pd.read_excel('Table_results_previous_years_edited.xlsx')
table_results_current_year = pd.read_excel('Table_results_current_year_edited.xlsx')

In [19]:
table_results_all_years = pd.concat(
            [table_results_previous_years,
             table_results_current_year],
         ignore_index = True, axis = 0)

In [20]:
table_results_all_years

Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,...,Away_score,1x2,Points_Home_Team,Points_Away_Team,number_of_games_last_days_home_team,number_of_games_last_days_away_team,points_respective_year_home_team,points_respective_year_away_team,points_last_games_home_team,points_last_games_away_team
0,2005,Portugal,22 May 05,2005-05-22,Liga Portuguesa,National League,Boavista,Benfica,Finalized,1 - 1,...,1,x,1.0,1.0,2,2,49,64,1,10
1,2005,Portugal,14 May 05,2005-05-14,Liga Portuguesa,National League,Benfica,Sporting CP,Finalized,1 - 0,...,0,1,3.0,0.0,3,5,61,61,7,13
2,2005,Portugal,07 May 05,2005-05-07,Liga Portuguesa,National League,Penafiel,Benfica,Finalized,1 - 0,...,0,1,3.0,0.0,3,4,37,61,9,10
3,2005,Portugal,30 Apr 05,2005-04-30,Liga Portuguesa,National League,Benfica,Os Belenenses,Finalized,1 - 0,...,0,1,3.0,0.0,4,3,58,42,10,10
4,2005,Portugal,24 Apr 05,2005-04-24,Liga Portuguesa,National League,Estoril,Benfica,Finalized,1 - 2,...,2,2,0.0,3.0,3,4,26,55,3,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37744,2022,France,06 Mar 22,2022-03-06,Ligue 1,National League,Saint-Étienne,Metz,Not played yet,-,...,-,-,,,3,3,15,19,3,0
37745,2022,France,12 Dec 21,2021-12-12,Ligue 1,National League,Metz,Lorient,Finalized,4 - 1,...,1,1,3.0,0.0,4,4,12,15,5,0
37746,2022,France,30 Oct 21,2021-10-30,Ligue 1,National League,Metz,Saint-Étienne,Finalized,1 - 1,...,1,x,1.0,1.0,2,2,6,5,3,2
37747,2022,France,10 Apr 22,2022-04-10,Ligue 1,National League,Lorient,Saint-Étienne,Not played yet,-,...,-,-,,,2,2,17,15,0,0


# Apply function points_between_teams

In [21]:
%%time
table_results_all_years = points_between_teams(table_results_all_years)
table_results_all_years

CPU times: user 6min 14s, sys: 1.47 s, total: 6min 16s
Wall time: 6min 22s


Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,...,Points_Home_Team,Points_Away_Team,number_of_games_last_days_home_team,number_of_games_last_days_away_team,points_respective_year_home_team,points_respective_year_away_team,points_last_games_home_team,points_last_games_away_team,points_between_teams_home_team,points_between_teams_away_team
0,2005,Portugal,22 May 05,2005-05-22,Liga Portuguesa,National League,Boavista,Benfica,Finalized,1 - 1,...,1.0,1.0,2,2,49,64,1,10,0.0,0.0
1,2005,Portugal,14 May 05,2005-05-14,Liga Portuguesa,National League,Benfica,Sporting CP,Finalized,1 - 0,...,3.0,0.0,3,5,61,61,7,13,0.0,0.0
2,2005,Portugal,07 May 05,2005-05-07,Liga Portuguesa,National League,Penafiel,Benfica,Finalized,1 - 0,...,3.0,0.0,3,4,37,61,9,10,0.0,0.0
3,2005,Portugal,30 Apr 05,2005-04-30,Liga Portuguesa,National League,Benfica,Os Belenenses,Finalized,1 - 0,...,3.0,0.0,4,3,58,42,10,10,0.0,0.0
4,2005,Portugal,24 Apr 05,2005-04-24,Liga Portuguesa,National League,Estoril,Benfica,Finalized,1 - 2,...,0.0,3.0,3,4,26,55,3,10,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37744,2022,France,06 Mar 22,2022-03-06,Ligue 1,National League,Saint-Étienne,Metz,Not played yet,-,...,,,3,3,15,19,3,0,17.0,5.0
37745,2022,France,12 Dec 21,2021-12-12,Ligue 1,National League,Metz,Lorient,Finalized,4 - 1,...,3.0,0.0,4,4,12,15,5,0,4.0,7.0
37746,2022,France,30 Oct 21,2021-10-30,Ligue 1,National League,Metz,Saint-Étienne,Finalized,1 - 1,...,1.0,1.0,2,2,6,5,3,2,11.0,11.0
37747,2022,France,10 Apr 22,2022-04-10,Ligue 1,National League,Lorient,Saint-Étienne,Not played yet,-,...,,,2,2,17,15,0,0,24.0,9.0


In [22]:
table_results_all_years.to_excel(
    'Table_results_all_years_edited.xlsx', index = False)

# Join the table with fifa data

In [23]:
table_fifa_all_years = pd.read_excel('Table_fifa_all_years_edited.xlsx')
table_fifa_all_years

Unnamed: 0,Year,Country,Name,ATT,MID,DEF,OVR,Link-team,Team-ID,Fifa_team_all_names,Rival_team,Budget_Mill_€
0,2005,Portugal,FC Porto,85,82,77,81,/team/236/fc-porto/fifa05/,236,"['FC Porto', 'F.C. Porto']",Benfica,16.0
1,2005,Portugal,Benfica,80,74,74,75,/team/234/benfica/fifa05/,234,"['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']",Sporting Lisbon,5.0
2,2005,Portugal,Sporting Lisbon,63,70,65,68,/team/237/sporting-lisbon/fifa05/,237,"['Sporting Lisbon', 'Sporting CP Lisbon', 'Spo...",Benfica,1.7
3,2005,Portugal,Belenenses,67,62,62,64,/team/1889/belenenses/fifa05/,1889,"['Belenenses', 'CF Os Belenenses', 'Belém', 'C...",Boavista,0.4
4,2005,Portugal,Nacional,69,59,61,63,/team/1891/nacional/fifa05/,1891,"['Nacional', 'Clube Desportivo Nacional', 'CD ...",Marítimo,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...
2067,2022,France,Clermont Foot,72,71,73,71,/team/1815/clermont-foot/,1815,['Clermont Foot'],AS Saint-Étienne,4.5
2068,2012,England,Arsenal,84,80,81,82,/team/1/arsenal/fifa12/,1,"['Arsenal', 'Arsenal FC']",Tottenham Hotspur,30.0
2069,2013,England,Southampton,73,74,71,73,/team/17/southampton/fifa13/,17,['Southampton'],Portsmouth,7.5
2070,2014,Spain,Villarreal,75,74,72,74,/team/483/villarreal/fifa14/,483,"['Villarreal', 'Villarreal C.F.', 'Villarreal ...",Valencia Club de Fútbol,8.0


In [24]:
table_results_all_years = pd.read_excel('Table_results_all_years_edited.xlsx')
table_results_all_years.head(3)

Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,...,Points_Home_Team,Points_Away_Team,number_of_games_last_days_home_team,number_of_games_last_days_away_team,points_respective_year_home_team,points_respective_year_away_team,points_last_games_home_team,points_last_games_away_team,points_between_teams_home_team,points_between_teams_away_team
0,2005,Portugal,22 May 05,2005-05-22,Liga Portuguesa,National League,Boavista,Benfica,Finalized,1 - 1,...,1.0,1.0,2,2,49,64,1,10,0,0
1,2005,Portugal,14 May 05,2005-05-14,Liga Portuguesa,National League,Benfica,Sporting CP,Finalized,1 - 0,...,3.0,0.0,3,5,61,61,7,13,0,0
2,2005,Portugal,07 May 05,2005-05-07,Liga Portuguesa,National League,Penafiel,Benfica,Finalized,1 - 0,...,3.0,0.0,3,4,37,61,9,10,0,0


#### Create dictionary to match team names

In [25]:
%%time

# create a dictionary whose keys are the team names in the results table
# and the values are the possible names in FIFA for each one of those teams

d = {}

# Build 2 lists: one with unique values of all teams in results table and
# another with unique values of all teams in fifa table

for year in Years:
    for country in Countries:
        results_teams_list_year_country = list(set(
            list(table_results_all_years[(table_results_all_years['Year'] == year) & 
                     (table_results_all_years['Country'] == country)]
                     ['Home_team']) +\
            list(table_results_all_years[(table_results_all_years['Year'] == year) & 
                     (table_results_all_years['Country'] == country)]
                     ['Away_team'])))

        fifa_teams_list_year_country = list(set(
            list(table_fifa_all_years[(table_fifa_all_years['Year'] == year) & 
                     (table_fifa_all_years['Country'] == country)]
                     ['Name'])))
        
        print(year)
        print(country)
        print('\n')

        
# Code to find a possible mismatch between the number of teams per year and
# per country in the results and fifa tables (if data is accurate, it should
# not happen)
        
        
        if len(results_teams_list_year_country) != len(fifa_teams_list_year_country):
            print('********************************************************')
            print('********************************************************')
            print('********************************************************')
            print(f'ERROR: {year},{country},\n\
            Number of teams results table: {len(results_teams_list_year_country)},\n\
            Number of teams fifa table: {len(fifa_teams_list_year_country)}')
            print('********************************************************')
            print('********************************************************')
            print('********************************************************')
            print('\n')
        
        else:

            # We create an auxiliar DataFrame (df_aux) in order to sort the list of results
            # teams per year and per country by wuzzyfuzzy score, starting on the highest
            # score. This will allow to start matching names that have a higher level of
            # confidence, and consequently will allow to have a shorter list of names to
            # match the last teams, whose level of confidence is lower.
            
                
            data = {'Results_name':results_teams_list_year_country}
            df_aux = pd.DataFrame(data)
            
            df_aux['Fifa_name_suggested'] = df_aux['Results_name'].apply(
        (lambda x: process.extractOne(x, fifa_teams_list_year_country)[0])
    )
            
            df_aux['Fifa_name_suggested_fuzzywuzzy_score'] = df_aux['Results_name'].apply(
        (lambda x: process.extractOne(x, fifa_teams_list_year_country)[1])
    )
            
            df_aux.sort_values(by = ['Fifa_name_suggested_fuzzywuzzy_score'], ascending = False,
                          inplace = True)
            
            results_teams_list_year_country = list(df_aux['Results_name'])
            
  
            # Here we look for the best match between a results team name and a fifa team
            # name. For that we run different loops with different levels of fuzzywuzzy
            # score, starting in the highest one.
            
            
            fuzzywuzzy_score_level = [100,90,80,70,60,50,40,30,20,10,0]
            
            for results_team in results_teams_list_year_country:
                
                if results_team in d.keys():
                    continue
                
                else:

                    for score_level in fuzzywuzzy_score_level:

                        fuzzywuzzy_min_score = score_level

                        fifa_team_suggested = process.extractOne(results_team, 
                                fifa_teams_list_year_country)[0]
                        fifa_team_suggested_score = process.extractOne(results_team, 
                                fifa_teams_list_year_country)[1]

                        
                        # We remove the fifa_team_suggested from the list so the same name is
                        # not suggested twice
                        if fifa_team_suggested_score >= fuzzywuzzy_min_score:
                            fifa_teams_list_year_country.remove(fifa_team_suggested)

                            if fifa_team_suggested not in d.values():
                                
                                # With .loc we will create a dataframe just showing the rows in
                                # which the the team name is the fifa_team_suggested and then
                                # with .iloc we will get all the names of the team that appear
                                # in the fifa table
                                if results_team not in d.keys():
                                    d[results_team] = table_fifa_all_years.loc[
                                        table_fifa_all_years['Name']==fifa_team_suggested]\
                                    ['Fifa_team_all_names'].iloc[0]

                                else:
                                    d[results_team].append(
                                table_fifa_all_years.loc[table_fifa_all_years['Name']==fifa_team_suggested]
                                     ['Fifa_team_all_names'].iloc[0])
                                

                            print('Results team:',results_team,'-->','Fifa team:',fifa_team_suggested,
                                  '-->',fifa_team_suggested_score)

                            break
            
            print('\n')
            print('-----------------------------')
            print('\n')

2005
Portugal


Results team: Boavista --> Fifa team: Boavista --> 100
Results team: Nacional --> Fifa team: Nacional --> 100
Results team: Moreirense --> Fifa team: Moreirense --> 100
Results team: Marítimo --> Fifa team: Marítimo --> 100
Results team: Gil Vicente --> Fifa team: Gil Vicente --> 100
Results team: Sporting Braga --> Fifa team: Sporting Braga --> 100
Results team: Benfica --> Fifa team: Benfica --> 100
Results team: Rio Ave --> Fifa team: Rio Ave --> 100
Results team: União de Leiria --> Fifa team: União Leiria --> 95
Results team: Os Belenenses --> Fifa team: Belenenses --> 95
Results team: Beira Mar SC --> Fifa team: SC Beira-Mar --> 95
Results team: Estoril --> Fifa team: Estoril Praia --> 90
Results team: Porto --> Fifa team: FC Porto --> 90
Results team: Penafiel --> Fifa team: F.C. Penafiel --> 90
Results team: Vitória Setúbal --> Fifa team: Vitória Futebol Clube --> 86
Results team: Vitória Guimarães --> Fifa team: Vitória SC --> 86
Results team: Sporting CP --> F

Results team: Nancy --> Fifa team: AS Nancy-Lorraine --> 90
Results team: Troyes --> Fifa team: ES Troyes AC --> 90


-----------------------------


2007
Portugal


Results team: Desportivo Aves --> Fifa team: Deportivo Aves --> 97


-----------------------------


2007
Spain


Results team: Gimnàstic Tarragona --> Fifa team: Tarragona --> 90
Results team: Recreativo --> Fifa team: Recreativo de Huelva --> 90


-----------------------------


2007
England


Results team: Watford --> Fifa team: Watford --> 100
Results team: Sheffield United --> Fifa team: Sheffield United --> 100
Results team: Reading --> Fifa team: Reading FC --> 95


-----------------------------


2007
Italy


Results team: Torino --> Fifa team: Torino --> 100
Results team: Catania --> Fifa team: Catania --> 100


-----------------------------


2007
Germany


Results team: Alemannia Aachen --> Fifa team: Alemannia Aachen --> 100
Results team: Energie Cottbus --> Fifa team: FC Energie Cottbus --> 95


--------------

2018
England


Results team: Huddersfield Town --> Fifa team: Huddersfield Town --> 100
Results team: Brighton & Hove Albion --> Fifa team: Brighton & Hove Albion --> 100


-----------------------------


2018
Italy


Results team: Benevento --> Fifa team: Benevento --> 100
Results team: SPAL --> Fifa team: Spal --> 100


-----------------------------


2018
Germany




-----------------------------


2018
France


Results team: Amiens SC --> Fifa team: Amiens SC Football --> 90


-----------------------------


2019
Portugal


Results team: CD Santa Clara --> Fifa team: Santa Clara --> 95
Results team: Belenenses SAD --> Fifa team: Os Belenenses --> 85


-----------------------------


2019
Spain


Results team: Huesca --> Fifa team: SD Huesca --> 90


-----------------------------


2019
England




-----------------------------


2019
Italy




-----------------------------


2019
Germany




-----------------------------


2019
France


Results team: Nîmes --> Fifa team: Nîmes Olym

In [26]:
d

{'Boavista': "['Boavista', 'Boavista Futebol Clube', 'Boavista FC']",
 'Nacional': "['Nacional', 'Clube Desportivo Nacional', 'CD Nacional', 'Funchal']",
 'Moreirense': "['Moreirense', 'Moreira de Cónegos', 'Moreirense FC']",
 'Marítimo': "['Marítimo', 'Marítimo da Madeira', 'CS Marítimo', 'C. Funchal']",
 'Gil Vicente': "['Gil Vicente', 'V. Barcelos', 'Gil Vicente FC']",
 'Sporting Braga': "['Sporting Braga', 'SC Braga', 'Braga']",
 'Benfica': "['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']",
 'Rio Ave': "['Rio Ave', 'Rio Ave FC']",
 'União de Leiria': "['União Leiria', 'União Desportivo de Leiria', 'União de Leiria, SAD']",
 'Os Belenenses': "['Belenenses', 'CF Os Belenenses', 'Belém', 'C.F. Os Belenenses', 'Os Belenenses']",
 'Beira Mar SC': "['SC Beira-Mar', 'SC Beira Mar']",
 'Estoril': "['Estoril Praia', 'Estoril', 'GD Estoril-Praia']",
 'Porto': "['FC Porto', 'F.C. Porto']",
 'Penafiel': "['F.C. Penafiel', 'FC Penafiel']",
 'Vitória Setúbal': "['Vitória Futebol Clube', 'Vitór

In [27]:
# Manually correct the errors of the dictionary. These errors happen because one of the
# teams has a very different name in the results table compared to the fifa table, or
# because one team has an opponent team with a very similar name, so in these rare cases
# the library fuzzywuzzy does not work well.

print('Before manual correction:')
print('Milan:',d['Milan'])
print('Inter:',d['Inter'])
print('Fiorentina:',d['Fiorentina'])
print('Köln:',d['Köln'])
print('Feirense:',d['Feirense'])
print('Saint-Étienne:',d['Saint-Étienne'])
print('PSG:',d['PSG'])

d['Milan'] = table_fifa_all_years[table_fifa_all_years['Name']=='AC Milan']\
['Fifa_team_all_names'].iloc[0]
d['Inter'] = table_fifa_all_years[table_fifa_all_years['Name']=='Inter Milan']\
['Fifa_team_all_names'].iloc[0]
d['Fiorentina'] = table_fifa_all_years[table_fifa_all_years['Name']=='Firenze']\
['Fifa_team_all_names'].iloc[0]
d['Köln'] = table_fifa_all_years[table_fifa_all_years['Name']=='FC Cologne']\
['Fifa_team_all_names'].iloc[0]
d['Feirense'] = table_fifa_all_years[table_fifa_all_years['Name']=='F. Santa Maria da Feira']\
['Fifa_team_all_names'].iloc[0]
d['Saint-Étienne'] = table_fifa_all_years[table_fifa_all_years['Name']=='AS Saint-Etienne']\
['Fifa_team_all_names'].iloc[0]
d['PSG'] = table_fifa_all_years[table_fifa_all_years['Name']=='Paris Saint-Germain']\
['Fifa_team_all_names'].iloc[0]
d['Olympique'] = table_fifa_all_years[table_fifa_all_years['Name']=='Olympique de Marseille']\
['Fifa_team_all_names'].iloc[0]

print('\n')
print('After manual correction:')
print('Milan:',d['Milan'])
print('Inter:',d['Inter'])
print('Fiorentina:',d['Fiorentina'])
print('Köln:',d['Köln'])
print('Feirense:',d['Feirense'])
print('Saint-Étienne:',d['Saint-Étienne'])
print('PSG:',d['PSG'])

Before manual correction:
Milan: ['Inter Milan', 'Inter']
Inter: ['Firenze', 'Fiorentina', 'ACF Fiorentina']
Fiorentina: ['AC Milan', 'Milan']
Köln: ['Hertha BSC Berlin', 'Hertha BSC', 'Hertha Berlin']
Feirense: ['Paços de Ferreira', 'Paços Ferreira', 'FC Paços de Ferreira']
Saint-Étienne: ['AS Saint-Etienne', 'A.S. Saint-Etienne', 'AS Saint-Étienne']
PSG: ['Paris Saint-Germain']


After manual correction:
Milan: ['AC Milan', 'Milan']
Inter: ['Inter Milan', 'Inter']
Fiorentina: ['Firenze', 'Fiorentina', 'ACF Fiorentina']
Köln: ['FC Cologne', '1. FC Köln']
Feirense: ['F. Santa Maria da Feira', 'CD Feirense']
Saint-Étienne: ['AS Saint-Etienne', 'A.S. Saint-Etienne', 'AS Saint-Étienne']
PSG: ['Paris Saint-Germain']


In [28]:
d

{'Boavista': "['Boavista', 'Boavista Futebol Clube', 'Boavista FC']",
 'Nacional': "['Nacional', 'Clube Desportivo Nacional', 'CD Nacional', 'Funchal']",
 'Moreirense': "['Moreirense', 'Moreira de Cónegos', 'Moreirense FC']",
 'Marítimo': "['Marítimo', 'Marítimo da Madeira', 'CS Marítimo', 'C. Funchal']",
 'Gil Vicente': "['Gil Vicente', 'V. Barcelos', 'Gil Vicente FC']",
 'Sporting Braga': "['Sporting Braga', 'SC Braga', 'Braga']",
 'Benfica': "['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']",
 'Rio Ave': "['Rio Ave', 'Rio Ave FC']",
 'União de Leiria': "['União Leiria', 'União Desportivo de Leiria', 'União de Leiria, SAD']",
 'Os Belenenses': "['Belenenses', 'CF Os Belenenses', 'Belém', 'C.F. Os Belenenses', 'Os Belenenses']",
 'Beira Mar SC': "['SC Beira-Mar', 'SC Beira Mar']",
 'Estoril': "['Estoril Praia', 'Estoril', 'GD Estoril-Praia']",
 'Porto': "['FC Porto', 'F.C. Porto']",
 'Penafiel': "['F.C. Penafiel', 'FC Penafiel']",
 'Vitória Setúbal': "['Vitória Futebol Clube', 'Vitór

#### Join Results table and Fifa table

In [29]:
# Add 2 auxiliar columns to table_results_all_years with all the fifa team names for both home
# and away team to help joining both tables

table_results_all_years['Home_team_fifa_team_all_names'] = table_results_all_years['Home_team'].apply(
        (lambda x: d[x])
    )

table_results_all_years['Away_team_fifa_team_all_names'] = table_results_all_years['Away_team'].apply(
        (lambda x: d[x])
    )

table_results_all_years

Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,...,number_of_games_last_days_home_team,number_of_games_last_days_away_team,points_respective_year_home_team,points_respective_year_away_team,points_last_games_home_team,points_last_games_away_team,points_between_teams_home_team,points_between_teams_away_team,Home_team_fifa_team_all_names,Away_team_fifa_team_all_names
0,2005,Portugal,22 May 05,2005-05-22,Liga Portuguesa,National League,Boavista,Benfica,Finalized,1 - 1,...,2,2,49,64,1,10,0,0,"['Boavista', 'Boavista Futebol Clube', 'Boavis...","['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']"
1,2005,Portugal,14 May 05,2005-05-14,Liga Portuguesa,National League,Benfica,Sporting CP,Finalized,1 - 0,...,3,5,61,61,7,13,0,0,"['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']","['Sporting Lisbon', 'Sporting CP Lisbon', 'Spo..."
2,2005,Portugal,07 May 05,2005-05-07,Liga Portuguesa,National League,Penafiel,Benfica,Finalized,1 - 0,...,3,4,37,61,9,10,0,0,"['F.C. Penafiel', 'FC Penafiel']","['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']"
3,2005,Portugal,30 Apr 05,2005-04-30,Liga Portuguesa,National League,Benfica,Os Belenenses,Finalized,1 - 0,...,4,3,58,42,10,10,0,0,"['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']","['Belenenses', 'CF Os Belenenses', 'Belém', 'C..."
4,2005,Portugal,24 Apr 05,2005-04-24,Liga Portuguesa,National League,Estoril,Benfica,Finalized,1 - 2,...,3,4,26,55,3,10,0,0,"['Estoril Praia', 'Estoril', 'GD Estoril-Praia']","['Benfica', 'Sport Lisboa Benfica', 'SL Benfica']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37744,2022,France,06 Mar 22,2022-03-06,Ligue 1,National League,Saint-Étienne,Metz,Not played yet,-,...,3,3,15,19,3,0,17,5,"['AS Saint-Etienne', 'A.S. Saint-Etienne', 'AS...","['FC Metz', 'Football Club de Metz']"
37745,2022,France,12 Dec 21,2021-12-12,Ligue 1,National League,Metz,Lorient,Finalized,4 - 1,...,4,4,12,15,5,0,4,7,"['FC Metz', 'Football Club de Metz']","['Lorient', 'FC Lorient Bretagne Sud', 'FC Lor..."
37746,2022,France,30 Oct 21,2021-10-30,Ligue 1,National League,Metz,Saint-Étienne,Finalized,1 - 1,...,2,2,6,5,3,2,11,11,"['FC Metz', 'Football Club de Metz']","['AS Saint-Etienne', 'A.S. Saint-Etienne', 'AS..."
37747,2022,France,10 Apr 22,2022-04-10,Ligue 1,National League,Lorient,Saint-Étienne,Not played yet,-,...,2,2,17,15,0,0,24,9,"['Lorient', 'FC Lorient Bretagne Sud', 'FC Lor...","['AS Saint-Etienne', 'A.S. Saint-Etienne', 'AS..."


In [30]:
%%time

# Join table_results_all_years and fifa table

# first we start adding to the results table the columns we need
# from the fifa table just for the home teams

df_join_only_home = pd.DataFrame()

Columns = ['ATT','MID','DEF','OVR','Rival_team','Budget_Mill_€']


for year in Years:
    for country in Countries:
        df_1 = table_results_all_years[(table_results_all_years['Year'] == year) &
                  (table_results_all_years['Country'] == country)
                  ]

        df_2 = table_fifa_all_years[(table_fifa_all_years['Year'] == year) &
                  (table_fifa_all_years['Country'] == country)
                  ]

        df_aux = pd.merge(df_1, df_2[['Fifa_team_all_names','ATT','MID','DEF',
                                      'OVR','Rival_team','Budget_Mill_€']], 
            left_on= 'Home_team_fifa_team_all_names', 
            right_on = 'Fifa_team_all_names')

        df_join_only_home = pd.concat([df_join_only_home,df_aux])



df_join_only_home.drop(['Fifa_team_all_names'], axis=1, inplace=True)

# Here we rename the new columns and add "Home_team" as a prefix
for column in Columns:
    df_join_only_home.rename({
    column:f'Home_team_{column}'
    }, axis=1, inplace=True)


# Now that we have added the home team data, let's add it for the away team.
# Here one of the tables to merge is the one we had generated for the
# home_team
    
df_join_total = pd.DataFrame()

for year in Years:
    for country in Countries:
        df_1 = df_join_only_home[(df_join_only_home['Year'] == year) &
                  (df_join_only_home['Country'] == country)
                  ]

        df_2 = table_fifa_all_years[(table_fifa_all_years['Year'] == year) &
                  (table_fifa_all_years['Country'] == country)
                  ]

        df_aux = pd.merge(df_1, df_2[['Fifa_team_all_names','ATT','MID','DEF',
                                      'OVR','Rival_team','Budget_Mill_€']], 
            left_on= 'Away_team_fifa_team_all_names', 
            right_on = 'Fifa_team_all_names')

        df_join_total = pd.concat([df_join_total,df_aux])


df_join_total.drop(['Fifa_team_all_names'], axis=1, inplace=True)

# Here we rename the new columns and add "Away_team" as a prefix
for column in Columns:
    df_join_total.rename({
    column:f'Away_team_{column}'
    }, axis=1, inplace=True)

df_join_total

CPU times: user 2.59 s, sys: 163 ms, total: 2.76 s
Wall time: 2.76 s


Unnamed: 0,Year,Country,Date,Datetime_date_list,Competition_original_name_URL,Competition,Home_team,Away_team,Status,Result,...,Home_team_DEF,Home_team_OVR,Home_team_Rival_team,Home_team_Budget_Mill_€,Away_team_ATT,Away_team_MID,Away_team_DEF,Away_team_OVR,Away_team_Rival_team,Away_team_Budget_Mill_€
0,2005,Portugal,22 May 05,2005-05-22,Liga Portuguesa,National League,Boavista,Benfica,Finalized,1 - 1,...,53,58,Vitória SC,0.375,80,74,74,75,Sporting Lisbon,5.0
1,2005,Portugal,07 May 05,2005-05-07,Liga Portuguesa,National League,Penafiel,Benfica,Finalized,1 - 0,...,55,61,FC Porto,0.100,80,74,74,75,Sporting Lisbon,5.0
2,2005,Portugal,24 Apr 05,2005-04-24,Liga Portuguesa,National League,Estoril,Benfica,Finalized,1 - 2,...,52,59,Vitória Futebol Clube,2.000,80,74,74,75,Sporting Lisbon,5.0
3,2005,Portugal,10 Apr 05,2005-04-10,Liga Portuguesa,National League,Rio Ave,Benfica,Finalized,1 - 0,...,48,52,Benfica,2.900,80,74,74,75,Sporting Lisbon,5.0
4,2005,Portugal,19 Mar 05,2005-03-19,Liga Portuguesa,National League,Vitória Setúbal,Benfica,Finalized,0 - 2,...,56,61,Belenenses,0.350,80,74,74,75,Sporting Lisbon,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2022,France,03 Oct 21,2021-10-03,Ligue 1,National League,Stade Rennais,PSG,Finalized,2 - 0,...,75,76,FC Nantes,14.000,88,83,85,86,Olympique de Marseille,160.0
376,2022,France,22 Sep 21,2021-09-22,Ligue 1,National League,Metz,PSG,Finalized,1 - 2,...,73,73,AS Nancy,7.000,88,83,85,86,Olympique de Marseille,160.0
377,2022,France,29 Aug 21,2021-08-29,Ligue 1,National League,Stade de Reims,PSG,Finalized,0 - 2,...,74,73,ESTAC Troyes,8.500,88,83,85,86,Olympique de Marseille,160.0
378,2022,France,20 Aug 21,2021-08-20,Ligue 1,National League,Stade Brestois,PSG,Finalized,2 - 4,...,73,73,En Avant Guingamp,6.500,88,83,85,86,Olympique de Marseille,160.0


In [31]:
# In order to simply the code from now on, and considering we
# already have all the info we need in just 1 table, let's call
# simply df to the new data frame

df = df_join_total

#### New column: Rivals

In [32]:
# Let's add another column that tells us if the game between the 2 teams
# is between rivals. If they are, the column will show 1, otherwise it
# will show 0.

df['Rivals'] = df.apply(
lambda x: 1 if (x['Home_team'] == x['Away_team_Rival_team'] or x['Away_team'] == x['Home_team_Rival_team']) else 0, axis = 1
)

# Let's drop some irrelevant columns from the df

df.drop(['Datetime_date_list','Competition_original_name_URL',
    'Home_score', 'Away_score',
    'Points_Home_Team', 'Points_Away_Team',
    'Home_team_fifa_team_all_names',
    'Away_team_fifa_team_all_names'], axis = 1, inplace = True)

df

Unnamed: 0,Year,Country,Date,Competition,Home_team,Away_team,Status,Result,1x2,number_of_games_last_days_home_team,...,Home_team_OVR,Home_team_Rival_team,Home_team_Budget_Mill_€,Away_team_ATT,Away_team_MID,Away_team_DEF,Away_team_OVR,Away_team_Rival_team,Away_team_Budget_Mill_€,Rivals
0,2005,Portugal,22 May 05,National League,Boavista,Benfica,Finalized,1 - 1,x,2,...,58,Vitória SC,0.375,80,74,74,75,Sporting Lisbon,5.0,0
1,2005,Portugal,07 May 05,National League,Penafiel,Benfica,Finalized,1 - 0,1,3,...,61,FC Porto,0.100,80,74,74,75,Sporting Lisbon,5.0,0
2,2005,Portugal,24 Apr 05,National League,Estoril,Benfica,Finalized,1 - 2,2,3,...,59,Vitória Futebol Clube,2.000,80,74,74,75,Sporting Lisbon,5.0,0
3,2005,Portugal,10 Apr 05,National League,Rio Ave,Benfica,Finalized,1 - 0,1,2,...,52,Benfica,2.900,80,74,74,75,Sporting Lisbon,5.0,1
4,2005,Portugal,19 Mar 05,National League,Vitória Setúbal,Benfica,Finalized,0 - 2,2,4,...,61,Belenenses,0.350,80,74,74,75,Sporting Lisbon,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2022,France,03 Oct 21,National League,Stade Rennais,PSG,Finalized,2 - 0,1,4,...,76,FC Nantes,14.000,88,83,85,86,Olympique de Marseille,160.0,0
376,2022,France,22 Sep 21,National League,Metz,PSG,Finalized,1 - 2,2,2,...,73,AS Nancy,7.000,88,83,85,86,Olympique de Marseille,160.0,0
377,2022,France,29 Aug 21,National League,Stade de Reims,PSG,Finalized,0 - 2,2,3,...,73,ESTAC Troyes,8.500,88,83,85,86,Olympique de Marseille,160.0,0
378,2022,France,20 Aug 21,National League,Stade Brestois,PSG,Finalized,2 - 4,2,2,...,73,En Avant Guingamp,6.500,88,83,85,86,Olympique de Marseille,160.0,0


In [33]:
df.to_excel(
    'Merged_table_from_2005.xlsx', index = False)

# Read df as the dataframe with all info (table_results + fifa, all years)

In [34]:
df = pd.read_excel('Merged_table_from_2005.xlsx')
df

Unnamed: 0,Year,Country,Date,Competition,Home_team,Away_team,Status,Result,1x2,number_of_games_last_days_home_team,...,Home_team_OVR,Home_team_Rival_team,Home_team_Budget_Mill_€,Away_team_ATT,Away_team_MID,Away_team_DEF,Away_team_OVR,Away_team_Rival_team,Away_team_Budget_Mill_€,Rivals
0,2005,Portugal,22 May 05,National League,Boavista,Benfica,Finalized,1 - 1,x,2,...,58,Vitória SC,0.375,80,74,74,75,Sporting Lisbon,5.0,0
1,2005,Portugal,07 May 05,National League,Penafiel,Benfica,Finalized,1 - 0,1,3,...,61,FC Porto,0.100,80,74,74,75,Sporting Lisbon,5.0,0
2,2005,Portugal,24 Apr 05,National League,Estoril,Benfica,Finalized,1 - 2,2,3,...,59,Vitória Futebol Clube,2.000,80,74,74,75,Sporting Lisbon,5.0,0
3,2005,Portugal,10 Apr 05,National League,Rio Ave,Benfica,Finalized,1 - 0,1,2,...,52,Benfica,2.900,80,74,74,75,Sporting Lisbon,5.0,1
4,2005,Portugal,19 Mar 05,National League,Vitória Setúbal,Benfica,Finalized,0 - 2,2,4,...,61,Belenenses,0.350,80,74,74,75,Sporting Lisbon,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37744,2022,France,03 Oct 21,National League,Stade Rennais,PSG,Finalized,2 - 0,1,4,...,76,FC Nantes,14.000,88,83,85,86,Olympique de Marseille,160.0,0
37745,2022,France,22 Sep 21,National League,Metz,PSG,Finalized,1 - 2,2,2,...,73,AS Nancy,7.000,88,83,85,86,Olympique de Marseille,160.0,0
37746,2022,France,29 Aug 21,National League,Stade de Reims,PSG,Finalized,0 - 2,2,3,...,73,ESTAC Troyes,8.500,88,83,85,86,Olympique de Marseille,160.0,0
37747,2022,France,20 Aug 21,National League,Stade Brestois,PSG,Finalized,2 - 4,2,2,...,73,En Avant Guingamp,6.500,88,83,85,86,Olympique de Marseille,160.0,0


# Train and apply the model

In [35]:
%%time

# Let's extract the games of the current weekday and the ones of the following one

current_weekday_matches = []
following_weekday_matches = []

for Country in Countries:

    if Country == 'Portugal':
        country = 'portugal'
    elif Country == 'Spain':
        country = 'primera'
    elif Country == 'England':
        country = 'premier'
    elif Country == 'Italy':
        country = 'serie_a'
    elif Country == 'Germany':
        country = 'bundesliga'
    elif Country == 'France':
        country = 'ligue_1'
    else:
        print('Country not available, please choose between Portugal,\
    Spain, England, Italy, Germany or France')


    # Current weekday matches (the current weekday is the one that opens by
    # default when we open the page of the respective league in the "resultados
    # futbol" website).

    URL = f'https://www.resultados-futbol.com/{country}'
    r = requests.get(URL)
    page = r.content
    soup = BeautifulSoup(page, 'html5lib')
    for match_ in range(len(soup.find_all(class_ = "summary hidden"))):
        match = soup.find_all(class_ = "summary hidden")[match_].text
        current_weekday_matches.append(match)
    
    
    current_weekday = soup.find_all('div', class_ = "j_cur")[1].find('a').text
    current_weekday_number = int(re.findall(r'\d+', current_weekday)[0])
    following_weekday_number = current_weekday_number+1
    
    # Following weekday matches

    URL = f'https://www.resultados-futbol.com/{country}/grupo1/{following_weekday_number}'
    r = requests.get(URL)
    page = r.content
    soup = BeautifulSoup(page, 'html5lib')
    for match_ in range(len(soup.find_all(class_ = "summary hidden"))):
        match = soup.find_all(class_ = "summary hidden")[match_].text
        following_weekday_matches.append(match)

CPU times: user 17.3 s, sys: 73.1 ms, total: 17.4 s
Wall time: 22.2 s


In [36]:
# Add a column with the type of weekday (current, following, other)

df['Weekday'] = df.apply(
lambda row: 'Current' if ((row['Year'] == current_year) and \
    ((row['Home_team'] + ' - ' + row['Away_team']) in current_weekday_matches)) \
    else 'Following' if ((row['Year'] == current_year) and \
    ((row['Home_team'] + ' - ' + row['Away_team']) in following_weekday_matches)) \
    else 'Other', axis=1)

df[df['Weekday']=='Current'][['Year','Country','Date','Home_team','Away_team','Status','Weekday']]

Unnamed: 0,Year,Country,Date,Home_team,Away_team,Status,Weekday
35696,2022,Portugal,30 Jan 22,Portimonense,Tondela,Still playing,Current
35703,2022,Portugal,02 Feb 22,Benfica,Gil Vicente,Not played yet,Current
35727,2022,Portugal,02 Feb 22,Belenenses SAD,Sporting CP,Not played yet,Current
35736,2022,Portugal,30 Jan 22,Porto,Marítimo,Not played yet,Current
35813,2022,Portugal,30 Jan 22,Vizela,Vitória Guimarães,Still playing,Current
35832,2022,Portugal,01 Feb 22,CD Santa Clara,Boavista,Not played yet,Current
35846,2022,Portugal,31 Jan 22,Estoril,Paços de Ferreira,Not played yet,Current
35857,2022,Portugal,30 Jan 22,Sporting Braga,Moreirense,Not played yet,Current
35888,2022,Portugal,31 Jan 22,Famalicão,Arouca,Not played yet,Current
35938,2022,Spain,21 Jan 22,Espanyol,Real Betis,Finalized,Current


In [37]:
df.columns

Index(['Year', 'Country', 'Date', 'Competition', 'Home_team', 'Away_team',
       'Status', 'Result', '1x2', 'number_of_games_last_days_home_team',
       'number_of_games_last_days_away_team',
       'points_respective_year_home_team', 'points_respective_year_away_team',
       'points_last_games_home_team', 'points_last_games_away_team',
       'points_between_teams_home_team', 'points_between_teams_away_team',
       'Home_team_ATT', 'Home_team_MID', 'Home_team_DEF', 'Home_team_OVR',
       'Home_team_Rival_team', 'Home_team_Budget_Mill_€', 'Away_team_ATT',
       'Away_team_MID', 'Away_team_DEF', 'Away_team_OVR',
       'Away_team_Rival_team', 'Away_team_Budget_Mill_€', 'Rivals', 'Weekday'],
      dtype='object')

In [38]:
# The train dataset is all the dataset with finalized games

df_train = df[df['Status'] == 'Finalized']

In [39]:
# The columns we use to train the algorithm are the ones we have selected in the 
# previous notebook, when we were looking for the best model.

X_train = df_train[['Year', 'Country', 'Home_team', 'Away_team',
       'number_of_games_last_days_home_team', 'number_of_games_last_days_away_team',
       'points_respective_year_home_team', 'points_respective_year_away_team',
       'points_last_games_home_team', 'points_last_games_away_team',
       'points_between_teams_home_team', 'points_between_teams_away_team',
       'Home_team_OVR', 'Away_team_OVR', 'Rivals']]

y_train = df_train['1x2']

In [40]:
X_train.dtypes

Year                                    int64
Country                                object
Home_team                              object
Away_team                              object
number_of_games_last_days_home_team     int64
number_of_games_last_days_away_team     int64
points_respective_year_home_team        int64
points_respective_year_away_team        int64
points_last_games_home_team             int64
points_last_games_away_team             int64
points_between_teams_home_team          int64
points_between_teams_away_team          int64
Home_team_OVR                           int64
Away_team_OVR                           int64
Rivals                                  int64
dtype: object

In [41]:
# Although the columns "Year" and "Rivals" are of type integer, they are basically
# "tags" and should be considered as text, because its values being higher or lower
# should not be considered better or worse, unlike the other columns of type integer.

X_train_numeric_data = X_train.select_dtypes(include=['int64', 'float64']
                ).drop(['Year','Rivals'],axis=1)

X_train_categorical_data = X_train[list(X_train.select_dtypes(include=['object']).columns) +\
                                      ['Year','Rivals']]

In [42]:
# Let's create a dataframe with only the games we need to predict.

df_predictions = df[(df['Year'] == current_year) & \
                   (df['Weekday'] != 'Other')]

df_predictions

Unnamed: 0,Year,Country,Date,Competition,Home_team,Away_team,Status,Result,1x2,number_of_games_last_days_home_team,...,Home_team_Rival_team,Home_team_Budget_Mill_€,Away_team_ATT,Away_team_MID,Away_team_DEF,Away_team_OVR,Away_team_Rival_team,Away_team_Budget_Mill_€,Rivals,Weekday
35632,2022,Portugal,05 Feb 22,National League,Marítimo,Estoril,Not played yet,-,-,3,...,Santa Clara,3.9,68,70,70,69,Belenenses,2.0,0,Following
35638,2022,Portugal,06 Feb 22,National League,Boavista,Vizela,Not played yet,-,-,4,...,Vitória de Guimarães,4.2,70,70,68,69,Moreirense,1.6,0,Following
35656,2022,Portugal,05 Feb 22,National League,Paços de Ferreira,Portimonense,Not played yet,-,-,3,...,Vitória de Guimarães,3.3,68,72,70,70,Tondela,3.3,0,Following
35681,2022,Portugal,06 Feb 22,National League,Gil Vicente,CD Santa Clara,Not played yet,-,-,2,...,Moreirense,2.9,67,69,69,70,Marítimo,2.3,0,Following
35696,2022,Portugal,30 Jan 22,National League,Portimonense,Tondela,Still playing,0 - 0,x,4,...,Tondela,3.3,70,68,69,69,Arouca,2.1,1,Current
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37665,2022,France,05 Feb 22,National League,Saint-Étienne,Montpellier,Not played yet,-,-,4,...,Olympique Lyonnais,12.0,72,76,73,74,Nîmes Olympique,7.5,0,Following
37678,2022,France,05 Feb 22,National League,Monaco,Olympique Lyonnais,Not played yet,-,-,3,...,OGC Nice,26.0,79,78,76,78,AS Saint-Étienne,50.0,0,Following
37697,2022,France,06 Feb 22,National League,Nice,Clermont,Not played yet,-,-,2,...,AS Monaco,14.5,72,71,73,71,AS Saint-Étienne,4.5,0,Following
37723,2022,France,23 Jan 22,National League,Girondins Bordeaux,Strasbourg,Finalized,4 - 3,1,3,...,Olympique de Marseille,9.0,77,74,74,75,FC Metz,7.5,0,Current


In [43]:
# We just need the columns that are used in the train set.

X_predict = df_predictions[['Year', 'Country', 'Home_team', 'Away_team',
       'number_of_games_last_days_home_team', 'number_of_games_last_days_away_team',
       'points_respective_year_home_team', 'points_respective_year_away_team',
       'points_last_games_home_team', 'points_last_games_away_team',
       'points_between_teams_home_team', 'points_between_teams_away_team',
       'Home_team_OVR', 'Away_team_OVR', 'Rivals']]

In [44]:
# Again, let's consider the columns "Year" and "Rivals" as categorical and not
# numeric.

numeric_columns = X_predict.select_dtypes(include=['int64', 'float64']
        ).drop(['Year','Rivals'],axis=1).columns

categorical_columns = list(X_predict.select_dtypes(include=['object']).columns) + ['Year','Rivals']

In [45]:
%%time

# We will apply here the same pipeline we used when finding the best
# model. It consists in applying the transformer Standard Scaler to
# the numeric columns and OneHotEncoder to the categorical ones.
# Then, apply the best model (LogisticRegression) with the best
# parameters (C=10)

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(C=10))])

model.fit(X_train, y_train)
y_pred = model.predict(X_predict)

y_pred

CPU times: user 1.55 s, sys: 34.7 ms, total: 1.58 s
Wall time: 1.67 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['1', '1', '1', '1', '1', '1', '2', '1', '1', '2', '2', '2', '1',
       '1', '1', 'x', 'x', '2', '1', '1', '1', '1', '2', '1', '1', '1',
       '1', '1', '1', '2', '1', '2', '2', '1', '1', 'x', '1', '1', '1',
       '1', '1', '2', '1', '2', '1', '1', '2', '1', '2', '2', '1', 'x',
       '2', '1', '1', '2', '1', '2', '1', '1', '2', '1', '2', '1', '2',
       '2', '1', '1', '1', 'x', '1', '2', '2', '1', '2', '1', '1', '1',
       '2', '2', '1', 'x', '1', 'x', '1', '1', '2', '2', '2', '1', '2',
       '1', '1', '1', '1', '1', 'x', '1', '1', 'x', '1', 'x', '1', '2',
       '1', '1', '2', '2', '1', '2', '1', 'x', '1', '1', 'x', '2'],
      dtype=object)

# Prepare a dataframe to show the end user the predictions

In [46]:
# Let's check the order of the classification classes

model.classes_

array(['1', '2', 'x'], dtype=object)

In [47]:
# Now let's check the probability of each class, per match

model.predict_proba(X_predict)

array([[0.50728821, 0.21635029, 0.2763615 ],
       [0.41505085, 0.24197595, 0.3429732 ],
       [0.52559353, 0.18236967, 0.2920368 ],
       [0.4507096 , 0.30628458, 0.24300582],
       [0.45356216, 0.24255939, 0.30387845],
       [0.79232694, 0.05249224, 0.15518082],
       [0.09225268, 0.70104404, 0.20670328],
       [0.84865628, 0.04045494, 0.11088878],
       [0.66664959, 0.09342497, 0.23992544],
       [0.07831587, 0.81296573, 0.1087184 ],
       [0.32909501, 0.3824966 , 0.28840839],
       [0.28462276, 0.40098855, 0.31438869],
       [0.4104359 , 0.18667556, 0.40288853],
       [0.36753009, 0.34195043, 0.29051948],
       [0.71024534, 0.12598369, 0.16377096],
       [0.29210485, 0.14529894, 0.56259621],
       [0.25061243, 0.17581331, 0.57357426],
       [0.05328122, 0.82645926, 0.12025952],
       [0.39974009, 0.33059994, 0.26965998],
       [0.54717645, 0.18845672, 0.26436683],
       [0.53239296, 0.17476275, 0.29284428],
       [0.53873998, 0.17363886, 0.28762116],
       [0.

In [48]:
# Finally, let's build a DataFrame with the info we want to show the final user:
# basic info about the game (Year, Country, Weekday, Date, Status, Home_team,
# Away_team and result - for the games that have already finished) and what is
# the prediction of the result (and the probability of each outcome).

list_predictions = []

for i in range(df_predictions.shape[0]):
    list_predictions.append(y_pred[i])


list_probability_1 = []
list_probability_2 = []
list_probability_X = []

for i in range(df_predictions.shape[0]):
    probability_1_i = model.predict_proba(X_predict[i:i+1]).tolist()[0][0]
    probability_1_i = str(round(probability_1_i*100,1)) + '%'
    list_probability_1.append(probability_1_i)

    probability_2_i = model.predict_proba(X_predict[i:i+1]).tolist()[0][1]
    probability_2_i = str(round(probability_2_i*100,1)) + '%'
    list_probability_2.append(probability_2_i)
    
    probability_X_i = model.predict_proba(X_predict[i:i+1]).tolist()[0][2]
    probability_X_i = str(round(probability_X_i*100,1)) + '%'
    list_probability_X.append(probability_X_i)

    
# Let's create the dataframe final_user with the columns predictions and
# probability of each match outcome

df_final_user = df_predictions.copy()
    
df_final_user['Prediction'] = list_predictions
df_final_user['Probability 1'] = list_probability_1
df_final_user['Probability X'] = list_probability_X
df_final_user['Probability 2'] = list_probability_2

# Let's also delete the underscore from the column names Home_team and
# Away_team, so those names look better when showing them to the final user

df_final_user.rename(columns={'Home_team':'Home team','Away_team':'Away team'},
                    inplace=True)

df_final_user = df_final_user[['Year','Country','Weekday','Date',
'Status','Home team','Away team','Result','Prediction', 'Probability 1',
'Probability X', 'Probability 2']]

df_final_user

Unnamed: 0,Year,Country,Weekday,Date,Status,Home team,Away team,Result,Prediction,Probability 1,Probability X,Probability 2
35632,2022,Portugal,Following,05 Feb 22,Not played yet,Marítimo,Estoril,-,1,50.7%,27.6%,21.6%
35638,2022,Portugal,Following,06 Feb 22,Not played yet,Boavista,Vizela,-,1,41.5%,34.3%,24.2%
35656,2022,Portugal,Following,05 Feb 22,Not played yet,Paços de Ferreira,Portimonense,-,1,52.6%,29.2%,18.2%
35681,2022,Portugal,Following,06 Feb 22,Not played yet,Gil Vicente,CD Santa Clara,-,1,45.1%,24.3%,30.6%
35696,2022,Portugal,Current,30 Jan 22,Still playing,Portimonense,Tondela,0 - 0,1,45.4%,30.4%,24.3%
...,...,...,...,...,...,...,...,...,...,...,...,...
37665,2022,France,Following,05 Feb 22,Not played yet,Saint-Étienne,Montpellier,-,x,31.1%,36.1%,32.8%
37678,2022,France,Following,05 Feb 22,Not played yet,Monaco,Olympique Lyonnais,-,1,34.6%,31.1%,34.3%
37697,2022,France,Following,06 Feb 22,Not played yet,Nice,Clermont,-,1,68.9%,18.7%,12.4%
37723,2022,France,Current,23 Jan 22,Finalized,Girondins Bordeaux,Strasbourg,4 - 3,x,34.9%,35.2%,29.9%


In [49]:
# Create a column with the date of the last udpate. This will be used
# to show the user when was the table updated for the last time.

last_update_date = date.today().strftime("%d %b %Y")

df_final_user['Last update date'] = last_update_date

df_final_user

Unnamed: 0,Year,Country,Weekday,Date,Status,Home team,Away team,Result,Prediction,Probability 1,Probability X,Probability 2,Last update date
35632,2022,Portugal,Following,05 Feb 22,Not played yet,Marítimo,Estoril,-,1,50.7%,27.6%,21.6%,30 Jan 2022
35638,2022,Portugal,Following,06 Feb 22,Not played yet,Boavista,Vizela,-,1,41.5%,34.3%,24.2%,30 Jan 2022
35656,2022,Portugal,Following,05 Feb 22,Not played yet,Paços de Ferreira,Portimonense,-,1,52.6%,29.2%,18.2%,30 Jan 2022
35681,2022,Portugal,Following,06 Feb 22,Not played yet,Gil Vicente,CD Santa Clara,-,1,45.1%,24.3%,30.6%,30 Jan 2022
35696,2022,Portugal,Current,30 Jan 22,Still playing,Portimonense,Tondela,0 - 0,1,45.4%,30.4%,24.3%,30 Jan 2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37665,2022,France,Following,05 Feb 22,Not played yet,Saint-Étienne,Montpellier,-,x,31.1%,36.1%,32.8%,30 Jan 2022
37678,2022,France,Following,05 Feb 22,Not played yet,Monaco,Olympique Lyonnais,-,1,34.6%,31.1%,34.3%,30 Jan 2022
37697,2022,France,Following,06 Feb 22,Not played yet,Nice,Clermont,-,1,68.9%,18.7%,12.4%,30 Jan 2022
37723,2022,France,Current,23 Jan 22,Finalized,Girondins Bordeaux,Strasbourg,4 - 3,x,34.9%,35.2%,29.9%,30 Jan 2022


In [50]:
# Let's order matches by date. For taht we need to create a new column
# with the date in a format that python can understand. Then we can
# drop that column,as it does not show value to the user.

datetime_date_list = []

for date in df_final_user['Date']:
    date_object = datetime.strptime(date, "%d %b %y")
    datetime_date_list.append(date_object)
    
df_final_user['Datetime_date_list'] = datetime_date_list

df_final_user.sort_values('Datetime_date_list',inplace=True)

df_final_user.drop('Datetime_date_list',axis=1,inplace=True)

df_final_user

Unnamed: 0,Year,Country,Weekday,Date,Status,Home team,Away team,Result,Prediction,Probability 1,Probability X,Probability 2,Last update date
36470,2022,England,Following,18 Jan 22,Finalized,Brighton & Hove Albion,Chelsea,1 - 1,2,16.4%,33.1%,50.4%,30 Jan 2022
37492,2022,France,Current,21 Jan 22,Finalized,Olympique Lyonnais,Saint-Étienne,1 - 0,1,60.2%,25.0%,14.7%,30 Jan 2022
37024,2022,Italy,Current,21 Jan 22,Finalized,Hellas Verona,Bologna,2 - 1,1,44.5%,29.0%,26.5%,30 Jan 2022
36656,2022,England,Current,21 Jan 22,Finalized,Watford,Norwich City,0 - 3,1,49.2%,31.5%,19.3%,30 Jan 2022
35938,2022,Spain,Current,21 Jan 22,Finalized,Espanyol,Real Betis,1 - 4,1,40.0%,27.0%,33.1%,30 Jan 2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36625,2022,England,Following,09 Feb 22,Not played yet,Tottenham Hotspur,Southampton,-,1,60.4%,23.0%,16.5%,30 Jan 2022
36505,2022,England,Following,09 Feb 22,Not played yet,Aston Villa,Leeds United,-,2,33.0%,16.3%,50.7%,30 Jan 2022
36436,2022,England,Following,09 Feb 22,Not played yet,Man. City,Brentford,-,1,80.7%,12.8%,6.6%,30 Jan 2022
36629,2022,England,Following,10 Feb 22,Not played yet,Wolves,Arsenal,-,2,31.2%,25.2%,43.6%,30 Jan 2022


# Save the dataframe in the Cloud

In [51]:
# First we save it to the local machine
df_final_user.to_csv('Table_final_user.csv', index=False)

# Then, we establish connection to the bucket in Google Cloud Storage service.
# For this, I created first a service account key and have downloaded it to
# my local machine. After that, I read the file in order to access the bucket.
path_to_token = 'service_account_key_tfm.json'
storage_credentials = service_account.Credentials.from_service_account_file(path_to_token)
storage_client = storage.Client(project='tiago-project', credentials = storage_credentials)
destination_bucket = storage_client.bucket('tiago-tfm-kschool')

# Once I have access to the bucket, I can upload there the CSV with the dataframe.
# This dataframe will later be read by the streamlit app I have created in order to
# show the user the predictions of the results.
blob = destination_bucket.blob('Table_final_user.csv')
blob.upload_from_filename('Table_final_user.csv')