In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the international football results CSV into a DataFrame and preview the first rows.
RESULTS_CSV = '/kaggle/input/international-football-results-from-1872-to-2017/results.csv'
matches = pd.read_csv(RESULTS_CSV)
matches.head()

In [None]:
#Create derived columns/data
matches['tot_goals'] = matches['home_score'] + matches['away_score']
matches['diff_goals'] = matches['home_score'] - matches['away_score']  # home minus away

# Parse the date once; the calendar fields below reuse the parsed column
# instead of calling pd.to_datetime again on data that is already datetime.
matches['date'] = pd.to_datetime(matches['date'], format='%Y-%m-%d')
matches['year_month'] = matches['date'].dt.strftime('%Y-%m')
matches['year'] = matches['date'].dt.year
matches['month'] = matches['date'].dt.month

# Outcome encoding stored in 'result': 0 = draw, 1 = home win, 2 = away win.
con = [
    matches['home_score'] == matches['away_score'],
    matches['home_score'] > matches['away_score'],
    matches['home_score'] < matches['away_score'],
]
winner_cond_index = [0, 1, 2]
# np.nan (lowercase) is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
winner_expression = [np.nan, matches['home_team'], matches['away_team']]
# NOTE(review): rows matching none of the conditions (e.g. a missing score) get
# np.select's default value of 0 in both columns -- confirm the data has no such rows.
matches['result'] = np.select(con, winner_cond_index)
matches['winner'] = np.select(con, winner_expression)

# Normalise two country names, then title-case the location columns.
matches['country'] = matches['country'].replace({
    'United States': 'United States of America',
    'Ivory Coast': "Côte D'Ivoire",
})
matches['country'] = matches['country'].str.title()
matches['city'] = matches['city'].str.title()

matches.head()

In [None]:
# Restrict the data to FIFA World Cup matches whose year is after 1979 and before 2021.
is_world_cup = matches['tournament'] == 'FIFA World Cup'
in_period = matches['year'].gt(1979) & matches['year'].lt(2021)
matches = matches[is_world_cup & in_period]
print('matches',matches.shape)
matches.head()

In [None]:
# Select matches involving at least one team that has appeared in a FIFA World Cup.
# (This step only changes anything if the tournament filter in the previous cell is widened.)
world_cup_teams = matches.loc[matches['tournament'] == 'FIFA World Cup']

# Every team that shows up on either side of a World Cup match.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
unique_teams_world_cup = pd.concat(
    [world_cup_teams['home_team'], world_cup_teams['away_team']]
).unique()

mm = matches[
    matches['home_team'].isin(unique_teams_world_cup)
    | matches['away_team'].isin(unique_teams_world_cup)
]
print('matches', mm.shape)
mm.head()

In [None]:
# Build one row per World Cup team with the number of matches played as the
# home side, as the away side, and in total.
grouped_cols = ['home_score','away_score','tournament','tot_goals','diff_goals']
hg = mm.groupby('home_team')[grouped_cols]
ag = mm.groupby('away_team')[grouped_cols]

# Matches per team on each side; a team absent from one side counts as 0 there.
home_counts = hg.size().rename('h_matches')
away_counts = ag.size().rename('a_matches')
teams = pd.concat([home_counts, away_counts], axis=1).fillna(0)

# Keep only the teams collected from the World Cup matches, in that order.
teams = teams.loc[unique_teams_world_cup]
teams['t_matches'] = teams[['h_matches', 'a_matches']].sum(axis=1)
print('teams',teams.shape)
teams.head()

In [None]:
# Merge per-team goal sums from home and away matches into `teams`.
# numeric_only=True keeps the text column 'tournament' (part of the hg/ag
# selection) out of the sums: pandas >= 2.0 no longer drops non-numeric columns
# by default, which would add a concatenated-string 'tournament' column to both
# summaries and produce 'tournament_x' / 'tournament_y' after the second merge.
h_sum = hg.sum(numeric_only=True).rename(columns={
    'home_score': 'h_scored',    # goals the team scored at home
    'away_score': 'h_against',   # goals conceded at home
    'tot_goals': 'h_tot',
    'diff_goals': 'h_bal',       # home minus away, i.e. the team's own balance
})
teams = pd.merge(teams, h_sum, left_index=True, right_index=True, how='left')

a_sum = ag.sum(numeric_only=True).rename(columns={
    'home_score': 'a_against',   # goals conceded when playing away
    'away_score': 'a_scored',    # goals the team scored when playing away
    'tot_goals': 'a_tot',
    'diff_goals': 'a_bal',       # still home minus away, i.e. the opponent's balance
})
teams = pd.merge(teams, a_sum, left_index=True, right_index=True, how='left')
teams.head()

In [None]:
# Calculate overall (home + away) totals for each team.
# The home/away sums were attached with left merges, so a team that never appears
# on one side has NaN there. fill_value=0 treats that missing side as zero instead
# of turning the whole total into NaN.
teams['t_scored'] = teams['h_scored'].add(teams['a_scored'], fill_value=0)
teams['t_against'] = teams['h_against'].add(teams['a_against'], fill_value=0)
teams['t_tot'] = teams['h_tot'].add(teams['a_tot'], fill_value=0)
# a_bal is home-minus-away from the opponent's point of view, so it is subtracted.
teams['t_bal'] = teams['h_bal'].sub(teams['a_bal'], fill_value=0)
teams.head()

In [None]:
# Add wins, losses and draws when playing at home.
# 'result' codes: 0 = draw, 1 = home win, 2 = away win.
home_results = (
    mm.groupby(['home_team', 'result']).size()
    .unstack(fill_value=0)
    # Guarantee all three outcome columns exist, then label them by code rather
    # than by position so a missing outcome cannot shift or break the names.
    .reindex(columns=[0, 1, 2], fill_value=0)
    .rename(columns={0: 'h_draw', 1: 'h_win', 2: 'h_loss'})
    .rename_axis(columns=None)
    # Teams with no home matches get explicit zeros instead of NaN after the merge.
    .reindex(teams.index, fill_value=0)
)
teams = pd.merge(teams, home_results, left_index=True, right_index=True, how='left')
teams.head()

In [None]:
# Add wins, losses and draws when playing as the away side.
# 'result' codes: 0 = draw, 1 = home win (an away loss), 2 = away win.
away_results = (
    mm.groupby(['away_team', 'result']).size()
    .unstack(fill_value=0)
    # Guarantee all three outcome columns exist, then label them by code rather
    # than by position so a missing outcome cannot shift or break the names.
    .reindex(columns=[0, 1, 2], fill_value=0)
    .rename(columns={0: 'a_draw', 1: 'a_loss', 2: 'a_win'})
    .rename_axis(columns=None)
    # Teams with no away matches get explicit zeros instead of NaN after the merge.
    .reindex(teams.index, fill_value=0)
)
teams = pd.merge(teams, away_results, left_index=True, right_index=True, how='left')
teams.head()

In [None]:
# Calculate overall win/draw/loss counts, points, and per-match averages.
# fill_value=0 keeps a total defined when one side (home or away) is missing (NaN).
teams['t_draw'] = teams['h_draw'].add(teams['a_draw'], fill_value=0)
teams['t_win'] = teams['h_win'].add(teams['a_win'], fill_value=0)
teams['t_loss'] = teams['h_loss'].add(teams['a_loss'], fill_value=0)

# Points: 3 per win, 1 per draw.
teams['points'] = teams['t_draw'] + teams['t_win'] * 3

# Per-match indicators, each rounded to 2 decimals: new column -> total it is derived from.
per_match = {
    'm_bal': 't_bal',
    'm_points': 'points',
    'w_rate': 't_win',
    'd_rate': 't_draw',
    'l_rate': 't_loss',
    'm_scored': 't_scored',
}
for rate_col, total_col in per_match.items():
    teams[rate_col] = (teams[total_col] / teams['t_matches']).round(2)

# The former `teams.round({'m_scored': 2, 'win_rate': 2})` call was dropped: its
# result was discarded, it referenced a non-existent 'win_rate' column, and the
# values above are already rounded.
print('teams', teams.shape)
teams.head()

In [None]:
# Keep only the overall indicators, in presentation order, and show them sorted by team.
total_columns = [
    't_matches', 't_scored', 't_against', 't_bal', 't_tot',
    't_win', 't_draw', 't_loss', 'points',
    'w_rate', 'd_rate', 'l_rate',
    'm_scored', 'm_points', 'm_bal',
]
totals = teams.reindex(columns=total_columns)
totals.sort_index()

In [None]:
# Rank the ten leading teams for each indicator and gather them in one table.
def _ten_best(column, fewest=False):
    """Return ten (index label, value) pairs taken from ``totals[column]``.

    The ten largest values are used by default; with ``fewest=True`` the ten
    smallest values are used instead.
    """
    values = totals[column]
    leaders = values.nsmallest(10) if fewest else values.nlargest(10)
    return list(leaders.items())


s01 = _ten_best('t_matches')
s02 = _ten_best('t_scored')
s03 = _ten_best('t_against', fewest=True)  # fewer goals conceded ranks higher
s04 = _ten_best('points')
s05 = _ten_best('t_bal')
s06 = _ten_best('t_win')
s07 = _ten_best('w_rate')
s08 = _ten_best('m_points')
s09 = _ten_best('m_bal')
s10 = _ten_best('m_scored')

# One row per indicator; each cell holds one (label, value) pair, best first.
ranking_rows = [s01, s02, s03, s04, s05, s06, s07, s08, s09, s10]
ranking_labels = [
    'matches', 'g_scored', 'g_against', 'points', 'bal',
    'wins', 'win_rate', 'avg_points', 'avg_bal', 'avg_scored',
]
stats = pd.DataFrame(ranking_rows, index=ranking_labels)
stats