In [38]:
# import jtplot module in notebook
from jupyterthemes import jtplot

# Style matplotlib figures to match a jupyterthemes notebook theme.
# Theme and grid are set in ONE call: each jtplot.style() call re-applies the
# whole plotting style from its own arguments, so a second call that leaves out
# `theme` would no longer use the theme picked here.
# Available themes:
# onedork | grade3 | oceans16 | chesterish | monokai | solarizedl | solarizedd
# grid=False turns off the axis grid lines (default=True)
jtplot.style(theme='chesterish', grid=False)

# reset default matplotlib rcParams
#jtplot.reset()

In [39]:
## Load data set
# Opening this notebook already set the working directory, so the relative
# CSV path below needs no further setup.

# pandas supplies the DataFrame used for the rest of the analysis
import pandas as pd

# Switch off pandas' chained-assignment warning (default='warn')
pd.options.mode.chained_assignment = None

# Read the match results (',' is read_csv's default separator) and
# discard the two location columns
df = pd.read_csv('results.csv').drop(labels=['city', 'country'], axis='columns')
#.set_index('date')

# Preview the first few rows
df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,False
1,1873-03-08,England,Scotland,4,2,Friendly,False
2,1874-03-07,Scotland,England,2,1,Friendly,False
3,1875-03-06,England,Scotland,2,2,Friendly,False
4,1876-03-04,Scotland,England,3,0,Friendly,False


In [40]:
## Inspect data
# Show the dtype pandas assigned to each column of df
df.dtypes

date          object
home_team     object
away_team     object
home_score     int64
away_score     int64
tournament    object
neutral         bool
dtype: object

In [None]:
## Inspect data
# Count missing values (NaN) per column; sum() works down the rows by default
df.isnull().sum()

date          0
home_team     0
away_team     0
home_score    0
away_score    0
tournament    0
neutral       0
dtype: int64

In [None]:
## Subset data
# Keep only the matches in which Germany played, as home or as away side.
played_by_germany = (df['home_team'] == 'Germany') | (df['away_team'] == 'Germany')

# .copy() gives df_ger its own data: the columns added to it in later cells are
# then written to an independent frame instead of to a slice of df.
df_ger = df[played_by_germany].copy()

df_ger.head()

In [None]:
## Recodes and Data Manipulation
# Constant column: every row is one match, so summing it counts matches
df_ger = df_ger.assign(match=1)

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# Conditional recode (goals scored by Germany):
# take home_score where Germany is the home team, otherwise away_score.
# Series.where picks between the two integer columns in one step, so the result
# stays an integer column instead of passing through NaN (which forces floats).
germany_is_home = df_ger['home_team'] == 'Germany'
df_ger['goals'] = df_ger['home_score'].where(germany_is_home, df_ger['away_score'])

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# Conditional recode (goals conceded by Germany):
# take away_score where Germany is the home team, otherwise home_score.
# Series.where picks between the two integer columns in one step, so the result
# stays an integer column instead of passing through NaN (which forces floats).
germany_is_home = df_ger['home_team'] == 'Germany'
df_ger['conceded'] = df_ger['away_score'].where(germany_is_home, df_ger['home_score'])

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# Simple recode (goal differential = goals scored minus goals conceded)
df_ger['goaldiff'] = df_ger['goals'].sub(df_ger['conceded'])

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# Dummy recode: True for friendlies, False for every other tournament value
df_ger['friendly'] = df_ger['tournament'].eq('Friendly')

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# Categorical recode (home vs away vs neutral matches)
# Build the three row masks first; none of them reads the 'home' column.
listed_as_home = df_ger['home_team'].eq('Germany')
listed_as_away = df_ger['away_team'].eq('Germany')
on_neutral_ground = df_ger['neutral'].eq(True)

# Order matters: the neutral flag is applied last and overrides home/away.
df_ger.loc[listed_as_home, 'home'] = 'home'
df_ger.loc[listed_as_away, 'home'] = 'away'
df_ger.loc[on_neutral_ground, 'home'] = 'neutral'

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# Conditional recode (opponent): the team in the match that is not Germany.
# Fallback value: the away team, blanked out (NaN) where it is Germany itself.
away_side_if_not_germany = df_ger['away_team'].where(df_ger['away_team'] != 'Germany')

# Use the home team unless it is Germany; in that case take the fallback.
df_ger['opponent'] = df_ger['home_team'].where(
    df_ger['home_team'] != 'Germany',
    away_side_if_not_germany,
)

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# String variable: the year is the first four characters of the date text
df_ger['year'] = df_ger['date'].str.slice(stop=4)

df_ger.tail()

In [None]:
## Recodes and Data Manipulation
# Remove the raw columns that the recoded variables above were derived from
redundant_columns = ['home_score', 'away_score', 'tournament', 'neutral', 'home_team', 'away_team']
df_ger = df_ger.drop(labels=redundant_columns, axis='columns')

df_ger.tail()

In [None]:
## Descriptive Statistics
# Cross-tabulation: match counts by venue (rows) and friendly flag (columns).
# The boolean column labels are renamed by value rather than by position, so the
# labels stay correct (and the cell does not fail) even if only one of
# False/True occurs in the data.
tabHome = pd.crosstab(df_ger['home'], df_ger['friendly']).rename(
    columns={False: 'competitive', True: 'friendly'}
)
# Drop the leftover axis title ('friendly') that crosstab puts above the columns
tabHome.columns.name = None

tabHome

In [None]:
## Descriptive Statistics
# Unweighted proportions: each cell as a percentage of its column total
column_totals = tabHome.sum(axis=0)
propHome = tabHome.div(column_totals, axis='columns').mul(100).round(2)

propHome

In [None]:
## Descriptive Statistics
# Goal difference table: mean goals scored, conceded and goal differential
# for each venue / friendly combination.
# `values` is spelled out because, without it, pivot_table tries to average
# every remaining column, including the text columns (date, opponent, year),
# which cannot be averaged, and the constant `match` column, whose mean is always 1.
goaldiffTable = pd.pivot_table(
    df_ger,
    index=['home', 'friendly'],
    values=['goals', 'conceded', 'goaldiff'],
    aggfunc='mean',
).round(2)

goaldiffTable

In [None]:
## Descriptive Statistics
# Opponent table: mean and sum of goal differential and match count for three
# selected opponents.
# NOTE(review): the names must match the data set's spelling exactly - verify.
selected_opponents = ['Mexico', 'Sweden', 'Korea Republic']
threeOpponents = df_ger.loc[df_ger['opponent'].isin(selected_opponents)]

# aggfunc is given as a {column: [functions]} dict. The previous set literal
# {'mean', 'sum'} has no defined order and is not one of the documented aggfunc
# forms; the dict keeps the statistics in a fixed order under each column.
summary_stats = ['mean', 'sum']
opponentTable = pd.pivot_table(
    threeOpponents,
    index=['opponent'],
    values=['goaldiff', 'match'],
    aggfunc={'goaldiff': summary_stats, 'match': summary_stats},
).round(2)

opponentTable

In [None]:
## Graphs
# Histogram of the goal differential over all Germany matches
import matplotlib.pyplot as plt

# Draw on an explicit Axes object rather than through pyplot's implicit state
fig, ax = plt.subplots()
ax.hist(df_ger['goaldiff'], bins='auto', facecolor='blue', alpha=0.5)
ax.set_title('Histogram of Goal Differences')

plt.show()

In [None]:
##Graphs
#Someone attempted to replicate ggplot2 for python! (kindof works)
!pip install ggplot
from ggplot import *

ggplot(df_ger, aes('goaldiff', fill='friendly')) +\
    geom_histogram(binwidth=.75, position="dodge") +\
    ylab("Count") + xlab("Goal Differential")

In [None]:
## Graphs
# Line chart: mean goal differential per year

plt.figure(figsize=(12,8))

df_goaldiff_byyear = pd.pivot_table(df_ger, values='goaldiff', index=['year'], aggfunc='mean').reset_index()

# `year` was cut out of the date text, so it holds strings. Plotting strings puts
# the line on a categorical axis (positions 0, 1, 2, ...) where the integer year
# ticks below would not line up, so the years are converted to integers here.
plt.plot(df_goaldiff_byyear['year'].astype(int), df_goaldiff_byyear['goaldiff'], color='blue')

year_ticks = [1908, 1938, 1968, 1998, 2018]
plt.xticks(year_ticks, year_ticks)
plt.xlabel('Year')
plt.ylabel('Mean goal differential per match')
plt.suptitle('Goal Differential by Year', fontsize=18)

plt.show()

In [None]:
## Graphs
# Line chart: mean goals scored per year
plt.figure(figsize=(12,8))

df_goals_byyear = pd.pivot_table(df_ger, values='goals', index=['year'], aggfunc='mean').reset_index()

# `year` was cut out of the date text, so it holds strings. Plotting strings puts
# the line on a categorical axis (positions 0, 1, 2, ...) where the integer year
# ticks below would not line up, so the years are converted to integers here.
plt.plot(df_goals_byyear['year'].astype(int), df_goals_byyear['goals'], color='green')

year_ticks = [1908, 1938, 1968, 1998, 2018]
plt.xticks(year_ticks, year_ticks)
plt.xlabel('Year')
plt.ylabel('Mean goals scored per match')
plt.suptitle('Goals Scored by Year', fontsize=18)

plt.show()

In [None]:
## Graphs
# Line chart: mean goals conceded per year
plt.figure(figsize=(12,8))

df_conceded_byyear = pd.pivot_table(df_ger, values='conceded', index=['year'], aggfunc='mean').reset_index()

# `year` was cut out of the date text, so it holds strings. Plotting strings puts
# the line on a categorical axis (positions 0, 1, 2, ...) where the integer year
# ticks below would not line up, so the years are converted to integers here.
plt.plot(df_conceded_byyear['year'].astype(int), df_conceded_byyear['conceded'], color='red')

year_ticks = [1908, 1938, 1968, 1998, 2018]
plt.xticks(year_ticks, year_ticks)
plt.xlabel('Year')
plt.ylabel('Mean goals conceded per match')
plt.suptitle('Goals Conceded by Year', fontsize=18)

plt.show()

In [None]:
## Graphs
# Overlay chart: goals scored, goals conceded (mirrored below zero) and the
# resulting goal differential, all as yearly means on one set of axes.
import matplotlib.ticker as ticker
plt.figure(figsize=(12,8))

# Negate conceded goals so they are drawn below the zero line
df_conceded_byyear['conceded_neg'] = df_conceded_byyear['conceded']*-1

# Faint zero reference line (white, so it is meant for a dark plot background)
plt.axhline(0, color='white', alpha=0.2)

# The per-year tables carry `year` as text unless it was already converted;
# astype(int) puts all three lines on a numeric axis so that the integer year
# ticks below line up with the data. It is a no-op for integer years.
goals = plt.plot(df_goals_byyear['year'].astype(int), df_goals_byyear['goals'], color='green', alpha=0.4, label = 'Goals Scored')
conceded = plt.plot(df_conceded_byyear['year'].astype(int), df_conceded_byyear['conceded_neg'], color='red', alpha=0.4, label = 'Goals Conceded')
goaldiff = plt.plot(df_goaldiff_byyear['year'].astype(int), df_goaldiff_byyear['goaldiff'], color='blue', label = 'Goal Differential')

year_ticks = [1908, 1938, 1968, 1998, 2018]
plt.xticks(year_ticks, year_ticks)
plt.xlabel('Year')
plt.ylabel('Mean goals per match (conceded shown as negative)')
plt.suptitle('Goal Differential by Year', fontsize=18)
plt.title('Overlay Chart', fontsize=14)
plt.legend()

plt.show()