**Project context** <br>This notebook is part of the project "Bookmakerspy", a project that aims at predicting football match results in the context of the English Premier League (based on data from 2014 to 2018) and ultimately at beating bookmakers' odds.<br>
The current notebook "bookmakerspy_data_collection" is the first in a series of 3 notebooks. It is followed by "bookmakerspy_data_preprocessing" and "bookmakerspy_modelisation".

**Information about the notebook**<br>
This notebook is intended to be run on Google Colab, and aims at collecting data from the following sources: https://www.kaggle.com/shubhmamp/english-premier-league-match-data and https://datahub.io/sports-data/english-premier-league in order to create a dataframe containing English Premier League game statistics and player statistics and the corresponding bookmakers' odds.<br>
The Kaggle dataset is available in JSON format and contains game and player statistics between 2014 and 2018. The datahub dataset makes it possible to retrieve the bookmakers' odds for the same games.<br>

**Notebook goal**<br>
Running the notebook creates an intermediate Google Drive folder containing the relevant data and performs minor pre-processing tasks. The data is then assembled into a dataset that can be processed further in the exploration, pre-processing and modelling steps.

In [None]:
# Mount Google Drive under /content/drive; the Kaggle archive is later unzipped into a folder of this Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Upload your personal kaggle.json containing your personal token info. This file can be retrieved via your personal Kaggle account (more info: https://www.kaggle.com/docs/api#authentication)
# The result is bound to a variable instead of being left as the cell's last expression,
# so that the uploaded content (the API token) is not echoed into the notebook output.
from google.colab import files
uploaded_files = files.upload()

In [None]:
# Create Kaggle folder
! mkdir ~/.kaggle

# Copy kaggle.json into kaggle folder
! cp kaggle.json ~/.kaggle/

# Modify permissions for kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download Kaggle data https://www.kaggle.com/shubhmamp/english-premier-league-match-data
# The archive english-premier-league-match-data.zip is expected in the current working directory,
# from where a later cell unzips it.
! kaggle datasets download -d shubhmamp/english-premier-league-match-data

In [None]:
# Creation of a "dataset" folder in Google Drive and unzip Kaggle data into this folder
! mkdir '/content/drive/My Drive/dataset'
! unzip english-premier-league-match-data.zip -d '/content/drive/My Drive/dataset'

In [None]:
import json
import pandas as pd

# Match team stats data

In [None]:
# Retrieving the files containing statistics (one season_stats.json per season folder)
def _load_season_stats(season_folder):
    """Return the parsed season_stats.json of one season folder (e.g. 'season14-15').

    The file is opened in a ``with`` block so that the handle is closed as soon
    as the content has been parsed.
    """
    path = '/content/drive/My Drive/dataset/datafilev2/datafile/' + season_folder + '/season_stats.json'
    with open(path) as stats_file:
        return json.load(stats_file)


team_stats_14_15_json = _load_season_stats('season14-15')
team_stats_15_16_json = _load_season_stats('season15-16')
team_stats_16_17_json = _load_season_stats('season16-17')
team_stats_17_18_json = _load_season_stats('season17-18')

In [None]:
def team_stats(json, teamloc):
    """Build the per-match statistics table of one side (home or away).

    Parameters
    ----------
    json : dict
        Parsed season statistics: maps each match id to a mapping whose first
        value holds the home team data and whose second value holds the away
        team data. Each team entry must provide the 'team_details' and
        'aggregate_stats' mappings.
        (The name shadows the ``json`` module inside this function; it is kept
        unchanged so that existing callers keep working.)
    teamloc : str
        'home' or 'away'.

    Returns
    -------
    pandas.DataFrame
        One row per match, sorted by date then match id, with a fresh integer
        index. 'date' is converted to datetime (day first), 'team_rating' and
        every column from the sixth one onwards are converted to float.
        Statistics missing for a match are NaN.

    Raises
    ------
    ValueError
        If ``teamloc`` is neither 'home' nor 'away'.
    """
    # Home team data are stored at position 0, away team data at position 1
    team_positions = {'home': 0, 'away': 1}
    if teamloc not in team_positions:
        raise ValueError(f"teamloc must be 'home' or 'away', got {teamloc!r}")
    teamidx = team_positions[teamloc]

    # One flat record per match; the dataframe is built once at the end rather
    # than being enlarged cell by cell, which is far slower on full seasons.
    records = []
    for match_id, infos_match in json.items():
        team = dict(list(infos_match.values())[teamidx])
        record = {'match_id': match_id}
        record.update(team['team_details'])
        record.update(team['aggregate_stats'])
        records.append(record)

    # Columns appear in order of first occurrence across the records
    stats = pd.DataFrame(records)

    stats['date'] = pd.to_datetime(stats['date'], dayfirst=True)
    stats = stats.sort_values(by=['date', 'match_id']).reset_index(drop=True)

    stats['team_rating'] = stats['team_rating'].astype(float)

    # The first five columns are match_id plus the leading team_details fields;
    # assumes team_details holds exactly four fields, so that everything from
    # position 5 onwards is an aggregate statistic — TODO confirm against the data.
    for column in stats.columns[5:]:
        stats[column] = stats[column].astype(float)

    return stats

In [None]:
# Raw season data keyed by the season label used throughout the notebook
season_jsons = {
    'season_14_15': team_stats_14_15_json,
    'season_15_16': team_stats_15_16_json,
    'season_16_17': team_stats_16_17_json,
    'season_17_18': team_stats_17_18_json,
}

# One prefixed statistics table per season, for the home side then for the away side
team_stats_home = {
    season: team_stats(season_json, 'home').add_prefix('home_')
    for season, season_json in season_jsons.items()
}

team_stats_away = {
    season: team_stats(season_json, 'away').add_prefix('away_')
    for season, season_json in season_jsons.items()
}

In [None]:
# Add season info (only on the home tables: after the home/away merge a single 'season' column is enough)
season_labels = {
    'season_14_15': '2014_2015',
    'season_15_16': '2015_2016',
    'season_16_17': '2016_2017',
    'season_17_18': '2017_2018',
}
for season_key, season_label in season_labels.items():
    team_stats_home[season_key]['season'] = season_label

In [None]:
# Seasons are stacked in chronological order
season_order = ['season_14_15', 'season_15_16', 'season_16_17', 'season_17_18']

# Concatenation home data
df_home = pd.concat([team_stats_home[season] for season in season_order])
# Concatenation away data
df_away = pd.concat([team_stats_away[season] for season in season_order])


In [None]:
# Preview the first rows of the concatenated home-team statistics
df_home.head()

In [None]:
# Preview the first rows of the concatenated away-team statistics
df_away.head()

In [None]:
# Put the home and away statistics of each match on a single row, matched on the match id
df_merge = pd.merge(
    df_home,
    df_away,
    left_on='home_match_id',
    right_on='away_match_id',
)

In [None]:
# The home-side key and date become the match-level columns; their away-side duplicates
# and the goals columns are then removed
df_merge = (
    df_merge
    .rename(columns={"home_match_id": "match_id", "home_date": "date"})
    .drop(columns=['away_match_id', 'away_date', 'home_goals', 'away_goals'])
)

In [None]:
# Summary of df_merge after the home/away merge
df_merge.info()

In [None]:
# Even though this is already a pre-processing step, all NaNs are set to 0 here,
# because in this first part of the dataframe a NaN is equivalent to 0.
df_merge = df_merge.fillna(0)

In [None]:
# match_id is converted to an integer key
df_merge = df_merge.astype({'match_id': int})

# Match players stats data

For each match, we retrieve every player's match statistics in order to compute an average rating per player position, which gives a more refined indicator than the overall team rating.

In [None]:
def players_stats(json, teamloc):
    """Build the per-player, per-match statistics table of one side (home or away).

    Parameters
    ----------
    json : dict
        Parsed season statistics: maps each match id to a mapping whose first
        value holds the home team data and whose second value holds the away
        team data. Each team entry must provide a 'Player_stats' mapping whose
        values contain the 'player_details' and 'Match_stats' mappings.
        (The name shadows the ``json`` module inside this function; it is kept
        unchanged so that existing callers keep working.)
    teamloc : str
        'home' or 'away'.

    Returns
    -------
    pandas.DataFrame
        One row per player and match, sorted by match id, with a fresh integer
        index. The 'season' column is created empty (None). Every column from
        the seventh one onwards is converted to float; statistics missing for
        a player are NaN.

    Raises
    ------
    ValueError
        If ``teamloc`` is neither 'home' nor 'away'.
    """
    # Home team data are stored at position 0, away team data at position 1
    team_positions = {'home': 0, 'away': 1}
    if teamloc not in team_positions:
        raise ValueError(f"teamloc must be 'home' or 'away', got {teamloc!r}")
    team_idx = team_positions[teamloc]

    # One flat record per player appearance; the dataframe is built once at the
    # end rather than being enlarged cell by cell, which is far slower on full seasons.
    records = []
    for match_id, infos_match in json.items():
        team = dict(list(infos_match.values())[team_idx])
        for player in team['Player_stats'].values():
            record = {'season': None, 'match_id': match_id}
            record.update(player['player_details'])
            record.update(player['Match_stats'])
            records.append(record)

    # Columns appear in order of first occurrence across the records
    stats = pd.DataFrame(records)
    stats = stats.sort_values(by=['match_id']).reset_index(drop=True)

    # The first six columns are season, match_id and the leading player_details
    # fields; presumably those are the non-numeric identifiers and everything
    # from position 6 onwards is numeric — TODO confirm against the data.
    for column in stats.columns[6:]:
        stats[column] = stats[column].astype(float)

    return stats

In [None]:
# Raw season data keyed by the season label used throughout the notebook
season_jsons = {
    'season_14_15': team_stats_14_15_json,
    'season_15_16': team_stats_15_16_json,
    'season_16_17': team_stats_16_17_json,
    'season_17_18': team_stats_17_18_json,
}

# One prefixed player statistics table per season, for the home side then for the away side
players_stats_home = {
    season: players_stats(season_json, 'home').add_prefix('home_')
    for season, season_json in season_jsons.items()
}

players_stats_away = {
    season: players_stats(season_json, 'away').add_prefix('away_')
    for season, season_json in season_jsons.items()
}

In [None]:
# Seasons are stacked in chronological order
season_order = ['season_14_15', 'season_15_16', 'season_16_17', 'season_17_18']

# concatenation for home team data
df_players_home = pd.concat([players_stats_home[season] for season in season_order])

# concatenation for away team data
df_players_away = pd.concat([players_stats_away[season] for season in season_order])

In [None]:
# Preview the first rows of the concatenated home-team player statistics
df_players_home.head()

In [None]:
# Preview the first rows of the concatenated away-team player statistics
df_players_away.head()

In [None]:
# Only players with a strictly positive rating are kept
home_is_rated = df_players_home['home_player_rating'] > 0
away_is_rated = df_players_away['away_player_rating'] > 0
df_players_home_rated = df_players_home[home_is_rated]
df_players_away_rated = df_players_away[away_is_rated]

In [None]:
# Both sides get the same 'match_id' column name, used as the merge key at a later stage
df_players_home_rated = df_players_home_rated.rename({"home_match_id": "match_id"}, axis='columns')
df_players_away_rated = df_players_away_rated.rename({"away_match_id": "match_id"}, axis='columns')

In [None]:
# Grouping players by position: the position codes are translated into readable labels.
# The labelled column is added with assign(), which returns a new dataframe, instead of
# assigning to / replacing in place on a filtered slice (chained assignment: it raises
# SettingWithCopyWarning and has no effect when pandas copy-on-write is enabled).
position_labels = {'1': 'Goalkeeper', '2': 'Defender', '3': 'Midfielder', '4': 'Forward', '5': 'Substitute'}

df_players_home_rated = df_players_home_rated.assign(
    home_player_position=df_players_home_rated['home_player_position_value'].replace(position_labels)
)

df_players_away_rated = df_players_away_rated.assign(
    away_player_position=df_players_away_rated['away_player_position_value'].replace(position_labels)
)

In [None]:
def position_rating(teamloc, players=None):
    """Compute, for each match, the mean player rating per position for one side.

    Parameters
    ----------
    teamloc : str
        'home' or 'away'.
    players : pandas.DataFrame, optional
        Rated players of that side, with the columns 'match_id',
        '<teamloc>_player_position' and '<teamloc>_player_rating'. When omitted,
        the notebook-level ``df_players_home_rated`` / ``df_players_away_rated``
        dataframe matching ``teamloc`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per match id, with a 'match_id' column followed by one
        '<position>_<teamloc>_player_rating' column for each of goalkeeper,
        defender, midfielder, forward and substitute (always present, in that
        order). A position with no rated player in a match is NaN. Rows are
        ordered by match id.

    Raises
    ------
    ValueError
        If ``teamloc`` is neither 'home' nor 'away'.
    """
    position_list = ['Goalkeeper', 'Defender', 'Midfielder', 'Forward', 'Substitute']

    if teamloc not in ('home', 'away'):
        raise ValueError(f"teamloc must be 'home' or 'away', got {teamloc!r}")
    if players is None:
        players = df_players_home_rated if teamloc == 'home' else df_players_away_rated

    position_column = teamloc + '_player_position'
    rating_column = teamloc + '_player_rating'

    # The rating column is selected before averaging: averaging the whole
    # dataframe first would also try to average text columns such as player
    # names, which raises in recent pandas versions.
    mean_ratings = players.groupby(['match_id', position_column])[rating_column].mean()

    # One column per position; reindex guarantees every expected position
    # column exists (all NaN if that position never occurs) and ignores any
    # position label outside the list.
    output = mean_ratings.unstack(position_column).reindex(columns=position_list)
    output.columns = [position.lower() + '_' + teamloc + '_player_rating' for position in position_list]

    return output.reset_index()

# NOTE(review): df_position_home is not referenced anywhere else in this notebook and repeats the
# position_rating('home') computation done in the following cell — presumably a leftover; confirm before removing.
df_position_home = position_rating('home')

In [None]:
# Mean rating per position and match, computed for the home side then for the away side
df_position_home_rating, df_position_away_rating = (
    position_rating(side) for side in ('home', 'away')
)

In [None]:
# Preview the first rows of the per-position ratings of the home side
df_position_home_rating.head()

In [None]:
# Summary of df_merge before the per-position ratings are merged in
df_merge.info()

In [None]:
# Home and away per-position ratings on a single row per match; match_id is converted
# to int so that it matches the dtype of df_merge['match_id']
df_position_rating = df_position_home_rating.merge(df_position_away_rating, on=['match_id'])
df_position_rating['match_id'] = df_position_rating['match_id'].astype(int)

# Merging both df
df_merge = df_merge.merge(df_position_rating, on=['match_id'])

# Removing substitutes as not considered as relevant
df_merge = df_merge.drop(columns=['substitute_away_player_rating', 'substitute_home_player_rating'])

# If there is no attacking player, rating equals 0.
# The rule is applied to both sides (it was previously applied to the away side only).
forward_columns = ['forward_home_player_rating', 'forward_away_player_rating']
df_merge[forward_columns] = df_merge[forward_columns].fillna(0)

In [None]:
# Summary of df_merge after the per-position ratings have been merged in
df_merge.info()

# Match odds data

In [None]:
! pip install datapackage

In [None]:
import datapackage

In [None]:
# Retrieving the odds data for the seasons considered

data_url = 'https://datahub.io/sports-data/english-premier-league/datapackage.json'
package = datapackage.Package(data_url)
resources = package.resources


def _read_odds_csv(resource_position):
    """Read the CSV referenced by the resource at the given position in the data package."""
    return pd.read_csv(resources[resource_position].descriptor['path'])


# NOTE(review): seasons are picked by their position in the resource list; presumably this
# ordering is stable on the datahub side — confirm, as a reordering would silently load other seasons.
cotes_1415 = _read_odds_csv(5)
cotes_1516 = _read_odds_csv(4)
cotes_1617 = _read_odds_csv(3)
cotes_1718 = _read_odds_csv(2)

In [None]:
# The four seasons of odds are stacked in chronological order
df_odds = pd.concat(
    [
        cotes_1415,
        cotes_1516,
        cotes_1617,
        cotes_1718,
    ]
)

In [None]:
# Each 'Date' value is parsed on its own (day first) into a new 'date' column,
# which then replaces the original 'Date' column
df_odds = (
    df_odds
    .assign(date=df_odds['Date'].apply(lambda raw_date: pd.to_datetime(raw_date, dayfirst=True)))
    .drop(columns=['Date'])
)

In [None]:
# Harmonising team names across datasets
# The odds names and the stats names are each sorted alphabetically and paired by position:
# the i-th odds name is replaced by the i-th stats name.
# NOTE(review): this assumes both datasets contain the same teams and that their spellings sort
# in the same order — confirm by inspecting the two lists with the prints below.

old_names = sorted(df_odds['HomeTeam'].unique())
new_names = sorted(df_merge['home_team_name'].unique())

#print(old_names)
#print(new_names)
df_odds['HomeTeam'] = df_odds['HomeTeam'].replace(old_names, new_names)
df_odds['AwayTeam'] = df_odds['AwayTeam'].replace(old_names, new_names)

In [None]:
# Any column of df_odds containing at least one NaN is removed
df_odds = df_odds.dropna(axis=1)

In [None]:
# These variables are redundant with df_merge and are removed from the odds data
redundant_odds_columns = ['Div', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC']
df_odds = df_odds.drop(columns=redundant_odds_columns)

# Merging Stats and Odds

In [None]:
# Games are matched on date, home team and away team; 'inner' is the default join of merge,
# so only games found in both datasets are kept
df_stats_odds = pd.merge(
    df_merge,
    df_odds,
    how='inner',
    left_on=['date', 'home_team_name', 'away_team_name'],
    right_on=['date', 'HomeTeam', 'AwayTeam'],
)

In [None]:
# Team name duplicates coming from the odds data and the referee column are removed,
# and the id columns are stored as ints
id_columns = ['match_id', 'home_team_id', 'away_team_id']
df_stats_odds = (
    df_stats_odds
    .drop(columns=['HomeTeam', 'AwayTeam', 'Referee'])
    .astype({id_column: int for id_column in id_columns})
)

# CSV Output

In [None]:
# The final dataset is written to the current working directory, with the dataframe index
# as first column (index=True is the to_csv default)
df_stats_odds.to_csv(path_or_buf='df_stats_odds.csv', index=True)