# NOTE! To make predictions, this Dataset must be updated every Match Week  

### Current Dataset can be downloaded from https://www.football-data.co.uk/englandm.php

In [None]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime
pd.set_option('display.max_columns', None)
DATA_PATH = './data/'

In [2]:
df = pd.read_csv(os.path.join(DATA_PATH, 'season2021.csv'))
df.dropna(subset=['Date'], axis=0, how='all', inplace=True)

In [3]:
def parse_date(date):
    """
    Converts date from string to datetime object.
    """
    return datetime.strptime(date, '%d/%m/%y').date()

def parse_date_other(date):
    """
    Converts date when strptime layout is different
    """
    return datetime.strptime(date, '%d/%m/%Y').date()
df.Date = df.Date.apply(parse_date_other)

In [4]:
cols = ['Date', 'HomeTeam', 'AwayTeam', 'HS', 'AS', 
        'FTHG','FTAG', 'FTR', 'B365H', 'B365D', 'B365A', 'season']

playing_stats = df[cols]

In [5]:
def get_matchweek(playing_stat):
    """
    Adds matchweek feature to dataset
    """
    j = 1
    MatchWeek = []
    for i in range(len(playing_stat)):
        MatchWeek.append(j)
        if ((i + 1)% 10) == 0:
            j += 1
    playing_stat['MW'] = MatchWeek
    return playing_stat

playing_stats = get_matchweek(playing_stats)
playing_stats

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Date,HomeTeam,AwayTeam,HS,AS,FTHG,FTAG,FTR,B365H,B365D,B365A,season,MW
0,2020-09-12,Fulham,Arsenal,5,13,0,3,A,6.00,4.33,1.53,2021,1
1,2020-09-12,Crystal Palace,Southampton,5,9,1,0,H,3.10,3.25,2.37,2021,1
2,2020-09-12,Liverpool,Leeds,22,6,4,3,H,1.28,6.00,9.50,2021,1
3,2020-09-12,West Ham,Newcastle,15,15,0,2,A,2.15,3.40,3.40,2021,1
4,2020-09-13,West Brom,Leicester,7,13,0,3,A,3.80,3.60,1.95,2021,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,2021-01-02,Brighton,Wolves,13,11,3,3,D,2.62,3.25,2.75,2021,16
160,2021-01-02,West Brom,Arsenal,7,21,0,4,A,5.75,4.00,1.57,2021,17
161,2021-01-03,Newcastle,Leicester,8,9,1,2,A,5.25,4.20,1.61,2021,17
162,2021-01-03,Chelsea,Man City,9,18,1,3,A,3.00,3.70,2.20,2021,17


In [6]:
# Gets the goals scored agg arranged by teams and matchweek
def get_goals_scored(playing_stat):
    # Get the number of matchweeks in the season
    mw = max(playing_stat['MW'])
    
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # build dict where value is a list of goals scored per match
    for i in range(len(playing_stat)):
        HTGS = playing_stat.iloc[i]['FTHG']
        ATGS = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGS)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGS)
    for iterate in range(5):
        for i, k in enumerate(teams):
            if len(teams[k]) < mw:
                teams[k].append(0)
    # Create a dataframe for goals scored where rows are teams and cols are matchweek.
    GoalsScored = pd.DataFrame(data=teams, index = [i for i in range(mw)]).T
    GoalsScored[0] = 0
    # Aggregate to get uptil that point
    for i in range(2, mw):
        GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
    return GoalsScored

# Gets the goals conceded agg arranged by teams and matchweek
def get_goals_conceded(playing_stat):
    # Get the number of matchweeks in the season
    mw = max(playing_stat['MW'])
    
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    
    # build dict where value is a list of goals conceded per match
    for i in range(len(playing_stat)):
        ATGC = playing_stat.iloc[i]['FTHG']
        HTGC = playing_stat.iloc[i]['FTAG']
        teams[playing_stat.iloc[i].HomeTeam].append(HTGC)
        teams[playing_stat.iloc[i].AwayTeam].append(ATGC)
    for iterate in range(5):
        for i, k in enumerate(teams):
            if len(teams[k]) < mw:
                teams[k].append(0)
            
    # Create a dataframe for goals conceded where rows are teams and cols are matchweek.
    GoalsConceded = pd.DataFrame(data=teams, index = [i for i in range(mw)]).T
    GoalsConceded[0] = 0
    # Aggregate to get uptil that point
    for i in range(1, mw):
        GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
    return GoalsConceded

def get_goal_stats(playing_stat):
    GC = get_goals_conceded(playing_stat)
    GS = get_goals_scored(playing_stat)
   
    j = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []

    for i in range(len(playing_stat)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTGS.append(GS.loc[ht][j])
        ATGS.append(GS.loc[at][j])
        HTGC.append(GC.loc[ht][j])
        ATGC.append(GC.loc[at][j])
        
        if ((i + 1)% 10) == 0:
            j += 1
        
    playing_stat['HTGS'] = HTGS
    playing_stat['ATGS'] = ATGS
    playing_stat['HTGC'] = HTGC
    playing_stat['ATGC'] = ATGC
    
    return playing_stat

# Apply to each dataset
playing_stats = get_goal_stats(playing_stats)
mw = max(playing_stats['MW'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [7]:
# Retrospective points
def get_points(result):
    if result == 'W':
        return 3
    elif result == 'D':
        return 1
    else:
        return 0
    
def get_cumulative_points(matchres, mw):
    matchres_points = matchres.applymap(get_points)
    for i in range(2, mw+1):
        matchres_points[i] = matchres_points[i] + matchres_points[i-1]
        
    matchres_points.insert(column =0, loc = 0, value = [0*i for i in range(20)])
    return matchres_points

def get_match_result(playing_stat):
    # Create a dictionary with team names as keys
    teams = {}
    for i in playing_stat.groupby('HomeTeam').mean().T.columns:
        teams[i] = []

    # build dict where value is list of match results
    for i in range(len(playing_stat)):
        if playing_stat.iloc[i].FTR == 'H':
            teams[playing_stat.iloc[i].HomeTeam].append('W')
            teams[playing_stat.iloc[i].AwayTeam].append('L')
        elif playing_stat.iloc[i].FTR == 'A':
            teams[playing_stat.iloc[i].AwayTeam].append('W')
            teams[playing_stat.iloc[i].HomeTeam].append('L')
        else:
            teams[playing_stat.iloc[i].AwayTeam].append('D')
            teams[playing_stat.iloc[i].HomeTeam].append('D')
            
    for iterate in range(5):
        for i, k in enumerate(teams):
            if len(teams[k]) < mw:
                teams[k].append('M')
                
    return pd.DataFrame(data=teams, index = [i for i in range(1, max(playing_stat['MW']+1))]).T

def get_agg_points(playing_stat):
    matchres = get_match_result(playing_stat)
    cum_pts = get_cumulative_points(matchres, max(playing_stat['MW']))
    HTP = []
    ATP = []
    j = 0
    for i in range(len(playing_stat)):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        HTP.append(cum_pts.loc[ht][j])
        ATP.append(cum_pts.loc[at][j])

        if ((i + 1)% 10) == 0:
            j += 1
            
    playing_stat['HTP'] = HTP
    playing_stat['ATP'] = ATP
    return playing_stat

playing_stats = get_agg_points(playing_stats)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
def get_form(playing_stat,num):
    form = get_match_result(playing_stat)
    form_final = form.copy()
    for i in range(num, max(playing_stat['MW'])+1):
        form_final[i] = ''
        j = 0
        while j < num:
            form_final[i] += form[i-j]
            j += 1
    return form_final

def add_form(playing_stat,num):
    form = get_form(playing_stat,num)
    h = ['M' for i in range(num * 10)]  # since form is not available for n MW (n*10)
    a = ['M' for i in range(num * 10)]
    
    j = num
    for i in range((num*10),playing_stat.shape[0]):
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        
        past = form.loc[ht][j]  # get past n results
        h.append(past[num-1])   # 0 index is most recent
        
        past = form.loc[at][j]  # get past n results.
        a.append(past[num-1])   # 0 index is most recent
        
        if ((i + 1)% 10) == 0:
            j = j + 1

    playing_stat['HM' + str(num)] = h                 
    playing_stat['AM' + str(num)] = a

    
    return playing_stat


def add_form_df(playing_statistics):
    playing_statistics = add_form(playing_statistics,1)
    playing_statistics = add_form(playing_statistics,2)
    playing_statistics = add_form(playing_statistics,3)
    playing_statistics = add_form(playing_statistics,4)
    playing_statistics = add_form(playing_statistics,5)
    return playing_statistics

playing_stats = add_form_df(playing_stats)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
# Rearranging columns
cols = ['Date', 'season', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTGS', 'ATGS', 
        'HTGC', 'ATGC', 'HTP', 'ATP', 'B365H', 'B365D', 
        'B365A', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'MW']

playing_stats = playing_stats[cols]

In [10]:
standings = pd.read_csv(DATA_PATH + 'league_standings2.csv')

def get_league_pos(playing_stat, standings, year):
    HomeTeamLP = []
    AwayTeamLP = []
        
    for i in range(len(playing_stat)): 
        ht = playing_stat.iloc[i].HomeTeam
        at = playing_stat.iloc[i].AwayTeam
        # If team was promoted set to default pos of 18th
        try:
            LP = standings.loc[(standings['Team'] == ht) & (standings['Season'] == year )].Position.item()
            HomeTeamLP.append(LP)
        except ValueError as e:
            HomeTeamLP.append(18)
        
        try:
            LP = standings.loc[(standings['Team'] == at) & (standings['Season'] == year )].Position.item()
            AwayTeamLP.append(LP)
        except ValueError as e:
            AwayTeamLP.append(18)

    playing_stat['HomeTeamLP'] = HomeTeamLP
    playing_stat['AwayTeamLP'] = AwayTeamLP

    return playing_stat

playing_stats = get_league_pos(playing_stats, standings, 19)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
playing_stats = (pd.concat([playing_stats], ignore_index=True)
                         .assign(gameId=lambda df: list(df.index + 1))
                         .sort_values('gameId'))

In [12]:
# Gets the form points.
def get_form_points(string):
    total = 0
    for letter in string:
        total += get_points(letter)
    return total

playing_stats['HTFormPtsStr'] = playing_stats['HM1'] + playing_stats['HM2'] + playing_stats['HM3'] + playing_stats['HM4'] + playing_stats['HM5']
playing_stats['ATFormPtsStr'] = playing_stats['AM1'] + playing_stats['AM2'] + playing_stats['AM3'] + playing_stats['AM4'] + playing_stats['AM5']

playing_stats['HTFormPts'] = playing_stats['HTFormPtsStr'].apply(get_form_points)
playing_stats['ATFormPts'] = playing_stats['ATFormPtsStr'].apply(get_form_points)

In [13]:
# Get doal difference
playing_stats['HTGD'] = playing_stats['HTGS'] - playing_stats['HTGC']
playing_stats['ATGD'] = playing_stats['ATGS'] - playing_stats['ATGC']

# Diff in points
playing_stats['DiffPts'] = playing_stats['HTP'] - playing_stats['ATP']

# Difference in form points, last 5 games
playing_stats['DiffFormPts'] = playing_stats['HTFormPts'] - playing_stats['ATFormPts']

# Uncomment this line to create unscaled df for predictions
# this_season = playing_stats.loc[playing_stats['season'] == 1920]
# this_season.to_csv(os.path.join(DATA_PATH, 'season1920_data.csv'))

# Diff in last years league positions
playing_stats['DiffLP'] = playing_stats['HomeTeamLP'] - playing_stats['AwayTeamLP']

In [14]:
playing_stats.to_csv(os.path.join(DATA_PATH, 'test_data_part1.csv'))

In [25]:
playing_stats[150:160]

Unnamed: 0,Date,season,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTGS,ATGS,HTGC,ATGC,HTP,ATP,B365H,B365D,B365A,HM1,HM2,HM3,HM4,HM5,AM1,AM2,AM3,AM4,AM5,MW,HomeTeamLP,AwayTeamLP,gameId,HTFormPtsStr,ATFormPtsStr,HTFormPts,ATFormPts,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
150,2020-12-29,2021,Burnley,Sheffield United,1,0,H,7,8,16,25,16,2,2.37,3.1,3.25,W,L,W,D,W,L,D,L,L,L,16,10,9,151,WLWDW,LDLLL,10,1,-9,-17,14,9,1
151,2020-12-29,2021,Southampton,West Ham,0,0,D,25,23,18,19,25,22,2.1,3.6,3.4,D,L,D,W,W,D,L,D,W,L,16,11,16,152,DLDWW,DLDWL,8,5,7,4,3,3,-5
152,2020-12-29,2021,West Brom,Leeds,0,5,A,11,27,32,26,8,20,3.6,3.6,2.0,D,L,D,L,L,W,L,W,L,L,16,18,18,153,DLDLL,WLWLL,2,6,-21,1,-12,-4,0
153,2020-12-29,2021,Man United,Wolves,1,0,H,32,13,21,21,30,21,1.53,4.2,6.25,W,D,W,W,D,D,L,W,L,L,16,3,7,154,WDWWD,DLWLL,11,4,11,-8,9,7,-4
154,2020-12-30,2021,Newcastle,Liverpool,0,0,D,16,33,26,17,19,32,12.0,6.0,1.25,D,L,D,L,W,D,W,W,D,W,16,13,1,155,DLDLW,DWWDW,5,11,-10,16,-13,-6,12
155,2021-01-01,2021,Everton,West Ham,0,1,A,25,23,20,19,29,22,2.1,3.5,3.5,W,W,W,W,D,D,L,D,W,L,16,12,16,156,WWWWD,DLDWL,13,5,5,4,7,8,-4
156,2021-01-01,2021,Man United,Aston Villa,2,1,H,32,28,21,16,30,26,1.72,4.2,4.2,W,D,W,W,D,L,D,W,W,D,16,3,17,157,WDWWD,LDWWD,11,8,11,12,4,3,-14
157,2021-01-02,2021,Tottenham,Leeds,3,0,H,29,27,14,26,26,20,1.75,4.1,4.2,D,L,L,D,W,W,L,W,L,L,16,6,18,158,DLLDW,WLWLL,5,6,15,1,6,-1,-12
158,2021-01-02,2021,Crystal Palace,Sheffield United,2,0,H,19,8,29,25,18,2,1.95,3.5,3.9,L,L,D,D,W,L,D,L,L,L,16,18,9,159,LLDDW,LDLLL,5,1,-10,-17,16,4,9
159,2021-01-02,2021,Brighton,Wolves,3,3,D,17,13,22,21,13,21,2.62,3.25,2.75,D,D,D,L,L,D,L,W,L,L,16,15,7,160,DDDLL,DLWLL,3,4,-5,-8,-8,-1,8


In [16]:
def create_df(path):
    """
    Function to convert date to datetime and sort by gameId
    """
    df = (pd.read_csv(path, dtype={'season': str})
         .assign(Date=lambda df: pd.to_datetime(df.Date))
         .pipe(lambda df: df.dropna(thresh=len(df) - 2, axis=1))  # Drop cols with NAs
         .dropna(axis=0)  # Drop rows with NAs
         .rename(columns={'Unnamed: 0': 'gameId'})
         .sort_values('gameId')
         .reset_index(drop=True)
         )
    return df

df = create_df(os.path.join(DATA_PATH, 'season2021.csv'))

In [17]:
def create_multiline_df_stats(old_stats_df):
    # Create a list of columns we want and their mappings to more interpretable names
    home_stats_cols = ['Date', 'season', 'HomeTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
                       'HR', 'AR']
    
    away_stats_cols = ['Date', 'season', 'AwayTeam', 'FTAG', 'FTHG', 'HTAG', 'HTHG', 'AS', 'HS', 'AST', 'HST', 'AF', 'HF', 'AC', 'HC', 'AY', 'HY',
                       'AR', 'HR']
    
    stats_cols_mapping = ['Date', 'season', 'Team', 'goalsFor', 'goalsAgainst', 'halfTimeGoalsFor', 'halfTimeGoalsAgainst', 'shotsFor',
                          'shotsAgainst', 'shotsOnTargetFor', 'shotsOnTargetAgainst', 'freesFor', 'freesAgainst', 
                          'cornersFor', 'cornersAgainst', 'yellowsFor', 'yellowsAgainst', 'redsFor', 'redsAgainst']
    
    # Create a dictionary of the old column names to new column names
    home_mapping = {old_col: new_col for old_col, new_col in zip(home_stats_cols, stats_cols_mapping)}
    away_mapping = {old_col: new_col for old_col, new_col in zip(away_stats_cols, stats_cols_mapping)}
    
    # Put each team onto an individual row
    multi_line_stats = (old_stats_df[['gameId'] + home_stats_cols] # Filter for only the home team columns
                    .rename(columns=home_mapping) # Rename the columns
                    .assign(homeGame=1) # Assign homeGame=1 so that we can use a general function later
                    .append((old_stats_df[['gameId'] + away_stats_cols]) # Append the away team columns
                            .rename(columns=away_mapping) # Rename the away team columns
                            .assign(homeGame=0), sort=True)
                    .sort_values(by='gameId') # Sort the values
                    .reset_index(drop=True))
    return multi_line_stats

In [18]:
def create_stats_features_ema(stats, span):
    # Create a restructured DataFrames so that we can calculate EMA
    multi_line_stats = create_multiline_df_stats(stats)
    
    # Create a copy of the DataFrame
    ema_features = multi_line_stats[['Date', 'season', 'gameId', 'Team', 'homeGame']].copy()
    
    # Get the columns that we want to create EMA for
    feature_names = multi_line_stats.drop(columns=['Date', 'season', 'gameId', 'Team', 'homeGame']).columns
    
    # Loop over the features
    for feature_name in feature_names:
        feature_ema = (multi_line_stats.groupby('Team')[feature_name] # Calculate the EMA
                                                  .transform(lambda row: row.ewm(span=span, min_periods=2)
                                                             .mean()
                                                             .shift(1))) # Shift the data down 1 so we don't leak data
        ema_features[feature_name] = feature_ema # Add the new feature to the DataFrame
    return ema_features

In [19]:
# Add weighted average to each row with a span of 50.
df = create_stats_features_ema(df, 50)
df.tail()

Unnamed: 0,Date,season,gameId,Team,homeGame,cornersAgainst,cornersFor,freesAgainst,freesFor,goalsAgainst,goalsFor,halfTimeGoalsAgainst,halfTimeGoalsFor,redsAgainst,redsFor,shotsAgainst,shotsFor,shotsOnTargetAgainst,shotsOnTargetFor,yellowsAgainst,yellowsFor
323,2021-03-01,2021,161,Newcastle,1,6.399954,3.928616,11.934246,10.284487,1.59625,1.090854,0.590204,0.265357,0.13189,0.0,15.035359,9.049679,5.628441,3.306036,1.8812,1.883024
324,2021-03-01,2021,162,Man City,0,3.450434,6.869596,9.942586,10.033976,0.757358,1.481119,0.34584,1.06519,0.0,0.0,7.900795,15.406108,2.437667,5.329999,1.382368,1.483502
325,2021-03-01,2021,162,Chelsea,1,5.204522,6.153365,9.279909,12.316751,1.116503,1.859677,0.559589,0.620829,0.0,0.047379,9.591909,14.220644,2.924908,5.309977,1.415966,1.308365
326,2021-04-01,2021,163,Southampton,1,5.189161,4.394514,12.657861,12.433748,1.092791,1.484782,0.344229,0.889156,0.129172,0.0,9.890132,10.222548,3.932677,4.273505,1.545654,1.514391
327,2021-04-01,2021,163,Liverpool,0,3.359262,6.584086,9.781757,9.866738,1.11491,2.27997,0.717041,1.089927,0.1008,0.0,8.353671,14.933519,3.449155,6.228088,1.772083,0.815252


In [20]:
def restructure_stats_features(stats_features):
    non_features = ['homeGame', 'Team', 'gameId']

    stats_features_restructured = (stats_features.query('homeGame == 1')
                                    .rename(columns={col: 'f_' + col + 'Home' for col in stats_features.columns if col not in non_features})
                                    .rename(columns={'Team': 'HomeTeam'})
                                    .pipe(pd.merge, (stats_features.query('homeGame == 0')
                                                        .rename(columns={'Team': 'AwayTeam'})
                                                        .rename(columns={col: 'f_' + col + 'Away' for col in stats_features.columns 
                                                                         if col not in non_features})), on=['gameId'])
                                    .dropna())
    return stats_features_restructured

df = restructure_stats_features(df)
df.tail()

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_halfTimeGoalsAgainstHome,f_halfTimeGoalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,f_shotsOnTargetAgainstHome,f_shotsOnTargetForHome,f_yellowsAgainstHome,f_yellowsForHome,f_DateAway,f_seasonAway,AwayTeam,homeGame_y,f_cornersAgainstAway,f_cornersForAway,f_freesAgainstAway,f_freesForAway,f_goalsAgainstAway,f_goalsForAway,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
159,2021-02-01,2021,159,Brighton,1,4.046708,5.747446,10.369829,11.872098,1.509318,1.050347,0.588747,0.461302,0.076573,0.163491,8.870309,12.702409,3.238342,3.544956,1.508949,1.340342,2021-02-01,2021,Wolves,0,4.738488,4.978083,10.871905,12.855069,1.314873,0.911116,0.49374,0.337284,0.057871,0.0,10.603202,12.157255,3.299475,4.32757,1.763185,1.601408
160,2021-02-01,2021,160,West Brom,1,7.278968,3.768354,11.271871,10.711609,2.197005,0.655417,1.017676,0.402053,0.0,0.191866,16.19549,8.052705,6.136796,2.62408,1.654027,1.347034,2021-02-01,2021,Arsenal,0,5.209125,5.400823,11.577425,9.712271,1.179794,0.9786,0.752703,0.443435,0.0,0.206947,12.26658,11.069004,3.286768,3.508707,2.084237,1.519142
161,2021-03-01,2021,161,Newcastle,1,6.399954,3.928616,11.934246,10.284487,1.59625,1.090854,0.590204,0.265357,0.13189,0.0,15.035359,9.049679,5.628441,3.306036,1.8812,1.883024,2021-03-01,2021,Leicester,0,4.971831,5.144724,12.867124,9.70366,1.234632,1.728593,0.67641,0.708908,0.0,0.0,10.591852,12.123519,4.189759,3.974057,1.580536,2.026649
162,2021-03-01,2021,162,Chelsea,1,5.204522,6.153365,9.279909,12.316751,1.116503,1.859677,0.559589,0.620829,0.0,0.047379,9.591909,14.220644,2.924908,5.309977,1.415966,1.308365,2021-03-01,2021,Man City,0,3.450434,6.869596,9.942586,10.033976,0.757358,1.481119,0.34584,1.06519,0.0,0.0,7.900795,15.406108,2.437667,5.329999,1.382368,1.483502
163,2021-04-01,2021,163,Southampton,1,5.189161,4.394514,12.657861,12.433748,1.092791,1.484782,0.344229,0.889156,0.129172,0.0,9.890132,10.222548,3.932677,4.273505,1.545654,1.514391,2021-04-01,2021,Liverpool,0,3.359262,6.584086,9.781757,9.866738,1.11491,2.27997,0.717041,1.089927,0.1008,0.0,8.353671,14.933519,3.449155,6.228088,1.772083,0.815252


In [21]:
df.to_csv(os.path.join(DATA_PATH, 'test_data_part2.csv'))

In [22]:
df

Unnamed: 0,f_DateHome,f_seasonHome,gameId,HomeTeam,homeGame_x,f_cornersAgainstHome,f_cornersForHome,f_freesAgainstHome,f_freesForHome,f_goalsAgainstHome,f_goalsForHome,f_halfTimeGoalsAgainstHome,f_halfTimeGoalsForHome,f_redsAgainstHome,f_redsForHome,f_shotsAgainstHome,f_shotsForHome,f_shotsOnTargetAgainstHome,f_shotsOnTargetForHome,f_yellowsAgainstHome,f_yellowsForHome,f_DateAway,f_seasonAway,AwayTeam,homeGame_y,f_cornersAgainstAway,f_cornersForAway,f_freesAgainstAway,f_freesForAway,f_goalsAgainstAway,f_goalsForAway,f_halfTimeGoalsAgainstAway,f_halfTimeGoalsForAway,f_redsAgainstAway,f_redsForAway,f_shotsAgainstAway,f_shotsForAway,f_shotsOnTargetAgainstAway,f_shotsOnTargetForAway,f_yellowsAgainstAway,f_yellowsForAway
19,2020-09-26,2021,19,Crystal Palace,1,6.060000,4.960000,12.020000,11.960000,0.510000,2.020000,0.000000,1.000000,0.000000,0.000000,11.040000,9.590000,4.490000,4.020000,1.510000,1.490000,2020-09-26,2021,Everton,0,2.960000,7.080000,12.960000,8.020000,1.020000,3.040000,0.510000,1.020000,0.510000,0.000000,7.470000,16.020000,4.490000,5.530000,0.490000,0.510000
20,2020-09-26,2021,20,West Brom,1,8.060000,1.490000,9.000000,11.490000,4.020000,1.020000,1.020000,0.510000,0.000000,0.510000,15.040000,6.490000,7.000000,2.530000,1.000000,0.490000,2020-09-26,2021,Chelsea,0,7.570000,1.980000,6.980000,11.470000,1.510000,1.470000,0.000000,0.490000,0.000000,0.510000,15.550000,7.450000,4.530000,3.980000,0.490000,0.000000
22,2020-09-27,2021,22,Sheffield United,1,7.550000,7.920000,9.550000,13.000000,1.490000,0.000000,0.980000,0.000000,0.000000,0.510000,14.570000,6.450000,2.980000,1.490000,2.020000,1.490000,2020-09-27,2021,Leeds,0,5.940000,2.550000,13.590000,9.570000,3.490000,3.510000,1.980000,2.000000,0.000000,0.000000,17.920000,8.040000,6.000000,5.040000,1.510000,0.510000
23,2020-09-27,2021,23,Tottenham,1,3.000000,3.470000,11.590000,16.530000,1.510000,2.550000,0.510000,0.510000,0.000000,0.000000,14.490000,9.000000,5.530000,5.510000,2.040000,2.020000,2020-09-27,2021,Newcastle,0,4.430000,7.000000,14.020000,11.590000,1.530000,0.980000,1.020000,0.000000,0.510000,0.000000,13.980000,10.410000,4.530000,0.980000,0.980000,2.510000
25,2020-09-27,2021,25,West Ham,1,7.000000,6.470000,9.040000,13.000000,2.000000,0.510000,0.510000,0.510000,0.000000,0.000000,10.920000,14.490000,2.510000,3.000000,0.980000,1.490000,2020-09-27,2021,Wolves,0,8.940000,4.490000,10.450000,6.490000,1.530000,1.490000,1.020000,0.980000,0.000000,0.000000,11.550000,10.490000,5.570000,2.470000,2.510000,0.490000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,2021-02-01,2021,159,Brighton,1,4.046708,5.747446,10.369829,11.872098,1.509318,1.050347,0.588747,0.461302,0.076573,0.163491,8.870309,12.702409,3.238342,3.544956,1.508949,1.340342,2021-02-01,2021,Wolves,0,4.738488,4.978083,10.871905,12.855069,1.314873,0.911116,0.493740,0.337284,0.057871,0.000000,10.603202,12.157255,3.299475,4.327570,1.763185,1.601408
160,2021-02-01,2021,160,West Brom,1,7.278968,3.768354,11.271871,10.711609,2.197005,0.655417,1.017676,0.402053,0.000000,0.191866,16.195490,8.052705,6.136796,2.624080,1.654027,1.347034,2021-02-01,2021,Arsenal,0,5.209125,5.400823,11.577425,9.712271,1.179794,0.978600,0.752703,0.443435,0.000000,0.206947,12.266580,11.069004,3.286768,3.508707,2.084237,1.519142
161,2021-03-01,2021,161,Newcastle,1,6.399954,3.928616,11.934246,10.284487,1.596250,1.090854,0.590204,0.265357,0.131890,0.000000,15.035359,9.049679,5.628441,3.306036,1.881200,1.883024,2021-03-01,2021,Leicester,0,4.971831,5.144724,12.867124,9.703660,1.234632,1.728593,0.676410,0.708908,0.000000,0.000000,10.591852,12.123519,4.189759,3.974057,1.580536,2.026649
162,2021-03-01,2021,162,Chelsea,1,5.204522,6.153365,9.279909,12.316751,1.116503,1.859677,0.559589,0.620829,0.000000,0.047379,9.591909,14.220644,2.924908,5.309977,1.415966,1.308365,2021-03-01,2021,Man City,0,3.450434,6.869596,9.942586,10.033976,0.757358,1.481119,0.345840,1.065190,0.000000,0.000000,7.900795,15.406108,2.437667,5.329999,1.382368,1.483502
