**Module that preprocess la liga matches data and merges it to a single file**

## Libraries

In [1]:
from pathlib import Path # Path manipulation
import os # OS library

import pandas as pd # Data import, manipulation and processing 
from datetime import datetime

from data_functions import * # Private library of functions related to La Liga Dataset
from data_preprocessing import * # Private library of functions related to La Liga Dataset


## Variables

In [2]:
data_folder = Path("../data")
filename = '05-20_modified.csv'
file_path = data_folder / filename

## Functions

In [3]:
def get_scores_prop(scores):
    l = ['Wins', 'Draws', 'Loses']
    values = [scores[x] for x in l]
    total = sum(values)
    if total == 0:
        return [0, 0, 0]
    values = [x/total for x in values]
    return values

In [4]:
def get_averages(df, team):
    columns = ['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
       'HY', 'AY', 'HR', 'AR']
    
    home_columns = [x for x in columns if 'H' in x]
    away_columns = [x for x in columns if 'A' in x]
    
    
    if( df.empty ):
        ret = [0] * (len(home_columns))
        return ret, ret, ret
    
    home_means = df.loc[df['HomeTeam'] == team][home_columns].mean().fillna(0).to_list()
    away_means = df.loc[df['AwayTeam'] == team][away_columns].mean().fillna(0).to_list()
    
    total_means = [x+y for (x,y) in zip(home_means,away_means)]
    
    return total_means, home_means, away_means

In [5]:
def add_data_row_from_match(df, df_new, match):
    match_date = match['Date']
    home_team = match['HomeTeam']
    away_team = match['AwayTeam']
    
    season = match['season']
    jornada = match['jornada']
    division = match['division']
    h_team = match['HomeTeam']
    a_team = match['AwayTeam']
    result = match['FTR']
    
    df_s = get_season(df, season, season_end=match_date)
    ht_scores = get_team_scores(df_s, home_team)
    at_scores = get_team_scores(df_s, away_team)
    
    ht_means = get_averages(df_s, home_team)
    at_means = get_averages(df_s, away_team)

    scores = []
    scores.append(ht_scores[0])
    scores.append(ht_scores[1])
    scores.append(at_scores[0])
    scores.append(at_scores[2])

    prop_scores = []
    for s in scores:
        for p in get_scores_prop(s):
            prop_scores.append(p)
            
    means = []
    means.append(ht_means[0])
    means.append(ht_means[1])
    means.append(at_means[0])
    means.append(at_means[2])
    
    means = [item for sublist in means for item in sublist]
    
    row = []
    row.append(season)
    row.append(jornada)
    row.append(division)
    row.append(h_team)
    row.append(a_team)
    row = row + prop_scores + means
    row.append(result)

    df_new.loc[len(df_new)]= row

In [6]:
def create_input_df(df):
    p_prefix = ['ht_', 'at_']
    p_infix = ['total', 'home', 'away']
    p_suf = ['_wins%', '_draws%', '_loses%']
    p_suf2 = ['_shots','_t_shots','_fouls','_corners','_y_cards','_r_cards']

    columns = ['season', 'jornada','division', 'HomeTeam', 'AwayTeam']
    for pref in p_prefix:
        for inf in p_infix:
            for suf in p_suf:
                if(pref[0] == inf[0] or inf[0] == 't'):
                    columns.append(pref+inf+suf)
    
    for pref in p_prefix:
        for inf in p_infix:
            for suf in p_suf2:
                if(pref[0] == inf[0] or inf[0] == 't'):
                    columns.append(pref+inf+suf)

    columns.append('result')
                    
    input_df = pd.DataFrame(columns=columns)
    for index, match in df.iterrows():
        add_data_row_from_match(df, input_df, match)
    return input_df

## Execution

In [3]:
from data_functions import read_data

df = read_data(file_path)
df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,division,...,HF,AF,HC,AC,HY,AY,HR,AR,season,jornada
0,2005-08-27,Alaves,Barcelona,0.0,0.0,D,0.0,0.0,D,1,...,17.0,19.0,3.0,7.0,0.0,1.0,0.0,0.0,05-06,1
1,2005-08-27,Ath Bilbao,Sociedad,3.0,0.0,H,0.0,0.0,D,1,...,13.0,19.0,3.0,4.0,0.0,1.0,0.0,0.0,05-06,1
2,2005-08-27,Valencia,Betis,1.0,0.0,H,0.0,0.0,D,1,...,18.0,14.0,8.0,5.0,2.0,3.0,0.0,0.0,05-06,1
3,2005-08-28,Ath Madrid,Zaragoza,0.0,0.0,D,0.0,0.0,D,1,...,16.0,22.0,8.0,4.0,2.0,7.0,0.0,0.0,05-06,1
4,2005-08-28,Cadiz,Real Madrid,1.0,2.0,A,0.0,1.0,A,1,...,19.0,25.0,8.0,8.0,2.0,2.0,0.0,0.0,05-06,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6489,2020-03-08,Osasuna,Espanol,1.0,0.0,H,0.0,0.0,D,1,...,17.0,12.0,6.0,2.0,3.0,3.0,0.0,1.0,19-20,27
6488,2020-03-08,Valladolid,Ath Bilbao,1.0,4.0,A,0.0,2.0,A,1,...,9.0,13.0,8.0,1.0,1.0,2.0,0.0,0.0,19-20,27
6487,2020-03-08,Levante,Granada,1.0,1.0,D,1.0,0.0,H,1,...,21.0,13.0,6.0,4.0,3.0,3.0,0.0,0.0,19-20,27
6832,2020-03-08,Tenerife,Ponferradina,1.0,0.0,H,0.0,0.0,D,2,...,13.0,12.0,7.0,3.0,2.0,0.0,0.0,0.0,19-20,31


In [4]:
train_df = get_season(df, "04-17").copy()
test_df = get_season(df, "17-19").copy()

In [5]:
prueba_df = get_season(train_df,'10-11')
input_df = create_input_df(prueba_df)
input_df

Unnamed: 0,season,jornada,division,HomeTeam,AwayTeam,ht_total_wins%,ht_total_draws%,ht_total_loses%,ht_home_wins%,ht_home_draws%,...,at_total_corners,at_total_y_cards,at_total_r_cards,at_away_shots,at_away_t_shots,at_away_fouls,at_away_corners,at_away_y_cards,at_away_r_cards,result
0,10-11,1,1,Malaga,Valencia,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A
1,10-11,1,1,Levante,Sevilla,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A
2,10-11,1,1,Hercules,Ath Bilbao,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A
3,10-11,1,1,Sociedad,Villarreal,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
4,10-11,1,1,Espanol,Getafe,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,10-11,38,1,Malaga,Barcelona,0.351351,0.189189,0.459459,0.388889,0.166667,...,13.6433,3.73099,0.111111,14.7222,7.22222,10.0556,5.22222,1.88889,0.111111,A
376,10-11,38,1,Levante,Zaragoza,0.324324,0.243243,0.432432,0.5,0.222222,...,9.01754,5.99123,0.488304,11.5,3.72222,18.2778,3.33333,2.83333,0.277778,A
377,10-11,38,1,Espanol,Sevilla,0.405405,0.108108,0.486486,0.611111,0.111111,...,10.4327,5.14912,0.383041,12.1667,4.66667,14.6111,4.22222,2.83333,0.277778,A
378,10-11,38,1,Santander,Ath Bilbao,0.324324,0.27027,0.405405,0.444444,0.333333,...,11.845,5.1345,0.374269,12.1111,4.72222,16.3333,5.05556,2.55556,0.111111,A


In [6]:
input_df['result'].value_counts()

H    197
A    104
D     79
Name: result, dtype: int64