**Module that preprocesses La Liga match data and merges it into a single file**

## Libraries

In [1]:
import os # OS library
from datetime import datetime
from pathlib import Path # Path manipulation

import pandas as pd # Data import, manipulation and processing

from data_functions import * # Private library of functions related to La Liga Dataset
from data_preprocessing import * # Private library of functions related to La Liga Dataset
from data_functions import read_data # Explicit binding for the loader; kept after the star imports so it wins any name clash


## Variables

In [2]:
# Input location: folder holding the data files and the source CSV inside it.
# The folder is relative, so it resolves against the notebook's working directory.
data_folder = Path("../data")
filename = '05-20_modified.csv'
file_path = data_folder.joinpath(filename)

## Execution

In [3]:
# Load the source CSV through the project helper `read_data`, which is
# brought into scope by the imports in the Libraries cell (no per-cell import,
# so the notebook's dependencies stay visible in one place).
# NOTE(review): `read_data` is defined in data_functions; presumably it parses
# the file into a pandas DataFrame -- confirm against that module.
df = read_data(file_path)
# Bare last expression: display the loaded table (pandas truncates long frames).
df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,division,...,HF,AF,HC,AC,HY,AY,HR,AR,season,jornada
0,2005-08-27,Alaves,Barcelona,0.0,0.0,D,0.0,0.0,D,1,...,17.0,19.0,3.0,7.0,0.0,1.0,0.0,0.0,05-06,1
1,2005-08-27,Ath Bilbao,Sociedad,3.0,0.0,H,0.0,0.0,D,1,...,13.0,19.0,3.0,4.0,0.0,1.0,0.0,0.0,05-06,1
2,2005-08-27,Valencia,Betis,1.0,0.0,H,0.0,0.0,D,1,...,18.0,14.0,8.0,5.0,2.0,3.0,0.0,0.0,05-06,1
3,2005-08-28,Ath Madrid,Zaragoza,0.0,0.0,D,0.0,0.0,D,1,...,16.0,22.0,8.0,4.0,2.0,7.0,0.0,0.0,05-06,1
4,2005-08-28,Cadiz,Real Madrid,1.0,2.0,A,0.0,1.0,A,1,...,19.0,25.0,8.0,8.0,2.0,2.0,0.0,0.0,05-06,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6489,2020-03-08,Osasuna,Espanol,1.0,0.0,H,0.0,0.0,D,1,...,17.0,12.0,6.0,2.0,3.0,3.0,0.0,1.0,19-20,27
6488,2020-03-08,Valladolid,Ath Bilbao,1.0,4.0,A,0.0,2.0,A,1,...,9.0,13.0,8.0,1.0,1.0,2.0,0.0,0.0,19-20,27
6487,2020-03-08,Levante,Granada,1.0,1.0,D,1.0,0.0,H,1,...,21.0,13.0,6.0,4.0,3.0,3.0,0.0,0.0,19-20,27
6832,2020-03-08,Tenerife,Ponferradina,1.0,0.0,H,0.0,0.0,D,2,...,13.0,12.0,7.0,3.0,2.0,0.0,0.0,0.0,19-20,31


In [4]:
# Split the loaded table into a training span and a test span of seasons.
# Each slice is copied so it is an object of its own rather than whatever
# `get_season` hands back.
# NOTE(review): the first season shown in the loaded data is 05-06 and the
# training file is later written as '05-17_train.csv'; confirm that "04-17"
# is the span `get_season` is meant to receive here.
train_df, test_df = (
    get_season(df, season_span).copy()
    for season_span in ("04-17", "17-19")
)

In [5]:
# Build the model-input tables for both splits with the project helper
# `create_input_df`; the training split is processed first, then the test split.
# NOTE(review): judging by the displayed results, the helper yields one row per
# match with per-team running statistics plus the match result -- confirm
# against its definition.
train_df_input, test_df_input = map(create_input_df, (train_df, test_df))

In [9]:
# Display the training input table for a visual check (bare expression, so the
# notebook renders it as the cell output).
train_df_input

Unnamed: 0,season,jornada,division,HomeTeam,AwayTeam,ht_total_wins%,ht_total_draws%,ht_total_loses%,ht_home_wins%,ht_home_draws%,...,at_total_corners,at_total_y_cards,at_total_r_cards,at_away_shots,at_away_t_shots,at_away_fouls,at_away_corners,at_away_y_cards,at_away_r_cards,result
0,05-06,1,1,Alaves,Barcelona,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,D
1,05-06,1,1,Ath Bilbao,Sociedad,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
2,05-06,1,1,Valencia,Betis,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
3,05-06,1,1,Ath Madrid,Zaragoza,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,D
4,05-06,1,1,Cadiz,Real Madrid,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4555,16-17,38,1,Valencia,Villarreal,0.351351,0.189189,0.459459,0.444444,0.222222,...,9.07895,4.72222,0.21345,8.5,3.44444,13.6111,3.5,2.72222,0.0555556,A
4556,16-17,38,1,Celta,Sociedad,0.351351,0.135135,0.513514,0.5,0.0555556,...,11.2456,4.22515,0.330409,10.7778,3.77778,14,4.66667,2.27778,0.277778,D
4557,16-17,38,1,Ath Madrid,Ath Bilbao,0.594595,0.243243,0.162162,0.722222,0.111111,...,11.5819,4.71637,0.111111,11.5,4.22222,13.2778,5.05556,2.61111,0.111111,H
4558,16-17,38,1,Barcelona,Eibar,0.72973,0.162162,0.108108,0.777778,0.166667,...,10.307,4.84503,0.324561,11.2222,4.72222,13.9444,4.83333,3.05556,0.166667,H


In [6]:
# Display the test input table for a visual check (bare expression, so the
# notebook renders it as the cell output).
test_df_input

Unnamed: 0,season,jornada,division,HomeTeam,AwayTeam,ht_total_wins%,ht_total_draws%,ht_total_loses%,ht_home_wins%,ht_home_draws%,...,at_total_corners,at_total_y_cards,at_total_r_cards,at_away_shots,at_away_t_shots,at_away_fouls,at_away_corners,at_away_y_cards,at_away_r_cards,result
0,17-18,1,2,Lorca,Leonesa,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
1,17-18,1,2,Tenerife,Zaragoza,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
2,17-18,1,1,Leganes,Alaves,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
3,17-18,1,1,Valencia,Las Palmas,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
4,17-18,1,2,Alcorcon,Sp Gijon,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1658,18-19,41,2,Extremadura UD,Mallorca,0.325,0.25,0.425,0.315789,0.157895,...,10.8,4.85,0.3,9.75,3.6,14.7,4.4,2.65,0.15,D
1659,18-19,41,2,Almeria,Albacete,0.325,0.375,0.3,0.45,0.4,...,8.2,5.4,0.4,8.5,3.3,14.45,2.75,2.6,0.2,H
1660,18-19,41,2,Gimnastic,Lugo,0.2,0.2,0.6,0.315789,0.263158,...,9.35,5.95,0.25,8.95,2.65,15.2,4.1,3.15,0.25,D
1661,18-19,41,2,Numancia,Las Palmas,0.25,0.375,0.375,0.473684,0.210526,...,8.04261,6.31078,0.300752,11.7895,4,16.7368,3.94737,3.26316,0.157895,D


In [12]:
# Persist both input tables as CSV files inside the data folder.
# index=False keeps the DataFrame index out of the files.
train_filename = '05-17_train.csv'
test_filename = '17-19_test.csv'
train_df_input.to_csv(data_folder.joinpath(train_filename), index=False)
test_df_input.to_csv(data_folder.joinpath(test_filename), index=False)

In [14]:
# Read the training CSV back from disk and display it, to check what was written.
train = pd.read_csv(data_folder.joinpath(train_filename))
train

Unnamed: 0,season,jornada,division,HomeTeam,AwayTeam,ht_total_wins%,ht_total_draws%,ht_total_loses%,ht_home_wins%,ht_home_draws%,...,at_total_corners,at_total_y_cards,at_total_r_cards,at_away_shots,at_away_t_shots,at_away_fouls,at_away_corners,at_away_y_cards,at_away_r_cards,result
0,05-06,1,1,Alaves,Barcelona,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,D
1,05-06,1,1,Ath Bilbao,Sociedad,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,H
2,05-06,1,1,Valencia,Betis,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,H
3,05-06,1,1,Ath Madrid,Zaragoza,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,D
4,05-06,1,1,Cadiz,Real Madrid,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4555,16-17,38,1,Valencia,Villarreal,0.351351,0.189189,0.459459,0.444444,0.222222,...,9.078947,4.722222,0.213450,8.500000,3.444444,13.611111,3.500000,2.722222,0.055556,A
4556,16-17,38,1,Celta,Sociedad,0.351351,0.135135,0.513514,0.500000,0.055556,...,11.245614,4.225146,0.330409,10.777778,3.777778,14.000000,4.666667,2.277778,0.277778,D
4557,16-17,38,1,Ath Madrid,Ath Bilbao,0.594595,0.243243,0.162162,0.722222,0.111111,...,11.581871,4.716374,0.111111,11.500000,4.222222,13.277778,5.055556,2.611111,0.111111,H
4558,16-17,38,1,Barcelona,Eibar,0.729730,0.162162,0.108108,0.777778,0.166667,...,10.307018,4.845029,0.324561,11.222222,4.722222,13.944444,4.833333,3.055556,0.166667,H


In [15]:
# Read the test CSV back from disk and display it, to check what was written.
test = pd.read_csv(data_folder.joinpath(test_filename))
test

Unnamed: 0,season,jornada,division,HomeTeam,AwayTeam,ht_total_wins%,ht_total_draws%,ht_total_loses%,ht_home_wins%,ht_home_draws%,...,at_total_corners,at_total_y_cards,at_total_r_cards,at_away_shots,at_away_t_shots,at_away_fouls,at_away_corners,at_away_y_cards,at_away_r_cards,result
0,17-18,1,2,Lorca,Leonesa,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,H
1,17-18,1,2,Tenerife,Zaragoza,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,H
2,17-18,1,1,Leganes,Alaves,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,H
3,17-18,1,1,Valencia,Las Palmas,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,H
4,17-18,1,2,Alcorcon,Sp Gijon,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1658,18-19,41,2,Extremadura UD,Mallorca,0.325,0.250,0.425,0.315789,0.157895,...,10.800000,4.850000,0.300000,9.750000,3.60,14.700000,4.400000,2.650000,0.150000,D
1659,18-19,41,2,Almeria,Albacete,0.325,0.375,0.300,0.450000,0.400000,...,8.200000,5.400000,0.400000,8.500000,3.30,14.450000,2.750000,2.600000,0.200000,H
1660,18-19,41,2,Gimnastic,Lugo,0.200,0.200,0.600,0.315789,0.263158,...,9.350000,5.950000,0.250000,8.950000,2.65,15.200000,4.100000,3.150000,0.250000,D
1661,18-19,41,2,Numancia,Las Palmas,0.250,0.375,0.375,0.473684,0.210526,...,8.042607,6.310777,0.300752,11.789474,4.00,16.736842,3.947368,3.263158,0.157895,D
