In [2]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 


We go back to the data we saved in notebook 05, as it contains the same information as in plotting data and also some str type columns that we could not plot but from which we could extract information during the feature engineering.

In [24]:
df = pd.read_csv('Cleaned Data', index_col = [0])
df.tail(5)

Unnamed: 0,match_id,year,round,local,visitor,league_id,team1_id_season,team2_id_season,team1_id,team2_id,...,points_visitor,wins_visitor,draws_visitor,losses_visitor,gf_visitor,ga_visitor,avg_visitor,pos_visitor,form_visitor,match_winner
4657,91110,2021,38,Real Oviedo,Sabadell,57314,6382799,6382802,2115,2198,...,40,9.0,13.0,15.0,35,42,-7.0,19,lddww,0
4658,91104,2021,38,FC Cartagena,CD Castellón,57314,6382787,6382788,643,673,...,41,11.0,8.0,18.0,35,43,-8.0,18,wlwdd,0
4659,91112,2021,38,UD Logroñés,Girona,57314,6382792,6391868,1578,1236,...,58,16.0,10.0,11.0,39,34,5.0,6,wlwww,2
4660,91109,2021,38,Rayo Vallecano,Leganés,57314,6382798,6382791,2080,1535,...,62,18.0,8.0,11.0,41,31,10.0,4,wldwd,1
4661,91105,2021,38,Real Sporting,Lugo,57314,6382800,6382793,2125,1598,...,37,8.0,13.0,16.0,32,49,-17.0,21,lllld,0


In [25]:
df.columns

Index(['match_id', 'year', 'round', 'local', 'visitor', 'league_id',
       'team1_id_season', 'team2_id_season', 'team1_id', 'team2_id',
       'local_abbr', 'visitor_abbr', 'division', 'local_goals',
       'visitor_goals', 'result', 'winner', 'points_local', 'wins_local',
       'draws_local', 'losses_local', 'gf_local', 'ga_local', 'avg_local',
       'pos_local', 'form_local', 'points_visitor', 'wins_visitor',
       'draws_visitor', 'losses_visitor', 'gf_visitor', 'ga_visitor',
       'avg_visitor', 'pos_visitor', 'form_visitor', 'match_winner'],
      dtype='object')

In [26]:
df['round-1_local'] = df['form_local'].apply(lambda x: x[0])
df['round-1_local']

0       d
1       d
2       w
3       w
4       d
       ..
4657    l
4658    w
4659    w
4660    w
4661    l
Name: round-1_local, Length: 4662, dtype: object

In [56]:
df['round-1_visitor'] = df['form_visitor'].apply(lambda x: x[0])
df['round-1_visitor']

0       w
1       d
2       d
3       d
4       d
       ..
4657    l
4658    w
4659    w
4660    w
4661    l
Name: round-1_visitor, Length: 4662, dtype: object

In [31]:
df['form_local'].head(5)

0    d
1    d
2    w
3    w
4    d
Name: form_local, dtype: object

In order to extract information of previous rounds from columns 'form_local' & 'form_visitor' is necessary to put into the same length the strings from each row in this colums.

In [54]:
max_length = df.form_local.map(len).max()
df.form_local = df.form_local.apply(lambda x: x + '0'*(max_length - len(x)))
df['form_local']

0       d0000
1       d0000
2       w0000
3       w0000
4       d0000
        ...  
4657    ldwld
4658    wddwl
4659    wwdld
4660    wlddl
4661    llldd
Name: form_local, Length: 4662, dtype: object

In [55]:
df.form_visitor = df.form_visitor.apply(lambda x: x + '0'*(max_length - len(x)))
df['form_visitor']

0       w0000
1       d0000
2       d0000
3       d0000
4       d0000
        ...  
4657    lddww
4658    wlwdd
4659    wlwww
4660    wldwd
4661    lllld
Name: form_visitor, Length: 4662, dtype: object

We create new columns assigning points to the home and away teams according to their results in the last matches. In order not to confuse the model, when we do not have information about the result of the round in question (this happens in the first rounds), we will assign an indicative value to the coefficient, different from 0, 1 and 3, so as not to confuse the lack of information with wins, draws or defeats.

In [59]:
def prev2roundlocal(col):
    if col['form_local'][1] == '0':
        return 0
    if col['form_local'][1] == 'w':
        return 3
    if col['form_local'][1] == 'd':
        return 1
    if col['form_local'][1] == 'l':
        return 0

df['round-2_local'] = df.apply(lambda col: prev2roundlocal (col),axis=1)
df['round-2_local'].head()

0    0
1    0
2    0
3    0
4    0
Name: round-2_local, dtype: int64

In [None]:
def prev2roundvisitor(col):
    if col['form_visitor'][1] == '0':
        return 0
    if col['form_visitor'][1] == 'w':
        return 3
    if col['form_visitor'][1] == 'd':
        return 1
    if col['form_visitor'][1] == 'l':
        return 0

df['round-2_local'] = df.apply(lambda col: prev2roundlocal (col),axis=1)
df['round-2_local'].head()

In [18]:
df['round-2_local'] = df['form_local'].apply(lambda x: x[1])
df['round-3__local'] = df['form_local'].apply(lambda x: x[2])
df['round-4__local'] = df['form_local'].apply(lambda x: x[3])
df['round-5__local'] = df['form_local'].apply(lambda x: x[4])
df['round-1_visitor'] = df['form_visitor'].apply(lambda x: x[0])
df['round-2_visitor'] = df['form_visitor'].apply(lambda x: x[1])
df['round-3_visitor'] = df['form_visitor'].apply(lambda x: x[2])
df['round-4_visitor'] = df['form_visitor'].apply(lambda x: x[3])
df['round-5_visitor'] = df['form_visitor'].apply(lambda x: x[4])

IndexError: string index out of range

In [None]:
df['score_local'] = df['result'].apply(lambda x: x.split("-")[0])
df['score_visitor'] = df['result'].apply(lambda x: x.split("-")[1])