In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.tabular import *

In [3]:
from IPython.display import display, HTML

In [4]:
from fastai import tabular

In [5]:
from exp.nb_ import *

In [6]:
# Let pandas render up to 2000 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 2000

In [7]:
# Data directory, relative to the notebook; also reused below as the `path`
# of the TabularList and as the location of the exported hyper-parameter pickle.
path=Path('../data/football/')
# Load the pickled training frame.
# NOTE(review): `read_pickle` executes arbitrary code on load — only use with trusted files.
train_df = pd.read_pickle(path/'trans_over_1000_final.pkl')

In [8]:
# Columns treated as categorical features (order is preserved downstream).
cat_vars_tpl = (
    # season / transfer date / contract / age / transfer-type flags
    'season', 'trs_year', 'trs_month', 'trs_day', 'trs_till_deadline',
    'contract_left_months', 'contract_left_years', 'age',
    'is_midseason', 'is_loan', 'is_end_of_loan',
    # nat_* and plr_* columns
    'nat_national_name', 'plr_position_main',
    'plr_other_positions', 'plr_nationality_name',
    'plr_other_nationality_name', 'plr_place_of_birth_country_name',
    'plr_foot', 'plr_height', 'plr_player_agent',
    # from_* columns
    'from_club_name', 'from_club_is_first_team',
    'from_clb_place', 'from_clb_qualified_to',
    'from_clb_is_champion', 'from_clb_is_cup_winner',
    'from_clb_is_promoted', 'from_clb_lg_name',
    'from_clb_lg_country', 'from_clb_lg_group',
    'from_coach_name', 'from_sport_dir_name',
    # to_* columns
    'to_club_name', 'to_club_is_first_team',
    'to_clb_place', 'to_clb_qualified_to',
    'to_clb_is_champion', 'to_clb_is_cup_winner',
    'to_clb_is_promoted', 'to_clb_lg_name',
    'to_clb_lg_country', 'to_clb_lg_group',
    'to_coach_name', 'to_sport_dir_name',
    # indexed plr_position_* and stats_leag_* columns
    'plr_position_0', 'plr_position_1', 'plr_position_2',
    'stats_leag_name_0', 'stats_leag_grp_0',
    'stats_leag_name_1', 'stats_leag_grp_1',
    'stats_leag_name_2', 'stats_leag_grp_2',
)

In [9]:
# Columns treated as continuous features (order is preserved downstream).
cont_vars_tpl = (
    # nat_* columns
    'nat_months_from_debut', 'nat_matches_played', 'nat_goals_scored',
    # from_* / to_* club averages
    'from_clb_pts_avg', 'from_clb_goals_diff_avg',
    'to_clb_pts_avg', 'to_clb_goals_diff_avg',
    # plr_apps_* columns
    'plr_apps_0', 'plr_apps_1', 'plr_apps_2',
    # stats_* columns, suffix 0
    'stats_made_goals_0', 'stats_conc_gols_0', 'stats_cards_0',
    'stats_minutes_0', 'stats_team_points_0',
    # stats_* columns, suffix 1
    'stats_made_goals_1', 'stats_conc_gols_1', 'stats_cards_1',
    'stats_minutes_1', 'stats_team_points_1',
    # stats_* columns, suffix 2
    'stats_made_goals_2', 'stats_conc_gols_2', 'stats_cards_2',
    'stats_minutes_2', 'stats_team_points_2',
    # remaining column
    'pop_log1p',
)

In [10]:
# Mutable list copies of the feature-name tuples, plus their concatenation
# (categorical names first, then continuous).
cat_vars = [*cat_vars_tpl]
cont_vars = [*cont_vars_tpl]
all_vars = [*cat_vars, *cont_vars]

In [11]:
# Show which columns of `train_df` are in neither feature list.
# NOTE(review): `list_diff` comes from `exp.nb_`; semantics inferred from the output below — confirm.
list_diff(train_df.columns, cat_vars, cont_vars)

['player_name',
 'market_value',
 'fee',
 'is_future_transfer',
 'plr_place_of_birth_name']

Of these, `fee` is kept as the dependent variable. We drop the rest: effectively unique values (`player_name`, `plr_place_of_birth_name`), constants (`is_future_transfer`, since this dataset no longer contains any future transfers), and values we don't want to use for now (Transfermarkt's `market_value`).

In [12]:
# Name of the target column the model is trained to predict.
dep_var = 'fee'

In [13]:
# Working copy restricted to the model's feature columns plus the target.
df = train_df[[*all_vars, dep_var]].copy()

In [14]:
# Fixed seed so the same validation rows are drawn on every run.
np.random.seed(1001)
ln = len(df)
# Sample 20% of the row positions, without repeats, as the validation set.
valid_idx = np.random.choice(ln, size=int(ln * 0.2), replace=False)

In [15]:
# Sanity check: number of rows held out for validation.
len(valid_idx)

1952

In [16]:
# Preprocessing steps handed to `TabularList.from_df(..., procs=procs)` below.
procs=[FillMissing, Categorify, Normalize]

In [17]:
# Output bounds for the network, expressed in log space.
# Lower bound: log(500). Upper bound: log of 1.2 x the largest `dep_var` value in `df`.
min_log_y = np.log(500)
max_log_y = np.log(np.max(df[dep_var])*1.2)
# Passed to `tabular_learner` as `y_range`; created on fastai's default device.
y_range = torch.tensor([min_log_y, max_log_y], device=defaults.device)

In [18]:
def mape(pred:Tensor, targ:Tensor)->Rank0Tensor:
    "Median of the absolute relative error `|(targ - pred) / targ|` (a fraction, not multiplied by 100)."
    flat_pred, flat_targ = flatten_check(pred, targ)
    rel_err = (flat_targ - flat_pred) / flat_targ
    return rel_err.abs().median()

In [19]:
#export
def exp_mmape(pred:Tensor, targ:Tensor)->Rank0Tensor:
    "Median absolute relative error computed after exponentiating both `pred` and `targ`."
    flat_pred, flat_targ = flatten_check(pred, targ)
    # Undo the log transform on both sides before comparing.
    pred_lin = flat_pred.exp()
    targ_lin = flat_targ.exp()
    rel_err = (targ_lin - pred_lin) / targ_lin
    return rel_err.abs().median()

In [20]:
def avg_exp_mmape_n_mse(pred:Tensor, targ:Tensor)->Rank0Tensor:
    "Arithmetic mean of `exp_mmape(pred, targ)` and `mean_squared_error(pred, targ)`."
    mmape_term = exp_mmape(pred, targ)
    mse_term = mean_squared_error(pred, targ)
    return (mmape_term + mse_term)/2

In [21]:
#export
from fastai.layers import FlattenedLoss

In [22]:
#export
def MAELossFlat(*args, axis:int=-1, floatify:bool=True, **kwargs):
    "Same as `nn.L1Loss` (mean absolute error), but flattens input and target."
    # Options forwarded to `FlattenedLoss`; `is_2d` is always forced to False.
    flat_opts = dict(axis=axis, floatify=floatify, is_2d=False)
    return FlattenedLoss(nn.L1Loss, *args, **flat_opts, **kwargs)

In [23]:
# Batch size, passed to `.databunch(bs=BS)` below.
BS = 128

In [24]:
def emb_sz_rule_reduced(n_cat:int)->int:
    "Embedding width for a variable with `n_cat` categories: `round(1.6 * n_cat**0.56)`, capped at 10."
    width = round(1.6 * n_cat**0.56)
    return min(10, width)

In [25]:
# Monkey-patch the embedding size rule, as 600 floats is too much for our case.
# NOTE(review): presumably this must run before the learner is built so the reduced rule is picked up — confirm.
tabular.data.emb_sz_rule = emb_sz_rule_reduced

In [26]:
# Hyper-parameters for this run; also exported in the `params` dict further down.
layers = [1000,500]  # passed to tabular_learner(layers=...)
layers_drop = [0.07, 0.7]  # passed to tabular_learner(ps=...), one value per entry in `layers`
emb_drop = 0.7  # passed to tabular_learner(emb_drop=...)
cycles = 40  # passed to fit_one_cycle(cyc_len=...)
w_decay = 0.7  # passed to fit_one_cycle(wd=...)
max_lr = 1e-3  # passed to fit_one_cycle(max_lr=...)

In [27]:
# Build the DataBunch: features from `df`, validation rows given by `valid_idx`,
# float labels from `dep_var` with `log=True` (the metrics above exponentiate
# predictions and targets to compensate), batched with size `BS`.
data = (TabularList.from_df(df, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs)
        .split_by_idx(valid_idx)
        .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
        .databunch(bs=BS))

In [28]:
# Seed both NumPy and PyTorch. Weight initialisation here, and dropout and
# batch shuffling during the later fit, draw from PyTorch's RNG, which
# `np.random.seed` alone does not touch.
# NOTE(review): bit-exact repeatability on GPU may additionally need cuDNN determinism flags.
np.random.seed(1001)
torch.manual_seed(1001)
# Tabular model trained with L1 loss on the log target; `exp_mmape` is reported
# per epoch and `CSVLogger` is attached as a callback.
learn = tabular_learner(data,
                        layers=layers,
                        ps=layers_drop,
                        emb_drop=emb_drop,
                        y_range=y_range,
                        metrics=exp_mmape,
                        loss_func=MAELossFlat(),
                        callback_fns=[CSVLogger])

#### Fit

In [29]:
# Train for `cycles` epochs with the one-cycle schedule, peak learning rate `max_lr` and weight decay `w_decay`.
learn.fit_one_cycle(cyc_len=cycles, max_lr=max_lr, wd=w_decay)

epoch,train_loss,valid_loss,exp_mmape,time
0,1.258965,1.104291,1.680418,00:02
1,1.180123,1.032551,1.123483,00:02
2,1.061284,0.889907,0.673526,00:02
3,0.943476,0.805942,0.560289,00:02
4,0.847718,0.709671,0.521456,00:02
5,0.743637,0.607571,0.478198,00:02
6,0.642968,0.550358,0.461449,00:02
7,0.568827,0.500691,0.405564,00:02
8,0.522954,0.494220,0.403703,00:02
9,0.488195,0.469534,0.383873,00:02
10,0.459300,0.466615,0.365734,00:02
11,0.439333,0.471351,0.374001,00:02
12,0.426345,0.455230,0.371424,00:02
13,0.422517,0.464315,0.376226,00:02
14,0.410307,0.456378,0.359223,00:02
15,0.400872,0.457877,0.358762,00:02
16,0.393798,0.460518,0.367701,00:02
17,0.386247,0.461936,0.369207,00:02
18,0.378324,0.449494,0.354346,00:02
19,0.374986,0.444790,0.349194,00:02
20,0.366961,0.440497,0.340728,00:02
21,0.362839,0.443647,0.349292,00:02
22,0.358753,0.448197,0.343309,00:02
23,0.353878,0.440448,0.346875,00:02
24,0.346549,0.448140,0.353718,00:02
25,0.345610,0.451065,0.356221,00:02
26,0.338368,0.444274,0.350040,00:02
27,0.332639,0.455951,0.363697,00:02
28,0.320565,0.452949,0.352383,00:02
29,0.322833,0.449920,0.351235,00:02
30,0.318272,0.453192,0.356277,00:02
31,0.310111,0.450475,0.358385,00:02
32,0.311353,0.451541,0.358633,00:02
33,0.305918,0.448644,0.355860,00:02
34,0.303241,0.450886,0.359607,00:02
35,0.300089,0.449848,0.355004,00:02
36,0.297784,0.450052,0.359004,00:02
37,0.294179,0.449693,0.359334,00:02
38,0.296302,0.450717,0.359841,00:02
39,0.296984,0.450700,0.358207,00:02


In [30]:
# Score the trained learner with `exp_mmape`.
# NOTE(review): `calc_valid_acc` comes from `exp.nb_`; presumably it evaluates `func` over the
# whole validation set at once, which would explain the difference from the last epoch's value — confirm.
calc_valid_acc(learn=learn, func=exp_mmape)

0.34924299999999997

### Export learner and parameters

In [86]:
# Snapshot of the hyper-parameters used for this run, keyed by name.
params = dict(
    layers=layers,
    layers_drop=layers_drop,
    emb_drop=emb_drop,
    cycles=cycles,
    w_decay=w_decay,
    max_lr=max_lr,
)

In [87]:
# Display the hyper-parameter snapshot so it is recorded in the notebook output.
params

{'layers': [1000, 500],
 'layers_drop': [0.07, 0.7],
 'emb_drop': 0.7,
 'cycles': 40,
 'w_decay': 0.7,
 'max_lr': 0.001}

In [80]:
# Base name shared by the exported hyper-parameter pickle and the saved model.
name = 'w_pop_ref-346-median'

In [88]:
# Persist the hyper-parameter dict next to the data as `<name>_hypers.pkl`.
with open(path/f'{name}_hypers.pkl', 'wb') as handle:
    pickle.dump(params, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [82]:
# Save the trained model under `name`; the trailing `;` suppresses the cell output.
learn.save(name);

In [83]:
# Reload the checkpoint saved under `name` to confirm it can be restored.
learn = learn.load(name)