In [1]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Root of the project tree. It defaults to the original author's machine; set
# the NCAA_2K19_ROOT environment variable to run the notebook elsewhere.
# The trailing separator is normalised because every path below is built by
# plain string concatenation. The leading underscore keeps the name out of
# reach of the wildcard import further down.
_NCAA_ROOT = os.environ.get(
    'NCAA_2K19_ROOT',
    'C:/Users/cg08900/Documents/Pandora/Personale/kaggle/NCAA_2k19/'
).rstrip('/\\') + '/'

# The working directory is switched before importing ncaa19 -- presumably so
# that the helper module resolves from there (TODO confirm).
os.chdir(_NCAA_ROOT + 'prog/functions/')

# NOTE(review): the wildcard import hides which names come from ncaa19;
# switch to explicit imports once the names actually used are known.
from ncaa19 import *

sns.set()

# Directory prefixes; each ends with '/' so a file name can be appended directly.
# PATH_OUTPUT and PATH_DATASETS point to the same folder, as in the original.
PATH_DATASETS = _NCAA_ROOT + 'datasets/'
PATH_OUTPUT = _NCAA_ROOT + 'datasets/'
PATH_ELAB = _NCAA_ROOT + 'elaborazioni/'

# Columns of the feature table that are fed (after an isotonic transform) to
# the logistic model in the training cell.
# NOTE(review): the "delta_" prefix presumably denotes a difference between
# the two teams of a matchup -- confirm against the dataset builder.
REGRESSORS = [
    'delta_poss_m', 'delta_opp_poss_m',
    'delta_ass_ratio', 'delta_tov_ratio',
    'delta_reb_rate', 'delta_opp_true_fg_pct',
    'delta_off_rating_m', 'delta_def_rating_m',
    'delta_net_rating_m', 'delta_pace_m',
    'delta_off_rating_m_last30D', 'delta_def_rating_m_last30D',
    'delta_net_rating_m_last30D', 'delta_off_rating_m_vs_topseeds',
    'delta_def_rating_m_vs_topseeds', 'delta_net_rating_m_vs_topseeds',
    'delta_c_N_season', 'delta_w_pct', 'delta_seed_int',
    'delta_w_pct_last30D', 'delta_w_pct_vs_topseeds',
    'delta_c_W_PCT_allT', 'delta_c_W_PCT_vs_topseeds_allT',
    'delta_MOR', 'delta_POM',
    'delta_SAG',
]

# Pipe-separated feature table; it must contain at least 'Season',
# 'win_dummy' and every column listed in REGRESSORS (all are read below).
df_features_all = pd.read_csv(PATH_OUTPUT + '04Py_NCAA_dataset_con_indicatori.csv',
                              sep='|')

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
import numpy as np
from itertools import chain

def ir_fit(var_series, target_series):
  """Fit a one-dimensional isotonic regression of the target on a feature.

  Parameters
  ----------
  var_series : 1-D array-like
      Feature values (the x axis of the monotone fit).
  target_series : 1-D array-like
      Target values, same length as ``var_series``.

  Returns
  -------
  IsotonicRegression
      The fitted estimator. ``increasing="auto"`` lets scikit-learn choose
      whether the fitted curve is increasing or decreasing.

  Notes
  -----
  ``out_of_bounds="clip"`` is set explicitly. With the library default
  (``"nan"``) any value outside the range seen during fitting is predicted
  as NaN, which happens for unseen seasons whose feature values are more
  extreme than the training data. Clipping maps such values to the
  prediction at the nearest fitted end point instead.
  """
  ir = IsotonicRegression(increasing="auto", out_of_bounds="clip")
  ir.fit(var_series, target_series)
  return ir

def ir_apply(ir, var_series):
  """Transform a feature with a fitted isotonic model.

  Parameters
  ----------
  ir : object exposing ``predict``
      Fitted transformer (as returned by ``ir_fit``).
  var_series : pandas.Series
      Feature values to transform.

  Returns
  -------
  pandas.Series
      Predictions of ``ir``, carrying the same index as ``var_series`` so the
      result can be assigned straight back as a DataFrame column.
  """
  fitted_values = ir.predict(var_series)
  return pd.Series(fitted_values, index=var_series.index)

def ir_plot(var_base, var_iso, var_target, n_classes = 10):
  """Plot observed vs. isotonic-fitted target means per quantile bucket.

  Parameters
  ----------
  var_base : pandas.Series
      Raw feature used to build the quantile buckets.
  var_iso : pandas.Series
      Isotonic-transformed feature, averaged per bucket (red line).
  var_target : pandas.Series
      Observed target, averaged per bucket (blue line).
  n_classes : int, default 10
      Requested number of quantile buckets. Buckets with duplicate edges are
      dropped, so fewer may be produced.

  Returns
  -------
  None
      The figure is shown with ``plt.show()``.
  """
  # Integer bucket id for every observation.
  bucket_id = pd.qcut(var_base, n_classes, labels=False, duplicates='drop')
  fitted_by_bucket = var_iso.groupby(bucket_id).mean()
  observed_by_bucket = var_target.groupby(bucket_id).mean()

  figure = plt.figure()
  axes = figure.add_subplot(111)
  # Observed first, fitted second, so the draw order matches the original.
  for bucket_means, line_format in ((observed_by_bucket, 'bs-'),
                                    (fitted_by_bucket, 'rs-')):
    axes.plot(bucket_means, line_format)
  axes.yaxis.grid(True, linestyle='--', linewidth=1)
  # Axis labels are kept in the original Italian ("percentiles of variable X").
  axes.xaxis.set_label_text("Percentili variabile X")
  axes.yaxis.set_label_text("Td")
  plt.show()

def logloss(y_hat, p_1win, eps=1e-15):
    """Mean binary cross-entropy (log loss).

    Parameters
    ----------
    y_hat : numpy.ndarray or pandas.Series
        Observed 0/1 outcomes. Despite the name, these are the true labels,
        not predictions.
    p_1win : numpy.ndarray or pandas.Series
        Predicted probability of outcome 1 for each observation.
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns
    -------
    float
        ``-(1/n) * sum(y*log(p) + (1-y)*log(1-p))`` with ``n = len(y_hat)``.

    Raises
    ------
    ZeroDivisionError
        If ``y_hat`` is empty.
    """
    n = len(y_hat)
    # log(0) is -inf and 0 * -inf is NaN, so a probability of exactly 0 or 1
    # would turn the whole score into NaN/inf even when the prediction is right.
    p = p_1win.clip(eps, 1 - eps)
    per_obs = y_hat * np.log(p) + (1 - y_hat) * np.log(1 - p)
    # np.asarray keeps NaN propagation (a pandas .sum() would silently skip NaN).
    return (-1 / n) * np.sum(np.asarray(per_obs, dtype=float))

def logistic(df_tr, features, target, df_valid=None, penalty='l2', C=1.0):
    """Fit a logistic regression and optionally score it on a validation set.

    Parameters
    ----------
    df_tr : pandas.DataFrame
        Training data containing ``features`` and ``target``.
    features : sequence of str
        Names of the regressor columns.
    target : str
        Name of the binary target column.
    df_valid : pandas.DataFrame, optional
        Validation data with the same columns. When given, the clipped
        probability range and the validation log loss are printed.
    penalty : str, default 'l2'
        Regularisation type passed to ``LogisticRegression``.
    C : float, default 1.0
        Inverse regularisation strength passed to ``LogisticRegression``.

    Returns
    -------
    pandas.DataFrame
        One row per coefficient (intercept first) with columns ``feature``,
        ``beta`` and ``logloss``. ``logloss`` holds the validation log loss
        repeated on every row, or NaN when ``df_valid`` is None.
    """
    features = list(features)  # allow tuples / Index objects as well as lists
    X = df_tr.loc[:, features]
    y = df_tr.loc[:, target]

    # An L1 penalty is not supported by every solver; liblinear supports it and
    # was the library default when this notebook was written. Other penalties
    # keep whatever default the installed scikit-learn uses.
    solver_kwargs = {'solver': 'liblinear'} if penalty == 'l1' else {}
    sk_model = LogisticRegression(fit_intercept=True, penalty=penalty, C=C,
                                  **solver_kwargs).fit(X, y)
    df_results = pd.DataFrame({'feature': ["(Intercept)"] + features,
                               'beta': sk_model.intercept_.tolist() + sk_model.coef_[0].tolist()})

    if df_valid is None:
        # Nothing to score: the original code crashed here on undefined names.
        df_results['logloss'] = np.nan
        return df_results

    # TODO (from the original author): confirm with Stefano that zero is the
    # right fill for missing validation regressors.
    X_valid = df_valid.loc[:, features].fillna(0)
    y_valid = df_valid.loc[:, target]
    p_valid = pd.Series(sk_model.predict_proba(X_valid)[:, 1], index=y_valid.index)
    # Cap over-confident predictions to limit the log-loss penalty of a miss.
    p_valid = p_valid.clip(0.025, 0.975)
    print(max(p_valid))
    print(min(p_valid))
    valid_logloss = logloss(y_valid, p_valid)
    print("Log loss for test: {0}".format(valid_logloss))

    df_results['logloss'] = valid_logloss
    return df_results

In [3]:
## Expanding-window backtest over 4 test seasons (2015-2018):
## fit on seasons 2008..(test season - 1), then score the test season.

fold_results = []
for year_ in range(4):
    test_season = 2015 + year_
    season = df_features_all['Season']

    # Missing values are replaced by 0 in both sets before any transformation.
    df_train = df_features_all.loc[season.between(2008, test_season - 1), :].fillna(0)
    df_test = df_features_all.loc[season == test_season, :].fillna(0)

    # Isotonic transformation: each map is fitted on the training seasons only
    # and then applied to both sets, stored under a 'piso_' prefixed column.
    # For a visual check of one feature, call
    # ir_plot(df_test[regressor], df_test[piso_col], df_test['win_dummy'], 25)
    # inside the loop.
    for regressor in REGRESSORS:
        piso_col = 'piso_' + regressor
        ir_var = ir_fit(df_train[regressor], df_train['win_dummy'])
        df_train[piso_col] = ir_apply(ir_var, df_train[regressor])
        df_test[piso_col] = ir_apply(ir_var, df_test[regressor])

    print('testing on: ' + str(test_season))
    out_ = logistic(df_train, ['piso_' + r for r in REGRESSORS],
                    'win_dummy', df_test, penalty='l1', C=0.50)
    out_['test_year'] = test_season
    fold_results.append(out_)

# One table with the coefficients and log loss of every test season; the
# per-fold row index is kept as-is.
out_f = pd.concat(fold_results)



testing on: 2015
0.9527760550481392
0.04193384858381864
Log loss for test: 0.5379789498881105
testing on: 2016
0.9189498779239839
0.05225070439385922
Log loss for test: 0.546762067160723
testing on: 2017




0.9475436760444634
0.05057507638417902
Log loss for test: 0.5370035508239486
testing on: 2018
0.9093738137582613
0.04946188695744998
Log loss for test: 0.6127620664186489


In [4]:
# Save the stacked per-season results table (out_f) as an Excel workbook in
# the datasets folder.
results_xlsx = PATH_DATASETS + "09Py_NCAA_ciclo_primo_modello_lasso_isotonic.xlsx"
out_f.to_excel(results_xlsx)