# Part I: Extend

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

df = pd.read_csv('data/hockey.csv')

In [2]:
df.shape

(657, 13)

In [3]:
df[df['name'] == 'Auston Matthews'].head(3)

Unnamed: 0,id,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
1,2,matthau01,Auston Matthews,C,2018-10-03,TOR,Home,MTL,W,2,0,3,17.0
15,16,matthau01,Auston Matthews,C,2018-10-06,TOR,Home,OTT,L,1,1,4,20.0
22,23,matthau01,Auston Matthews,C,2018-10-07,TOR,Away,CHI,W,2,2,3,16.0


In [4]:
df.sample(3)

Unnamed: 0,id,player_id,name,position,date,team,venue,opponent,outcome,goals,assists,shots,ice_time
569,570,matthau01,Auston Matthews,C,2019-02-01,TOR,Away,DET,L-OT,1,1,3,21.0
517,518,karlser01,Erik Karlsson,D,2019-01-15,SJS,Home,PIT,W,0,1,4,24.0
118,119,kapanka01,Kasperi Kapanen,RW,2018-10-27,TOR,Home,WPG,W,1,0,3,14.0


In [5]:
df.name.unique().tolist()

['Alex Ovechkin',
 'Auston Matthews',
 'Brent Burns',
 'Erik Karlsson',
 'John Tavares',
 'Kasperi Kapanen',
 'Patrice Bergeron',
 'Travis Dermott',
 'Zach Hyman',
 'Sidney Crosby',
 'Jake Muzzin',
 'Connor McDavid',
 'William Nylander']

In [6]:
df['date'] = df['date'].apply(pd.to_datetime)

`train_test_split` on time series data is a little different...

In [7]:
df = df[df.date <= '2018-12-31']

In [8]:
df.shape

(437, 13)

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*goals\* next game based on the {goals, assists, ice time} rolling average for the last *five* games...

In [9]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [10]:
# need to shift games by one to predict next game
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

In [11]:
# re-align index
train = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

In [12]:
# (re-)identify our X and y matrices
target = 'goals_next'
X_train = train.drop(target, axis=1)
y_train = train[target]

In [13]:
print(X_train.shape)
X_train[:3]

(372, 5)


Unnamed: 0,position,goals,assists,shots,ice_time
58,C,1.0,1.2,2.8,15.8
73,C,1.2,1.4,3.6,16.6
81,C,1.0,1.2,3.8,17.8


In [14]:
print(y_train.shape)
y_train[:3]

(372,)


58    1.0
73    0.0
81    0.0
Name: goals_next, dtype: float64

### The Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper, CategoricalImputer

In [16]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [17]:
Z_train = mapper.fit_transform(X_train)

In [18]:
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [19]:
model.score(Z_train, y_train)

0.1281200080796906

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [20]:
demo = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'C']
})

pd.get_dummies(demo)

Unnamed: 0,position_C,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [21]:
demo_2 = pd.DataFrame({
    'position': ['LW', 'RW', 'RW', 'D']
})

pd.get_dummies(demo_2)

Unnamed: 0,position_D,position_LW,position_RW
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [22]:
new = pd.DataFrame({
    'position': ['🍔']
})

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [23]:
X_train.sample(5)

Unnamed: 0,position,goals,assists,shots,ice_time
209,D,0.0,0.4,1.6,17.8
318,C,1.2,0.8,3.4,18.4
381,D,0.0,1.0,4.6,24.0
247,C,0.4,0.0,2.6,17.2
392,D,0.0,1.2,4.2,24.8


In [24]:
lb = LabelBinarizer()
lb.fit(X_train['position'])
lb.transform(X_train['position'])

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [25]:
lb.classes_

array(['C', 'D', 'D/RW', 'LW', 'RW'], dtype='<U4')

In [26]:
lb.transform(new['position'])

array([[0, 0, 0, 0, 0]])

In [27]:
new = pd.DataFrame({
    'position': [None]
})

# lb.transform(new['position'])

In [28]:
mapper = DataFrameMapper([
    ('position', [CategoricalImputer(), LabelBinarizer()]),
    (['goals'], [SimpleImputer(), StandardScaler()]), 
    (['assists'], [SimpleImputer(), StandardScaler()]),
    (['shots'], [SimpleImputer(), StandardScaler()]), 
    (['ice_time'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [29]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_C,position_D,position_D/RW,position_LW,position_RW,goals,assists,shots,ice_time
58,1,0,0,0,0,1.69057,1.75374,-0.036663,-1.438185
73,1,0,0,0,0,2.224888,2.285436,0.70056,-1.152887
81,1,0,0,0,0,1.69057,1.75374,0.884866,-0.72494
93,1,0,0,0,0,0.087617,1.222043,0.516254,-0.368318
101,1,0,0,0,0,0.087617,1.75374,0.147643,-0.439642
110,1,0,0,0,0,0.087617,0.158651,0.516254,0.059629
119,1,0,0,0,0,-0.446701,-0.373045,-0.036663,0.130953
132,1,0,0,0,0,-0.446701,1.222043,-0.036663,0.059629
151,1,0,0,0,0,-0.446701,1.222043,0.331949,0.130953
157,1,0,0,0,0,-0.981019,0.690347,0.884866,0.344927


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [30]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('dataframemapper', DataFrameMapper(default=False, df_out=True,
        features=[('position', [CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
          strategy='most_frequent'), LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)]), (['goals'], [SimpleImputer(...ression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

### The Pickle 🥒

In [31]:
import pickle

with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [32]:
del pipe

In [33]:
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [34]:
pipe.score(X_train, y_train)

0.1281200080796906

In [35]:
pipe.predict(X_train)[:10]

array([0.52168727, 0.56945698, 0.55943287, 0.57367113, 0.59062786,
       0.48563282, 0.43943013, 0.55553534, 0.57661568, 0.59564802])

In [36]:
X_train.sample(1).to_dict(orient='list')

{'position': ['D'],
 'goals': [0.2],
 'assists': [0.2],
 'shots': [1.8],
 'ice_time': [18.4]}

In [37]:
new = pd.DataFrame({
    'position': ['RW'],
    'goals': [0.7],
    'assists': [0.0],
    'shots': [3],
    'ice_time': [20.0]
})

In [38]:
pipe.predict(new)

array([0.30075699])

### Time to Test

In [39]:
df = pd.read_csv('data/hockey.csv')
df['date'] = df['date'].apply(pd.to_datetime)
df = df[df.date > '2018-12-31']

In [40]:
X = (
    df
    .groupby(['player_id', 'position'])
    [['goals', 'assists', 'shots', 'ice_time']]
    .rolling(5)
    .mean()
    .reset_index()
    .rename(columns={'level_2': 'index'})
    .set_index('index')
    .dropna(subset=['goals'])
    [['position', 'goals', 'assists', 'shots', 'ice_time']]
)

In [41]:
y = df[['player_id', 'goals']].groupby('player_id').shift(-1)
y = y.dropna(subset=['goals'])

test = pd.merge(X, y, left_index=True, right_index=True, suffixes=('', '_next'))

target = 'goals_next'
X_test = train.drop(target, axis=1)
y_test = train[target]

In [42]:
score = pipe.score(X_test, y_test)
print(score)

0.1281200080796906


Not all that terrible TBH...

In [43]:
with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)