In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [3]:
columns_read = ['Profile', 'Player', 'Goals',
                'Assists', 'Plus_minus', 'Penalties',
                'Shots', 'Hits', 'Shots_blocked',
                'Penalties_against', 'Icetime_seconds'
                ]
forwards = pd.read_csv('players/after_2014/forwards_match_after_2014.csv', usecols=columns_read)
defenses = pd.read_csv('players/after_2014/defenses_match_after_2014.csv')
goaltenders = pd.read_csv('players/after_2014/goaltenders_match_after_2014.csv')

In [4]:
columns_read = ['Profile', 'Player', 'Season', 'Year', 'Team', 'Winner',
                'Goals', 'Assists', 'Plus', 'Minus', 'Penalties',
                'Shots', 'Hits', 'Shots_blocked',
                'Penalties_against', 'Icetime_seconds', 'Rating'
                ]
forwards = pd.read_csv('forwards_match_with_rating.csv', usecols=columns_read)

In [5]:
forwards = forwards[forwards['Season'] == 'Regular season']
forwards = forwards[forwards['Icetime_seconds'] >= 480]
forwards = forwards[forwards.groupby(['Profile', 'Year']).Profile.transform('count') >= 20]
forwards.groupby(['Profile', 'Year'])['Player'].count()

Profile                           Year     
https://en.khl.ru/players/10769/  2014/2015    50
                                  2015/2016    53
                                  2016/2017    51
                                  2017/2018    22
                                  2018/2019    49
                                  2019/2020    59
                                  2020/2021    23
https://en.khl.ru/players/10892/  2014/2015    33
                                  2015/2016    54
https://en.khl.ru/players/10900/  2014/2015    55
                                  2015/2016    48
                                  2016/2017    47
https://en.khl.ru/players/10942/  2014/2015    20
https://en.khl.ru/players/10992/  2014/2015    20
https://en.khl.ru/players/119/    2014/2015    47
                                  2015/2016    49
                                  2016/2017    51
https://en.khl.ru/players/120/    2014/2015    59
                                  2015/2016    58
      

In [6]:
forwards = forwards[forwards.groupby(['Profile']).Year.transform('nunique') > 2]
forwards.groupby(['Profile', 'Year']).count()

forwards = forwards.groupby(['Profile', 'Year']).sum().reset_index()
forwards = forwards[forwards['Assists'] + forwards['Goals'] > 10]

In [7]:
forwards.head()

Unnamed: 0,Profile,Year,Goals,Assists,Plus,Minus,Penalties,Shots,Icetime_seconds,Hits,Shots_blocked,Penalties_against,Rating
0,https://en.khl.ru/players/10769/,2014/2015,13,22,29,35,47,144,48117,123.0,15.0,25.0,241
1,https://en.khl.ru/players/10769/,2015/2016,20,20,32,26,40,146,46309,98.0,14.0,9.0,281
2,https://en.khl.ru/players/10769/,2016/2017,16,18,26,17,49,123,50416,88.0,20.0,10.0,241
3,https://en.khl.ru/players/10769/,2017/2018,3,10,13,16,16,43,22301,40.0,12.0,1.0,76
4,https://en.khl.ru/players/10769/,2018/2019,14,14,28,26,28,104,49396,73.0,15.0,15.0,202


In [8]:
forwards.groupby(['Profile', 'Year']).sum().reset_index().head()

Unnamed: 0,Profile,Year,Goals,Assists,Plus,Minus,Penalties,Shots,Icetime_seconds,Hits,Shots_blocked,Penalties_against,Rating
0,https://en.khl.ru/players/10769/,2014/2015,13,22,29,35,47,144,48117,123.0,15.0,25.0,241
1,https://en.khl.ru/players/10769/,2015/2016,20,20,32,26,40,146,46309,98.0,14.0,9.0,281
2,https://en.khl.ru/players/10769/,2016/2017,16,18,26,17,49,123,50416,88.0,20.0,10.0,241
3,https://en.khl.ru/players/10769/,2017/2018,3,10,13,16,16,43,22301,40.0,12.0,1.0,76
4,https://en.khl.ru/players/10769/,2018/2019,14,14,28,26,28,104,49396,73.0,15.0,15.0,202


In [9]:
forwards.to_csv('forwards_seasons.csv', encoding='utf8', index=False)

In [10]:
forwards[['T1_Profile', 'T1_Year',	'T1_Goals', 'T1_Assists',	'T1_Plus',	'T1_Minus', 'T1_Penalties',
          'T1_Shots', 'T1_Icetime_seconds', 'T1_Hits', 'T1_Shots_blocked', 'T1_Penalties_against', 'T1_Rating']] \
    = forwards[['Profile',  'Year',	'Goals', 'Assists',	'Plus',	'Minus', 'Penalties',
          'Shots', 'Icetime_seconds', 'Hits', 'Shots_blocked', 'Penalties_against', 'Rating']].shift(-1)
forwards[['T2_Profile', 'T2_Year',	'T2_Goals', 'T2_Assists',	'T2_Plus',	'T2_Minus', 'T2_Penalties',
          'T2_Shots', 'T2_Icetime_seconds', 'T2_Hits', 'T2_Shots_blocked', 'T2_Penalties_against', 'T2_Rating']] \
    = forwards[['Profile',  'Year',	'Goals', 'Assists',	'Plus',	'Minus', 'Penalties',
                'Shots', 'Icetime_seconds', 'Hits', 'Shots_blocked', 'Penalties_against', 'Rating']].shift(-2)
forwards.head()

Unnamed: 0,Profile,Year,Goals,Assists,Plus,Minus,Penalties,Shots,Icetime_seconds,Hits,Shots_blocked,Penalties_against,Rating,T1_Profile,T1_Year,T1_Goals,T1_Assists,T1_Plus,T1_Minus,T1_Penalties,T1_Shots,T1_Icetime_seconds,T1_Hits,T1_Shots_blocked,T1_Penalties_against,T1_Rating,T2_Profile,T2_Year,T2_Goals,T2_Assists,T2_Plus,T2_Minus,T2_Penalties,T2_Shots,T2_Icetime_seconds,T2_Hits,T2_Shots_blocked,T2_Penalties_against,T2_Rating
0,https://en.khl.ru/players/10769/,2014/2015,13,22,29,35,47,144,48117,123.0,15.0,25.0,241,https://en.khl.ru/players/10769/,2015/2016,20.0,20.0,32.0,26.0,40.0,146.0,46309.0,98.0,14.0,9.0,281.0,https://en.khl.ru/players/10769/,2016/2017,16.0,18.0,26.0,17.0,49.0,123.0,50416.0,88.0,20.0,10.0,241.0
1,https://en.khl.ru/players/10769/,2015/2016,20,20,32,26,40,146,46309,98.0,14.0,9.0,281,https://en.khl.ru/players/10769/,2016/2017,16.0,18.0,26.0,17.0,49.0,123.0,50416.0,88.0,20.0,10.0,241.0,https://en.khl.ru/players/10769/,2017/2018,3.0,10.0,13.0,16.0,16.0,43.0,22301.0,40.0,12.0,1.0,76.0
2,https://en.khl.ru/players/10769/,2016/2017,16,18,26,17,49,123,50416,88.0,20.0,10.0,241,https://en.khl.ru/players/10769/,2017/2018,3.0,10.0,13.0,16.0,16.0,43.0,22301.0,40.0,12.0,1.0,76.0,https://en.khl.ru/players/10769/,2018/2019,14.0,14.0,28.0,26.0,28.0,104.0,49396.0,73.0,15.0,15.0,202.0
3,https://en.khl.ru/players/10769/,2017/2018,3,10,13,16,16,43,22301,40.0,12.0,1.0,76,https://en.khl.ru/players/10769/,2018/2019,14.0,14.0,28.0,26.0,28.0,104.0,49396.0,73.0,15.0,15.0,202.0,https://en.khl.ru/players/10769/,2019/2020,17.0,17.0,33.0,50.0,34.0,122.0,61315.0,84.0,27.0,6.0,200.0
4,https://en.khl.ru/players/10769/,2018/2019,14,14,28,26,28,104,49396,73.0,15.0,15.0,202,https://en.khl.ru/players/10769/,2019/2020,17.0,17.0,33.0,50.0,34.0,122.0,61315.0,84.0,27.0,6.0,200.0,https://en.khl.ru/players/10900/,2014/2015,11.0,24.0,32.0,30.0,34.0,83.0,52432.0,20.0,11.0,18.0,200.0


In [11]:
forwards = forwards[(forwards['Profile'] == forwards['T1_Profile']) & (forwards['Profile'] == forwards['T2_Profile'])]
forwards.reset_index(drop=True, inplace=True)
forwards.head()

Unnamed: 0,Profile,Year,Goals,Assists,Plus,Minus,Penalties,Shots,Icetime_seconds,Hits,Shots_blocked,Penalties_against,Rating,T1_Profile,T1_Year,T1_Goals,T1_Assists,T1_Plus,T1_Minus,T1_Penalties,T1_Shots,T1_Icetime_seconds,T1_Hits,T1_Shots_blocked,T1_Penalties_against,T1_Rating,T2_Profile,T2_Year,T2_Goals,T2_Assists,T2_Plus,T2_Minus,T2_Penalties,T2_Shots,T2_Icetime_seconds,T2_Hits,T2_Shots_blocked,T2_Penalties_against,T2_Rating
0,https://en.khl.ru/players/10769/,2014/2015,13,22,29,35,47,144,48117,123.0,15.0,25.0,241,https://en.khl.ru/players/10769/,2015/2016,20.0,20.0,32.0,26.0,40.0,146.0,46309.0,98.0,14.0,9.0,281.0,https://en.khl.ru/players/10769/,2016/2017,16.0,18.0,26.0,17.0,49.0,123.0,50416.0,88.0,20.0,10.0,241.0
1,https://en.khl.ru/players/10769/,2015/2016,20,20,32,26,40,146,46309,98.0,14.0,9.0,281,https://en.khl.ru/players/10769/,2016/2017,16.0,18.0,26.0,17.0,49.0,123.0,50416.0,88.0,20.0,10.0,241.0,https://en.khl.ru/players/10769/,2017/2018,3.0,10.0,13.0,16.0,16.0,43.0,22301.0,40.0,12.0,1.0,76.0
2,https://en.khl.ru/players/10769/,2016/2017,16,18,26,17,49,123,50416,88.0,20.0,10.0,241,https://en.khl.ru/players/10769/,2017/2018,3.0,10.0,13.0,16.0,16.0,43.0,22301.0,40.0,12.0,1.0,76.0,https://en.khl.ru/players/10769/,2018/2019,14.0,14.0,28.0,26.0,28.0,104.0,49396.0,73.0,15.0,15.0,202.0
3,https://en.khl.ru/players/10769/,2017/2018,3,10,13,16,16,43,22301,40.0,12.0,1.0,76,https://en.khl.ru/players/10769/,2018/2019,14.0,14.0,28.0,26.0,28.0,104.0,49396.0,73.0,15.0,15.0,202.0,https://en.khl.ru/players/10769/,2019/2020,17.0,17.0,33.0,50.0,34.0,122.0,61315.0,84.0,27.0,6.0,200.0
4,https://en.khl.ru/players/10900/,2014/2015,11,24,32,30,34,83,52432,20.0,11.0,18.0,200,https://en.khl.ru/players/10900/,2015/2016,12.0,14.0,23.0,30.0,20.0,85.0,46667.0,16.0,22.0,3.0,154.0,https://en.khl.ru/players/10900/,2016/2017,3.0,10.0,15.0,25.0,14.0,63.0,43573.0,22.0,23.0,7.0,72.0


In [12]:
forwards.drop(['T1_Profile', 'T2_Profile'], axis=1, inplace=True)
forwards.drop(['T1_Year', 'T2_Year'], axis=1, inplace=True)
forwards.drop(['Profile'], axis=1, inplace=True)

In [13]:
dummies = forwards.copy()
dummies = pd.get_dummies(dummies, drop_first=True)
dummies.head()

Unnamed: 0,Goals,Assists,Plus,Minus,Penalties,Shots,Icetime_seconds,Hits,Shots_blocked,Penalties_against,Rating,T1_Goals,T1_Assists,T1_Plus,T1_Minus,T1_Penalties,T1_Shots,T1_Icetime_seconds,T1_Hits,T1_Shots_blocked,T1_Penalties_against,T1_Rating,T2_Goals,T2_Assists,T2_Plus,T2_Minus,T2_Penalties,T2_Shots,T2_Icetime_seconds,T2_Hits,T2_Shots_blocked,T2_Penalties_against,T2_Rating,Year_2015/2016,Year_2016/2017,Year_2017/2018,Year_2018/2019,Year_2019/2020
0,13,22,29,35,47,144,48117,123.0,15.0,25.0,241,20.0,20.0,32.0,26.0,40.0,146.0,46309.0,98.0,14.0,9.0,281.0,16.0,18.0,26.0,17.0,49.0,123.0,50416.0,88.0,20.0,10.0,241.0,0,0,0,0,0
1,20,20,32,26,40,146,46309,98.0,14.0,9.0,281,16.0,18.0,26.0,17.0,49.0,123.0,50416.0,88.0,20.0,10.0,241.0,3.0,10.0,13.0,16.0,16.0,43.0,22301.0,40.0,12.0,1.0,76.0,1,0,0,0,0
2,16,18,26,17,49,123,50416,88.0,20.0,10.0,241,3.0,10.0,13.0,16.0,16.0,43.0,22301.0,40.0,12.0,1.0,76.0,14.0,14.0,28.0,26.0,28.0,104.0,49396.0,73.0,15.0,15.0,202.0,0,1,0,0,0
3,3,10,13,16,16,43,22301,40.0,12.0,1.0,76,14.0,14.0,28.0,26.0,28.0,104.0,49396.0,73.0,15.0,15.0,202.0,17.0,17.0,33.0,50.0,34.0,122.0,61315.0,84.0,27.0,6.0,200.0,0,0,1,0,0
4,11,24,32,30,34,83,52432,20.0,11.0,18.0,200,12.0,14.0,23.0,30.0,20.0,85.0,46667.0,16.0,22.0,3.0,154.0,3.0,10.0,15.0,25.0,14.0,63.0,43573.0,22.0,23.0,7.0,72.0,0,0,0,0,0


In [14]:
y = dummies['T2_Rating'].copy()
X = dummies.drop(
    ['T2_Rating', 'T2_Goals', 'T2_Assists',	'T2_Plus',	'T2_Minus', 'T2_Penalties',
     'T2_Shots', 'T2_Icetime_seconds', 'T2_Hits', 'T2_Shots_blocked', 'T2_Penalties_against'], axis=1).copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
print(X_test.head())

     Goals  Assists  Plus  Minus  Penalties  Shots  Icetime_seconds  Hits  \
210     11       18    36     13         53     93            48672  84.0   
824      6       11    23     28         20    117            51675  24.0   
78      21       15    29     10         28    100            48157  28.0   
67      26       50    58     25         22     96            53032  29.0   
679     14       16    28     16         40     92            44994  14.0   

     Shots_blocked  Penalties_against  Rating  T1_Goals  T1_Assists  T1_Plus  \
210           16.0               17.0     234      16.0        23.0     49.0   
824           22.0                8.0     128       3.0        13.0     18.0   
78             8.0                7.0     257       4.0        12.0     20.0   
67             6.0               27.0     491       9.0        16.0     21.0   
679           28.0               21.0     216      13.0        45.0     48.0   

     T1_Minus  T1_Penalties  T1_Shots  T1_Icetime_second

In [16]:
y_pred = X_test['T1_Rating']

In [17]:
print('MSE:', mean_squared_error(y_pred, y_test, squared=True))
print('RMSE:', mean_squared_error(y_pred, y_test, squared=False))
print('MAE:', mean_absolute_error(y_pred, y_test))
print('R_squared:', r2_score(y_pred, y_test))
print('MAPE:', mean_absolute_percentage_error(y_pred, y_test))

MSE: 7401.222222222223
RMSE: 86.0303563994839
MAE: 66.04761904761905
R_squared: -0.044010165935115975
MAPE: 0.36078941133442527


In [18]:
y_pred = (X_test['Rating'] + X_test['T1_Rating']) / 2

In [19]:
print('MSE:', mean_squared_error(y_pred, y_test, squared=True))
print('RMSE:', mean_squared_error(y_pred, y_test, squared=False))
print('MAE:', mean_absolute_error(y_pred, y_test))
print('R_squared:', r2_score(y_pred, y_test))
print('MAPE:', mean_absolute_percentage_error(y_pred, y_test))

MSE: 5831.376984126984
RMSE: 76.36345319671567
MAE: 61.0
R_squared: -0.12379360937403172
MAPE: 0.32420714180148114


In [20]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression(n_jobs=-1)
linear.fit(X_train, y_train)
y_pred = linear.predict(X_test)

In [21]:
print('MSE:', mean_squared_error(y_pred, y_test, squared=True))
print('RMSE:', mean_squared_error(y_pred, y_test, squared=False))
print('MAE:', mean_absolute_error(y_pred, y_test))
print('R_squared:', r2_score(y_pred, y_test))
print('MAPE:', mean_absolute_percentage_error(y_pred, y_test))

linear_coef = pd.DataFrame(zip(X.columns, linear.coef_))
linear_coef.columns = ['Feature', 'Coefficient']
print(linear_coef)


MSE: 5500.257702456384
RMSE: 74.1637222802118
MAE: 57.13011268070735
R_squared: -1.2418295390470329
MAPE: 0.3151070566290051
                 Feature  Coefficient
0                  Goals    -3.486314
1                Assists    -2.825452
2                   Plus    -2.431967
3                  Minus     1.868371
4              Penalties     0.664668
5                  Shots    -0.086189
6        Icetime_seconds    -0.001106
7                   Hits    -0.276298
8          Shots_blocked    -0.887005
9      Penalties_against    -0.584038
10                Rating     1.010053
11              T1_Goals   -12.814172
12            T1_Assists    -8.433122
13               T1_Plus    -4.021950
14              T1_Minus     4.961071
15          T1_Penalties     0.906936
16              T1_Shots    -0.681976
17    T1_Icetime_seconds    -0.000171
18               T1_Hits    -0.562879
19      T1_Shots_blocked    -1.245654
20  T1_Penalties_against    -0.353159
21             T1_Rating     2.322055
2

In [22]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_jobs=-1)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)


In [23]:
print('MSE:', mean_squared_error(y_pred, y_test, squared=True))
print('RMSE:', mean_squared_error(y_pred, y_test, squared=False))
print('MAE:', mean_absolute_error(y_pred, y_test))
print('R_squared:', r2_score(y_pred, y_test))
print('MAPE:', mean_absolute_percentage_error(y_pred, y_test))

MSE: 5872.392425000001
RMSE: 76.63153675217535
MAE: 58.61972222222223
R_squared: -1.6907127490389566
MAPE: 0.3184392296914355


In [24]:
from sklearn.ensemble import GradientBoostingRegressor

gradient = GradientBoostingRegressor()
gradient.fit(X_train, y_train)
y_pred = gradient.predict(X_test)

In [25]:
print('MSE:', mean_squared_error(y_pred, y_test, squared=True))
print('RMSE:', mean_squared_error(y_pred, y_test, squared=False))
print('MAE:', mean_absolute_error(y_pred, y_test))
print('R_squared:', r2_score(y_pred, y_test))
print('MAPE:', mean_absolute_percentage_error(y_pred, y_test))

MSE: 5776.600869161474
RMSE: 76.00395298378548
MAE: 59.20959193538842
R_squared: -1.4658644787435473
MAPE: 0.3195283799799917


In [26]:
from sklearn.linear_model import LassoCV

lasso = LassoCV()
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

In [27]:
print('MSE:', mean_squared_error(y_pred, y_test, squared=True))
print('RMSE:', mean_squared_error(y_pred, y_test, squared=False))
print('MAE:', mean_absolute_error(y_pred, y_test))
print('R_squared:', r2_score(y_pred, y_test))
print('MAPE:', mean_absolute_percentage_error(y_pred, y_test))

MSE: 5393.81106666276
RMSE: 73.44256985334023
MAE: 56.76654424680907
R_squared: -2.051499330223521
MAPE: 0.30444790482042106


In [28]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

In [29]:
print('MSE:', mean_squared_error(y_pred, y_test, squared=True))
print('RMSE:', mean_squared_error(y_pred, y_test, squared=False))
print('MAE:', mean_absolute_error(y_pred, y_test))
print('R_squared:', r2_score(y_pred, y_test))
print('MAPE:', mean_absolute_percentage_error(y_pred, y_test))


MSE: 5495.262945151805
RMSE: 74.1300407739791
MAE: 57.10472493401094
R_squared: -1.2436820682423977
MAPE: 0.31486803527459795
