In [1]:
from sqlalchemy import create_engine
from config import Config
from pandas import read_sql
from transformers import BinaryEncoder, CityExtractor, ColDropper, XYTrainTestSplitter, PredictionTransformer, Scorer
import numpy as np

# Load the complete car_heat table from the configured database into a DataFrame.
eng = create_engine(Config.DATABASE_URI)
car_heat = read_sql(sql='select * from car_heat', con=eng)

# Format the time value as an "HH:MM" string instead of a Python datetime object.
# NOTE(review): 'time' is also in the list of columns handed to the dropper just below,
# so this conversion looks unused for now — confirm before removing it.
car_heat['time'] = [x.strftime("%H:%M") for x in car_heat['time']]

# Drop the columns which are currently not processed.
# The return value is ignored, so this presumably modifies car_heat in place — TODO confirm in ColDropper.
dropper = ColDropper()
dropper.transform(data=car_heat, columns=['date', 'match_hash', 'heat_id', 'time', 'round', 'year', 'stadium'])

# Replace club names with city names only, so that a club which changed its sponsor
# (and therefore its name) is still treated as the same team.
c_ex = CityExtractor()
c_ex.transform(car_heat)

# Create binary variables for the categorical columns (the four rider slots and both teams).
encoder = BinaryEncoder()
cat_cols = ['a_rider', 'b_rider', 'c_rider', 'd_rider', 'name_team_home', 'name_team_away']
encoder.fit_transform(data=car_heat, columns=cat_cols)

# Split into features (X) and targets (Y), each with a train and a test part.
# The four score columns a-d are the targets.
splitter = XYTrainTestSplitter()
X_train, Y_train, X_test, Y_test = splitter.transform(data=car_heat, y_columns = ['a_score','b_score','c_score','d_score'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
# One gradient-boosting regressor per target column: MultiOutputRegressor fits an
# independent copy of the wrapped estimator for every column of Y_train.
# random_state is fixed so that a Restart & Run All reproduces the same model and
# scores; without it, ties between equally good splits can be resolved differently
# from run to run.
# NOTE: the name `rfr` is historical (this is not a random forest); it is kept
# because the following cells refer to it.
rfr = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
rfr.fit(X_train, Y_train)

MultiOutputRegressor(estimator=GradientBoostingRegressor())

In [3]:
# Raw model output for the test rows: one predicted value per target column.
pred = rfr.predict(X_test)

In [4]:
# Post-process the raw regression output with the project's PredictionTransformer.
# NOTE(review): presumably this maps the continuous predictions onto discrete heat
# points — confirm in the transformers module.
pred_trans = PredictionTransformer()
predictions = pred_trans.transform(pred)

In [5]:
# Model score on the test set (for scikit-learn regressors this is the R^2 coefficient).
rfr.score(X_test,Y_test)

0.21164227265479102

In [6]:
# Same score on the training set, to compare against the test score above.
rfr.score(X_train,Y_train)

0.2543939617232893

In [7]:
# Evaluate the transformed test predictions with the project's Scorer.
# The returned dict reports counts and percentages for 'winner_correct' and 'all_correct'.
# The Scorer instance is kept in `score` (not created inline) so it can be inspected afterwards.
score = Scorer()
score.score(predictions, Y_test)

{'winner_correct_%': 45.375,
 'all_correct_%': 14.3122,
 'no_obs': 3773,
 'winner_correct': 1712,
 'all_correct': 540}

In [8]:
# Repeat the custom scoring on the training data, using a fresh Scorer so the
# test-set scorer created earlier is left untouched.
train_scorer = Scorer()
train_raw_pred = rfr.predict(X_train)
train_predictions = pred_trans.transform(train_raw_pred)
train_scorer.score(train_predictions, Y_train)

{'winner_correct_%': 49.028,
 'all_correct_%': 15.7303,
 'no_obs': 11214,
 'winner_correct': 5498,
 'all_correct': 1764}

In [9]:
# Feature importances of the second fitted sub-model (index 1), sorted ascending.
# NOTE(review): presumably index 1 is the model for 'b_score', following the order of y_columns — confirm.
np.sort(rfr.estimators_[1].feature_importances_)

array([0.        , 0.        , 0.        , ..., 0.12305137, 0.13394094,
       0.38045004])

In [34]:
import sys

# Print numpy arrays in full instead of summarising them with "...".
# `threshold` is documented as an int; sys.maxsize is the value the numpy docs
# recommend for untruncated output (np.inf is a float and only works incidentally).
# NOTE: this is a global setting and affects every array printed after this cell.
np.set_printoptions(threshold=sys.maxsize)

In [50]:
# Side-by-side view per test heat: scorer verdict | predicted values | actual scores.
stacked = np.column_stack((score.score_arr, predictions, np.array(Y_test)))

# Heats where the scorer judged the whole prediction correct. Kept in a variable so
# the result can be inspected (previously this expression was evaluated and discarded).
all_correct_rows = stacked[stacked[:, 0] == 'all correct']

# Print every test heat whose actual scores do not add up to the expected total.
# presumably a complete heat awards 3 + 2 + 1 + 0 = 6 points — TODO confirm.
EXPECTED_HEAT_TOTAL = 6
actual_scores = np.array(Y_test)
incomplete_heats = actual_scores[actual_scores.sum(axis=1) != EXPECTED_HEAT_TOTAL]
# The loop variable is deliberately NOT called `pred`: that name holds the raw model
# predictions from an earlier cell and must not be overwritten.
for heat_scores in incomplete_heats:
    print(heat_scores)

# TODO: decide whether heats with an incomplete score total should be excluded from the data.

[2 3 0 2]
[2 0 1 0]
[0 2 0 1]
[0 0 3 2]
[0 3 0 1]
[3 0 0 2]
[0 1 2 0]
[0 1 0 2]
[3 1 0 0]
[0 1 0 2]
[0 0 1 3]
[0 0 2 1]
[0 1 2 0]
[0 3 0 2]
[0 1 0 2]
[3 0 0 2]
[0 3 0 0]
[3 1 0 0]
[0 1 3 0]
[3 2 2 0]
[0 3 1 0]
[2 2 0 3]
[0 2 2 3]
[3 0 0 1]
[2 0 1 0]
[3 1 0 0]
[2 2 0 3]
[0 3 1 0]
[3 0 0 1]
[1 3 0 0]
[0 2 3 0]
[0 0 0 0]
[3 0 0 2]
[0 3 2 0]
[2 3 0 0]
[0 0 0 0]
[3 0 2 0]
[0 2 0 3]
[2 0 3 0]
[2 0 3 0]
[0 0 3 2]
[0 0 0 0]
[2 0 0 3]
[3 0 2 0]
[3 0 2 0]
[3 0 2 0]
[3 0 2 0]
[0 3 0 2]
[3 0 2 0]
[3 0 2 0]
[0 2 0 3]
[3 0 2 0]
[2 0 3 0]
[0 3 0 2]
[2 0 0 3]
[2 0 1 0]
[0 3 1 0]
[0 2 0 1]
[2 3 0 0]
[3 0 0 2]
[2 0 3 0]
[2 3 0 0]
[0 0 3 2]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]
[0 2 0 3]
[0 2 3 0]
[0 2 3 0]
[0 0 0 0]
[0 0 0 0]
[0 0 0 0]


In [43]:
# First row of the stacked view: scorer verdict, then the predictions, then the actual scores.
stacked[0]

array(['winner correct', '2', '1', '0', '3', '1', '0', '2', '3'],
      dtype='<U21')

In [26]:
# Importance table for the fourth fitted sub-model (index 3): rounded importance
# next to the feature name, shown from most to least important.
# NOTE(review): presumably index 3 is the model for 'd_score' — confirm.
rounded_importances = rfr.estimators_[3].feature_importances_.round(4)
feature_names = np.array(X_train.columns)
feat_imp = np.column_stack((rounded_importances, feature_names))
descending_order = feat_imp[:, 0].argsort()[::-1]
feat_imp[descending_order]

array([[0.447, 'd_rider_point_year_avg_1'],
       [0.0801, 'b_rider_point_year_avg_1'],
       [0.0793, 'd_previous_points'],
       ...,
       [0.0, 'c_rider_Piotr_Pawlicki'],
       [0.0, 'c_rider_Mirosław_Jabłoński'],
       [0.0, 'c_rider_Rafał_Dobrucki']], dtype=object)