In [85]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

In [2]:
data_path = Path('.') / 'data'
filenames = {
    'x_train': 'AMF_train_X_XCZw8r3.csv',
    'y_train': 'AMF_train_Y_omYQJZL.csv',
    'x_test':'AMF_test_X_uDs0jHH.csv',
}

In [3]:
x_train = pd.read_csv(data_path / filenames['x_train'])
y_train = pd.read_csv(data_path / filenames['y_train'])
x_test = pd.read_csv(data_path / filenames['x_test'])

# Constitution d'un pipeline

1. Principe

Le pipeline ne se basera que sur une liste d'identifiants de traders. Au sein du pipe, on récupèrera le détail des données par jointure, calculera une aggrégation / classification par trader et on remettra dans l'ordre initial des identifiants.

In [4]:
x_all = pd.concat([x_train, x_test], axis=0)
x_all.sample(5)

Unnamed: 0,Index,Share,Day,Trader,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,...,min_dt_TV1_TV2,mean_dt_TV1_TV2,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade
60272,60273,Isin_122,Date_11,Flash,3.212575,53.65,536.5,0.0,5868.068582,0.0,...,0.001,135.473895,73.145,0.001,126.660474,48.37,0.001,106.793949,38.129,184
27138,27139,Isin_55,Date_30,Trader_195,6.5,78.0,,0.0,4211.969551,0.0,...,,,,,,,,,,11
52263,52264,Isin_49,Date_16,Evinrude,129.8,38.176471,,0.0,1326.20899,6e-06,...,0.001863,93.474255,0.001863,0.001863,0.001863,0.001863,0.001863,0.001863,0.001863,29
67669,67670,Isin_116,Date_6,Trader_329,9.096291,2.508458,2125.5,0.0,44.816966,0.035147,...,1e-06,9.318502,0.001881,1e-06,10.977039,0.002693,2e-06,11.251883,0.00256,1081
74372,74373,Isin_74,Date_9,Trader_358,4.6,3.332386,167.571429,0.0,45.572005,0.006822,...,0.001,102.316702,47.581,0.001,123.232062,56.253,0.001,118.507958,53.578,261


On aura régulièrement d'un sample de Traders, on le calcule ici :

In [5]:
trader_sample = x_all.Trader.sample(10, random_state=0)
trader_sample

6364     Baileywick
78017    Trader_380
92357    Trader_430
33606    Trader_209
4111     Trader_114
82958    Trader_386
42983    Trader_233
89721    Trader_430
81226    Trader_386
77158      Megavolt
Name: Trader, dtype: object

2. Récupération des données

La fonction `get_X_from_indices` retourne les données liées aux traders passés en paramètre, avec une colonne 'order' qui trace l'ordre initial.

In [6]:
def get_X_from_indices(indices):
    order = pd.Series(range(len(indices)), index=indices, name='order')
    df = x_all.loc[x_all.Trader.isin(indices)].merge(order, left_on='Trader', right_index=True)
    return(df)
get_X_transformer = FunctionTransformer(get_X_from_indices)

3. Choix des colonnes

In [7]:
columns = [
    'OTR', 'OCR', 'OMR', 'NbTradeVenueMic', 'mean_lifetime_cancel', 'max_time_two_events', 'NbSecondWithAtLeatOneTrade',    
]

In [37]:
def select_cols(X, cols=None):
    if 'order' in X.columns:
        order_col = ['order']
    else:
        order_col = []
    if cols:
        return(X[['Trader'] + cols + order_col])
    else:
        return(X)
column_selector = FunctionTransformer(select_cols, kw_args=dict(cols=None))

4. Data preprocessing

In [38]:
def preprocessor(X, fill_value=None):
    if fill_value is not None:
        return(X.fillna(fill_value))
    else:
        return(X)
data_preprocessor = FunctionTransformer(preprocessor, kw_args=dict(fill_value=None))    

5. Data aggregation

C'est à cette étape qu'on remet les samples bien dans l'ordre initial.

In [39]:
def aggregator(X, groupers=None, func=None):
    if groupers:
        if not func:
            func = sum
        return(X.groupby(groupers + ['order']).agg(func).reset_index('order').sort_values('order'))
    else:
        return(X.sort_values('order'))
aggregator_transformer = FunctionTransformer(aggregator, kw_args=dict(groupers=None, func=None))

# Constitution du pipeline

On construit le pipeline et on teste

In [40]:
pipe = Pipeline(
    [
        ('getX', get_X_transformer),
        ('selectCols', column_selector),
        ('preprocessor', data_preprocessor),
        ('aggregator', aggregator_transformer),
        ('classifier', RandomForestClassifier()),
    ],
)

On vérifie que le pipeline fonctionne.

In [41]:
dumb_pred = pipe.set_params(
    selectCols__kw_args=dict(cols=['OTR', 'OCR', 'OMR', 'mean_lifetime_cancel']),
    preprocessor__kw_args=dict(fill_value=0.),
    aggregator__kw_args=dict(groupers=['Trader'], func='mean'),
).fit(y_train.Trader, y_train.type).predict(y_train.Trader)
dumb_pred

array(['MIX', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'HFT',
       'NON HFT', 'NON HFT', 'NON HFT', 'MIX', 'NON HFT', 'NON HFT',
       'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT',
       'NON HFT', 'NON HFT', 'NON HFT', 'MIX', 'NON HFT', 'MIX', 'MIX',
       'MIX', 'NON HFT', 'NON HFT', 'MIX', 'NON HFT', 'HFT', 'NON HFT',
       'MIX', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT',
       'NON HFT', 'HFT', 'MIX', 'NON HFT', 'NON HFT', 'NON HFT',
       'NON HFT', 'NON HFT', 'HFT', 'HFT', 'HFT', 'NON HFT', 'MIX', 'MIX',
       'MIX', 'MIX', 'NON HFT', 'MIX', 'NON HFT', 'NON HFT', 'NON HFT',
       'MIX', 'NON HFT', 'HFT', 'NON HFT', 'HFT', 'MIX', 'NON HFT', 'HFT',
       'HFT', 'NON HFT', 'MIX', 'HFT', 'NON HFT', 'NON HFT', 'MIX',
       'NON HFT', 'HFT', 'NON HFT', 'HFT', 'MIX', 'HFT', 'MIX', 'MIX',
       'MIX', 'MIX', 'HFT', 'MIX'], dtype=object)

In [42]:
pd.concat(
    [
        y_train.sample(10, random_state=23).reset_index(drop=True),
        pd.Series(pipe.predict(y_train.sample(10, random_state=23).Trader), name='dumb_pred')
    ], axis=1).head(30)

Unnamed: 0,Trader,type,num_type,dumb_pred
0,Trader_451,MIX,1.0,HFT
1,Trader_57,NON HFT,0.0,NON HFT
2,Trader_330,MIX,1.0,MIX
3,Trader_244,HFT,2.0,HFT
4,Trader_46,NON HFT,0.0,NON HFT
5,Trader_169,NON HFT,0.0,NON HFT
6,Trader_245,MIX,1.0,MIX
7,Trader_288,NON HFT,0.0,NON HFT
8,Trader_105,NON HFT,0.0,NON HFT
9,Trader_329,MIX,1.0,MIX


# Cross validation

On commence par construire un folder qui permet de mettre les différents samples dans les différents folds.

In [43]:
stratified_folder = StratifiedKFold(n_splits=5)
for train_index, test_index in stratified_folder.split(y_train.Trader, y_train.type):
    print(y_train.iloc[train_index].type.value_counts())
    print(y_train.iloc[test_index].type.value_counts())

NON HFT    37
MIX        19
HFT        12
Name: type, dtype: int64
NON HFT    10
MIX         5
HFT         3
Name: type, dtype: int64
NON HFT    38
MIX        19
HFT        12
Name: type, dtype: int64
NON HFT    9
MIX        5
HFT        3
Name: type, dtype: int64
NON HFT    38
MIX        19
HFT        12
Name: type, dtype: int64
NON HFT    9
MIX        5
HFT        3
Name: type, dtype: int64
NON HFT    38
MIX        19
HFT        12
Name: type, dtype: int64
NON HFT    9
MIX        5
HFT        3
Name: type, dtype: int64
NON HFT    37
MIX        20
HFT        12
Name: type, dtype: int64
NON HFT    10
MIX         4
HFT         3
Name: type, dtype: int64


In [44]:
pipe.set_params(
    selectCols__kw_args=dict(cols=None),
    preprocessor__kw_args=dict(fill_value=0.),
    aggregator__kw_args=dict(groupers=['Trader'], func='mean'),
    classifier__max_depth=5,
).fit(y_train.Trader, y_train.type)

Pipeline(steps=[('getX',
                 FunctionTransformer(func=<function get_X_from_indices at 0x7fd87ec1b700>)),
                ('selectCols',
                 FunctionTransformer(func=<function select_cols at 0x7fd875d81940>,
                                     kw_args={'cols': None})),
                ('preprocessor',
                 FunctionTransformer(func=<function preprocessor at 0x7fd875d818b0>,
                                     kw_args={'fill_value': 0.0})),
                ('aggregator',
                 FunctionTransformer(func=<function aggregator at 0x7fd875d81b80>,
                                     kw_args={'func': 'mean',
                                              'groupers': ['Trader']})),
                ('classifier', RandomForestClassifier(max_depth=5))])

On commence par vérifier que les résultats de la métrique 'f1_micro' standard ramène les mêmes résultats que la métrique spécifique fournie.

In [45]:
results = cross_val_score(pipe, y_train.Trader, y_train.type, cv=stratified_folder, scoring='f1_micro')
print(f'cross validation score: {results.mean():.2%} +/- {results.std():.2%}')
print(f'detailed scores: {results}')

cross validation score: 76.80% +/- 8.11%
detailed scores: [0.72222222 0.64705882 0.82352941 0.88235294 0.76470588]


# Grid search

Définition des paramètres de la grid search :

In [46]:
pipe.get_params()

{'memory': None,
 'steps': [('getX',
   FunctionTransformer(func=<function get_X_from_indices at 0x7fd87ec1b700>)),
  ('selectCols',
   FunctionTransformer(func=<function select_cols at 0x7fd875d81940>,
                       kw_args={'cols': None})),
  ('preprocessor',
   FunctionTransformer(func=<function preprocessor at 0x7fd875d818b0>,
                       kw_args={'fill_value': 0.0})),
  ('aggregator',
   FunctionTransformer(func=<function aggregator at 0x7fd875d81b80>,
                       kw_args={'func': 'mean', 'groupers': ['Trader']})),
  ('classifier', RandomForestClassifier(max_depth=5))],
 'verbose': False,
 'getX': FunctionTransformer(func=<function get_X_from_indices at 0x7fd87ec1b700>),
 'selectCols': FunctionTransformer(func=<function select_cols at 0x7fd875d81940>,
                     kw_args={'cols': None}),
 'preprocessor': FunctionTransformer(func=<function preprocessor at 0x7fd875d818b0>,
                     kw_args={'fill_value': 0.0}),
 'aggregator': Funct

In [47]:
ordered_cols = [
    'OTR', 'OCR', 'OMR', 'NbTradeVenueMic', 'mean_lifetime_cancel', 'max_time_two_events', 'NbSecondWithAtLeatOneTrade',    
]

parms = dict(
    selectCols__kw_args=[
        {'cols': None},
        {'cols': ordered_cols},
        {'cols': ordered_cols[:5]},
        {'cols': ordered_cols[:3]},
        {'cols': ordered_cols[:2]},
    ],
    preprocessor__kw_args=[dict(fill_value=0.)],
    aggregator__kw_args=[dict(groupers=['Trader'], func='mean')],
    classifier__max_depth=[None, 10, 5, 3, 2, 1],
    classifier__random_state=[42],
    classifier__criterion=['gini', 'entropy'],
    classifier__class_weight=[None, 'balanced', 'balanced_subsample'],
    classifier__n_estimators=[500, 300],
)

In [48]:
gs_clf = GridSearchCV(pipe, parms, cv=stratified_folder, scoring='f1_micro', n_jobs=-1, verbose=2)
gs_clf.fit(y_train.Trader, y_train.type)
print(gs_clf.best_score_)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:  2.4min finished


0.8496732026143791


In [49]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.8496732026143791
{'aggregator__kw_args': {'groupers': ['Trader'], 'func': 'mean'}, 'classifier__class_weight': 'balanced', 'classifier__criterion': 'gini', 'classifier__max_depth': 3, 'classifier__n_estimators': 500, 'classifier__random_state': 42, 'preprocessor__kw_args': {'fill_value': 0.0}, 'selectCols__kw_args': {'cols': ['OTR', 'OCR', 'OMR', 'NbTradeVenueMic', 'mean_lifetime_cancel', 'max_time_two_events', 'NbSecondWithAtLeatOneTrade']}}


# Envoi de la prédiction

In [21]:
traders = x_test.Trader.unique()
traders

array(['Adelaide', 'Alana', 'Alcmene', 'Alice', 'Alices Sister',
       'Angel (Experiment 624)', 'Annette', 'Armoire', 'Arthur/Wart',
       'Axe', 'Baby Red Bird', 'Backwoods Beagle', 'Bacon Beagle',
       'Baileywick', 'Bat Queen', 'Becky', 'Bent-Tail the Coyote',
       'Big Al', 'Big Mama', 'Bill Green', 'Black Bart', 'Bobble',
       'Bomber Beagle', 'Bookseller', 'Boomer', 'Bowler Hat Guy',
       'Braddock', 'Brer Bear', 'Brudo Avarius', 'Bruno the Dog',
       'Buck Leatherleaf', 'Bucky Oryx-Antlerson', 'Butch the Bulldog',
       'Butcher Boy', 'Dallben', 'Danny', 'Dennis Avarius',
       'Digger the Mole', 'Dijon the Thief', 'Don Karnage', 'Donald Duck',
       'Doug Dalmatian', 'Dr. Delbert Doppler', 'Dr. Facilier',
       'Dr. Joshua Sweet', 'Dr. Teeth', 'Duchess', 'Duke of Weselton',
       'Dylan Dalmatian', 'Edric Blight', 'Elliott', 'Emperor Belos',
       'Evinrude', 'Fairy Mary', 'Fidget', 'Flash', 'Flora', 'Flounder',
       'Flunkey the Baboon', 'Francis', 'Friend

In [22]:
pred = gs_clf.best_estimator_.predict(traders)
pred

array(['NON HFT', 'NON HFT', 'NON HFT', 'HFT', 'NON HFT', 'NON HFT',
       'MIX', 'NON HFT', 'NON HFT', 'HFT', 'MIX', 'MIX', 'NON HFT', 'MIX',
       'NON HFT', 'NON HFT', 'HFT', 'NON HFT', 'MIX', 'MIX', 'NON HFT',
       'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT',
       'NON HFT', 'HFT', 'NON HFT', 'HFT', 'NON HFT', 'MIX', 'NON HFT',
       'NON HFT', 'MIX', 'NON HFT', 'NON HFT', 'MIX', 'HFT', 'HFT',
       'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'MIX',
       'MIX', 'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'MIX',
       'NON HFT', 'MIX', 'MIX', 'MIX', 'MIX', 'NON HFT', 'HFT', 'NON HFT',
       'MIX', 'HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'HFT', 'NON HFT',
       'NON HFT', 'NON HFT', 'NON HFT', 'NON HFT', 'MIX', 'MIX', 'HFT',
       'NON HFT', 'MIX', 'NON HFT', 'NON HFT', 'NON HFT', 'HFT', 'MIX',
       'HFT', 'HFT', 'MIX'], dtype=object)

In [23]:
with pd.option_context('display.max_rows', None):
    display(pd.read_csv(data_path / 'out' / 'submission3.csv').join(pd.Series(pred, name='pred6')))

Unnamed: 0,Trader,type,pred6
0,Adelaide,MIX,NON HFT
1,Alana,NON HFT,NON HFT
2,Alcmene,HFT,NON HFT
3,Alice,MIX,HFT
4,Alices Sister,MIX,NON HFT
5,Angel (Experiment 624),HFT,NON HFT
6,Annette,NON HFT,MIX
7,Armoire,NON HFT,NON HFT
8,Arthur/Wart,MIX,NON HFT
9,Axe,NON HFT,HFT


In [24]:
sub = pd.DataFrame({
    'Trader': traders,
    'type': pred,
}).set_index('Trader')
display(sub)
sub.to_csv(data_path / 'out' / 'submission6.csv')
pickle.dump(gs_clf.best_params_, open(data_path / 'out' / 'sub6params.pkl', 'wb'))
pickle.dump(gs_clf, open(data_path / 'out' / 'model6.pkl', 'wb'))

Unnamed: 0_level_0,type
Trader,Unnamed: 1_level_1
Adelaide,NON HFT
Alana,NON HFT
Alcmene,NON HFT
Alice,HFT
Alices Sister,NON HFT
...,...
Monstro,HFT
Morgana,MIX
The Doorknob,HFT
The Doorman,HFT


# Prédiction à un niveau fin, puis aggrégation

Dans ce pipeline, on va prédire à un niveau fin (ex: chaque triplet Trader x Share x Day, ou Trader x Share), avant d'aggréger ensuite les prédictions pour prédire au niveau Trader.

Avant de faire un pipeline, je fais à la mano, en utilisant le modèle précédent.

In [86]:
trader_train, trader_test = train_test_split(y_train.Trader, test_size=.3)

In [100]:
train_data = gs_clf.best_estimator_[1:3].transform(x_train.loc[x_train.Trader.isin(trader_train)]).merge(y_train, on='Trader')
train_data

Unnamed: 0,Trader,OTR,OCR,OMR,NbTradeVenueMic,mean_lifetime_cancel,max_time_two_events,NbSecondWithAtLeatOneTrade,type,num_type
0,Trader_10,2.272727,8.333333,12.500000,1,8272.770569,25139.59800,4,NON HFT,0.0
1,Trader_10,1.696629,25.166667,21.571429,1,3796.632686,31278.35700,15,NON HFT,0.0
2,Trader_10,1.482759,47.300000,118.250000,1,1397.103679,30799.46700,63,NON HFT,0.0
3,Trader_10,1.705882,14.500000,29.000000,1,9736.869034,19187.71900,4,NON HFT,0.0
4,Trader_10,1.517730,26.750000,0.000000,1,10066.803992,23164.51400,38,NON HFT,0.0
...,...,...,...,...,...,...,...,...,...,...
77624,Trader_60,1.150000,23.000000,0.000000,1,7609.944336,7609.94340,3,NON HFT,0.0
77625,Trader_60,1.352941,23.000000,0.000000,1,567.153015,392.75638,7,NON HFT,0.0
77626,Trader_60,1.146667,86.000000,0.000000,1,23448.525391,23448.52500,16,NON HFT,0.0
77627,Trader_60,1.179191,204.000000,0.000000,1,19157.808594,19157.80900,35,NON HFT,0.0


In [101]:
clf = clone(gs_clf.best_estimator_[-1])
clf.fit(train_data.drop(['Trader', 'type', 'num_type'], axis=1), train_data.type)

RandomForestClassifier(class_weight='balanced', max_depth=3, n_estimators=500,
                       random_state=42)

In [104]:
gs_clf.best_estimator_[1:3].transform(x_train.loc[x_train.Trader.isin(trader_test)])

Unnamed: 0,Trader,OTR,OCR,OMR,NbTradeVenueMic,mean_lifetime_cancel,max_time_two_events,NbSecondWithAtLeatOneTrade
77,Trader_105,3.693548,35.781250,0.000000,1,20966.791788,49783.777000,147
78,Trader_105,2.925267,28.344828,0.000000,1,11612.122256,33397.780000,153
79,Trader_105,8.025000,53.500000,0.000000,1,19814.739218,44633.320000,37
80,Trader_105,5.583333,67.000000,0.000000,1,17357.777344,21580.172000,10
81,Trader_105,7.255319,341.000000,0.000000,1,21637.669922,26379.031000,32
...,...,...,...,...,...,...,...,...
105713,Trader_54,5.100000,5.666667,3.187500,1,1016.225519,12114.159000,8
105714,Trader_54,6.333333,19.000000,1.583333,2,138.864349,49.990845,4
105715,Trader_54,2.461538,9.600000,9.600000,1,8815.686191,31970.676000,19
105716,Trader_54,2.630769,7.772727,5.896552,1,49.859188,3528.821300,37


On score sur le set de test les prédictions faites par le modèle.

In [110]:
f1_score(
    clf.predict(gs_clf.best_estimator_[1:3].transform(x_train.loc[x_train.Trader.isin(trader_test)]).drop('Trader', axis=1)),
    gs_clf.best_estimator_[1:3].transform(x_train.loc[x_train.Trader.isin(trader_test)]).merge(y_train, on='Trader').type,
    average='micro',
)

0.7742336518310658

In [113]:
test_pred = (
    clf.predict(
        gs_clf.best_estimator_[1:3].transform(x_train.loc[x_train.Trader.isin(trader_test)]).drop('Trader', axis=1)
    )
)

In [130]:
test_data = gs_clf.best_estimator_[1:3].transform(x_train.loc[x_train.Trader.isin(trader_test)])
test_data = test_data.reset_index(drop=True)
with pd.option_context('display.max_rows', None):
    display(
        pd.concat([test_data, pd.Series(test_pred, name='type_pred')], axis=1)
        .merge(y_train, on='Trader')
        .groupby(['Trader', 'type', 'type_pred'])
        .size()
        .rename('counts')
        .to_frame()
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,counts
Trader,type,type_pred,Unnamed: 3_level_1
Trader_105,NON HFT,HFT,2
Trader_105,NON HFT,NON HFT,2050
Trader_107,MIX,HFT,1025
Trader_107,MIX,MIX,559
Trader_107,MIX,NON HFT,252
Trader_110,NON HFT,HFT,1
Trader_110,NON HFT,NON HFT,7
Trader_127,NON HFT,HFT,1
Trader_127,NON HFT,NON HFT,4
Trader_164,MIX,HFT,34


# Modèle linéaire avec un ordinal encoder

### WIP WIP WIP

1. Encoding de la target

On commence par définir un `OrdinalEncoder` qui va permettre de transformer la target en un integer (on considère que NON HFT < MIX < HFT).

In [25]:
ord_encoder = OrdinalEncoder(categories=[np.array(['NON HFT', 'MIX', 'HFT'])])
ord_encoder.fit_transform(y_train.type.values.reshape(-1, 1))[:5]

array([[1.],
       [0.],
       [0.],
       [0.],
       [0.]])

On applique cet encoder sur la valeur à prédire.

In [26]:
y_train['num_type'] = ord_encoder.fit_transform(y_train.type.values.reshape(-1, 1))
y_train

Unnamed: 0,Trader,type,num_type
0,Trader_285,MIX,1.0
1,Trader_114,NON HFT,0.0
2,Trader_110,NON HFT,0.0
3,Trader_57,NON HFT,0.0
4,Trader_128,NON HFT,0.0
...,...,...,...
81,Trader_140,MIX,1.0
82,Trader_451,MIX,1.0
83,Trader_438,MIX,1.0
84,Trader_278,HFT,2.0


2. Définition des étapes de preprocessing

In [27]:
pipe = Pipeline(
    [
        ('getX', get_X_transformer),
        ('selectCols', column_selector),
        ('standardScale', standard_scaler)
        
        ('aggregator', aggregator),
        ('classifier', RandomForestClassifier()),
    ],
)

  ('standardScale', standard_scaler)


NameError: name 'standard_scaler' is not defined