In [3]:
import xgboost as xgb
import pickle
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [4]:
TRAIN_FRAC = 0.8    

In [27]:
def smote_func(df):
    '''
    Oversample the minority class using SMOTE prior to OHE.
    '''

    x_train = df.drop(columns = ['action'])
    y_train = df['action']

    
    smote_inst = SMOTE(sampling_strategy='auto', random_state=2611, k_neighbors=12)
      
    x_train , y_train = smote_inst.fit_resample(x_train, y_train)
    
    
    df = pd.concat([x_train, y_train], axis=1)
    
    print(df[['action', 'year']].groupby('action').count())
    
    
    

    return df

In [28]:
def get_train_test_sets(data, train_frac):
    '''
    Train and test sets to be split prior to reducing majority class.
    '''
    n_train = int(data.shape[0] * train_frac)

    train_data = data[:n_train]
    test_data = data[n_train:]

    # train_data = downsample_majority_classes(train_data, ['H', 'W'])

    
    
    train_data = np.array(train_data)
    test_data = np.array(test_data)

    
    x_train = train_data[:,  :-1]
    y_train = train_data[:,  -1:].reshape(-1)
    x_test = test_data[:, :-1]
    y_test = test_data[:, -1:].reshape(-1)
    return x_train, y_train, x_test, y_test

In [29]:
with open('df_result_no_smote.pkl', 'rb') as f:
    df_result = pickle.load(f)


In [30]:
df_result.shape

(34848, 66)

In [31]:
le = LabelEncoder()
encoded_actions = le.fit_transform(df_result['action'])
df_result['action'] = encoded_actions

In [32]:
df_result = smote_func(df_result)

         year
action       
0       18778
1       18778
2       18778
3       18778


In [33]:
x_train, y_train, x_test, y_test = get_train_test_sets(df_result, TRAIN_FRAC)

In [34]:
xgb_clf = xgb.XGBClassifier( booster='gbtree', objective='multi:softprob',
                        eval_metric = 'auc', num_round=200, 
                        # early_stopping_rounds=60, 
                        num_class=4, max_depth=2,
                        learning_rate=0.01, verbosity=3, scale_pos_weight= 0.1,
                        eval_set=[(x_test, y_test)])

In [35]:
params = {'max_depth': [ 10, 11, 12],
          'learning_rate': [ 0.2, 0.3],  'max_delta_step': [0, 1, 2]}

In [36]:
grSearchCV = GridSearchCV(estimator = xgb_clf, 
                          param_grid= params, scoring = 'roc_auc_ovr', n_jobs=10, verbose=3, cv=10)

In [37]:
grSearchCV.fit(x_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits
[01:54:58] AllReduce: 0.011327s, 1 calls @ 11327us

[01:54:58] MakeCuts: 0.012s, 1 calls @ 12000us

[01:54:58] DEBUG: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\gbm\gbtree.cc:130: Using tree method: 0


Parameters: { "eval_set", "num_round", "scale_pos_weight" } are not used.



[01:55:10] Configure: 0.001078s, 1 calls @ 1078us

[01:55:10] EvalOneIter: 0.001142s, 100 calls @ 1142us

[01:55:10] GetGradient: 0.238355s, 100 calls @ 238355us

[01:55:10] PredictRaw: 0.000548s, 100 calls @ 548us

[01:55:10] UpdateOneIter: 11.6043s, 100 calls @ 11604291us

[01:55:10] BoostNewTrees: 11.3635s, 100 calls @ 11363463us

[01:55:10] CommitModel: 9.9e-05s, 100 calls @ 99us

[01:55:10] BuildHistogram: 5.36677s, 3600 calls @ 5366771us

[01:55:10] EvaluateSplits: 4.6692s, 4000 calls @ 4669204us

[01:55:10] InitData: 0.075933s, 400 calls @ 75933us

[01:55:10] InitRoot: 0.525288s, 400 calls @ 525288us

[01:55:10] LeafPartition: 8.1e-05s, 400 calls @ 81us

[01:55:10] UpdatePosition: 0.471416s, 4000 calls @ 471416us

[01:55:10] UpdatePredictionCache: 0.05439s, 400 calls @ 54390us

[01:55:10] UpdateTree: 11.2606s, 400 calls @ 11260591us



In [38]:
grSearchCV.best_params_

{'learning_rate': 0.3, 'max_delta_step': 1, 'max_depth': 10}

In [39]:
grSearchCV.best_score_

0.8685909977454644

In [40]:
grSearchCV.best_estimator_.save_model('xgb_model.json')

In [41]:
for value in grSearchCV.cv_results_:
    print(value)

mean_fit_time
std_fit_time
mean_score_time
std_score_time
param_learning_rate
param_max_delta_step
param_max_depth
params
split0_test_score
split1_test_score
split2_test_score
split3_test_score
split4_test_score
split5_test_score
split6_test_score
split7_test_score
split8_test_score
split9_test_score
mean_test_score
std_test_score
rank_test_score
