# Model construction

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

os.chdir("C:/Users/TonyG/Documents/GitHub/bads/kaggle")
#os.chdir("C:/Users/erin-/Documents/bads/kaggle")
data = pd.read_pickle('./data/known_cleaned_w_dummies')
X = data.drop("return", axis = 1)
y = data["return"]
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98928 entries, 1 to 100000
Data columns (total 80 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   item_id                                   98928 non-null  int64  
 1   brand_id                                  98928 non-null  int64  
 2   item_price                                98928 non-null  float64
 3   user_id                                   98928 non-null  int64  
 4   time_to_delivery                          89610 non-null  float64
 5   user_age                                  90292 non-null  float64
 6   customer_age                              98928 non-null  int64  
 7   item_size_10                              98928 non-null  uint8  
 8   item_size_24                              98928 non-null  uint8  
 9   item_size_34                              98928 non-null  uint8  
 10  item_size_35                     

### For Preliminary version only: Create test and train sets based on the known dataset via random splitting

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 314)

## 1. Boosted Trees 

### 1.1 with Random Search CV

In [7]:
import xgboost as xgb
#dtrain_data = xgb.DMatrix(X_train, label = y_train)

param_grid = {"max_depth" : np.arange(20,100, step = 5),
             "eta" : stats.uniform(0.01, 0.4),
             "gamma" : stats.uniform(0.05, 3),
             "lambda" : stats.uniform(0, 8),
             "alpha" : stats.uniform(0, 8),
             "colsample_bytree" : np.arange(0.2, 1, step = 0.1),
             "n_estimators" : np.arange(10, 200, step = 10)}
    
gbm = xgb.XGBClassifier(objective = "binary:logistic", 
             num_parallel_tree = 1, num_boost_round = 35, early_stopping_rounds = 10)
metric = "roc_auc"
n = 160
fold = 5

In [8]:
randomized_auc = RandomizedSearchCV(estimator = gbm, n_iter = n, cv = fold, scoring = metric,
                                    param_distributions = param_grid, verbose = 3, random_state = 123,
                                   n_jobs = 7)
randomized_auc.fit(X,y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed: 17.6min
[Parallel(n_jobs=7)]: Done 114 tasks      | elapsed: 104.3min
[Parallel(n_jobs=7)]: Done 274 tasks      | elapsed: 218.9min
[Parallel(n_jobs=7)]: Done 498 tasks      | elapsed: 424.4min
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed: 636.6min
[Parallel(n_jobs=7)]: Done 800 out of 800 | elapsed: 646.3min finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(early_stopping_rounds=10,
                                           num_boost_round=35,
                                           num_parallel_tree=1),
                   n_iter=160, n_jobs=7,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000012F72699B48>,
                                        'colsample_bytree': array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'eta': <scipy.stats._distn_infrastructure.rv_frozen object...
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000012F70D87408>,
                                        'lambda': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000012F726997C8>,
                                        'max_depth': array([20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]),
                        

In [9]:
print("Best Parameters:", randomized_auc.best_params_)
print("Best Score:", randomized_auc.best_score_)

Best Parameters: {'alpha': 5.179081164586749, 'colsample_bytree': 0.8000000000000003, 'eta': 0.021776051459879167, 'gamma': 2.0530173316629945, 'lambda': 1.2872414745855947, 'max_depth': 25, 'n_estimators': 10}
Best Score: 0.6199544234282922


### Train a new boosted tree with higher boosting rounds and optimal parameters found by random search and save predictions for unknown for csv

In [11]:
import datetime
name_ = datetime.datetime.now().strftime('%y_%m_%d-%I_%M') # For prediction csv name

data_u = pd.read_pickle('./data/unknown_cleaned_w_dummies') # load in unknown data
params = randomized_auc.best_params_ # retrieve optimal parameters
params["objective"] = "binary:logistic"  # Add Objective
dtrain_total = xgb.DMatrix(X, label = y) # Create DMatrix of known data
dtest_unknown = xgb.DMatrix(data_u) # Create DMatrix of unknown data

model_opt = xgb.train(params = params, dtrain = dtrain_total,
                      num_boost_round = 400) # Fit a boosted tree but with a higher number if iterations

# Create predictions and save
preds = model_opt.predict(dtest_unknown)
predict_unknown = pd.Series(preds, index=data_u["item_id"].index, name='return')
predict_unknown.to_csv("".join([name_, "_pred.csv"]))

In [6]:
from sklearn.metrics import roc_auc_score
preds = randomized_auc.predict(X_test)

roc_auc_score(y_test, preds)

0.9455351838685127

In [7]:
data_u = pd.read_pickle('./data/unknown_cleaned_w_dummies')

In [8]:
preds = randomized_auc.predict_proba(data_u)[:, 1]
predict_unknown = pd.Series(preds, index=data_u["item_id"].index, name='return')
predict_unknown.to_csv("230121_pred.csv")

### 1.2 with Bayesion optimization CV

In [4]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 123)

dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)

Define function to use Bayesian optimization to optimize a given function on a given dataset based on given parameters

In [13]:
def xgb_evaluate(max_depth, gamma, eta, colsample_bytree, lam, alph, est):
    params = {'eval_metric': 'auc',
              'max_depth': int(max_depth),
              'eta': eta,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree,
              'objective' : 'binary:logistic',
              'n_estimators' :  int(est),
              'lambda' : lam,
              'alph' : alph}
    # Used around 1000 boosting rounds in the full model
    cv_result = xgb.cv(params, dtrain, num_boost_round = 150, nfold = 10, early_stopping_rounds = 30)
    return cv_result['test-auc-mean'].iloc[-1]

In [43]:
xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (3, 200), 
                                             'gamma': (0, 8),
                                             'colsample_bytree': (0.3, 0.9),
                                             'eta' : (0.001, 0.3),
                                             'lam' : (0.1, 8),
                                             'alph' : (0.1, 3),
                                             'est' : (10, 80)})
# Optimally needs quite a few more initiation points and number of iterations
xgb_bo.maximize(init_points = 5, n_iter = 50)

|   iter    |  target   |   alph    | colsam... |    est    |    eta    |   gamma   |    lam    | max_depth |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7489  [0m | [0m 1.815   [0m | [0m 0.6438  [0m | [0m 57.98   [0m | [0m 0.232   [0m | [0m 5.491   [0m | [0m 1.942   [0m | [0m 137.8   [0m |
| [95m 2       [0m | [95m 0.754   [0m | [95m 0.1493  [0m | [95m 0.5076  [0m | [95m 73.66   [0m | [95m 0.114   [0m | [95m 1.752   [0m | [95m 3.129   [0m | [95m 49.96   [0m |
| [0m 3       [0m | [0m 0.7527  [0m | [0m 1.169   [0m | [0m 0.3318  [0m | [0m 54.17   [0m | [0m 0.107   [0m | [0m 1.027   [0m | [0m 1.221   [0m | [0m 15.95   [0m |
| [95m 4       [0m | [95m 0.7609  [0m | [95m 1.249   [0m | [95m 0.3493  [0m | [95m 64.56   [0m | [95m 0.07483 [0m | [95m 2.277   [0m | [95m 3.76    [0m | [95m 78.19   [0m |
| [0m 5       [0m | [0m 0.7527 

KeyboardInterrupt: 

Extract best parameters

In [5]:
params = xgb_bo.max
#params['max_depth'] = int(params['max_depth'])
params["params"]["max_depth"] = int(params["params"]["max_depth"])
#params["params"]["n_estimators"] = int(params["params"]["est"])
params["params"]['objective'] = 'binary:logistic'
params["params"].pop("est")
params

NameError: name 'xgb_bo' is not defined

In [10]:
params =  {'alpha': 0.5793037571735223,

  'colsample_bytree': 0.3,

  'eta': 0.11,

  'gamma': 0.0,

  'lambda': 6.468522967337128,

  'max_depth': 77,

  'objective': 'binary:logistic'}

 Train model with optimal parameters and calculate auc for test set

In [11]:
model_opt = xgb.train(params, dtrain, num_boost_round = 400)

preds = model_opt.predict(dtest)
roc_auc_score(y_test, preds)



0.7745377371183007

In [12]:
data_u = pd.read_pickle('./data/unknown_cleaned_w_dummies')

In [13]:
preds = model_opt.predict(xgb.DMatrix(data_u))
predict_unknown = pd.Series(preds, index=data_u["item_id"].index, name='return')
predict_unknown.to_csv("sixth_pred.csv")

### 1.3 with a genetic algorithm (doesn't work because of data formatting)

In [18]:
from tpot import TPOTClassifier
import numpy as np
import pandas as pd
from scipy import stats

# Only reduced so far
data_reduced = pd.read_pickle('./data/known_cleaned_w_dummies_reduced')
data_u_reduced = pd.read_pickle('./data/unknown_cleaned_w_dummies_reduced')
X, y = data_reduced.drop("return", axis = 1), data_reduced["return"]
X_test = data_u_reduced 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 123)
X_train = X_train.fillna(-999) # TPOT cannot handle NA's
y_train = y_train.fillna(-999)

# Define Hyperparams to tune
params = {"max_depth" : list(np.linspace(10,200, dtype = int)), 
         "learning_rate" : list(np.arange(0.01, 0.8, step = 0.05)),
         "max_features" : ["auto", "sqrt", "log2"],
         #"ccp_alpha" : list(np.arange(0.01, 0.5, step = 0.05)),
         #"loss" : ["deviance", "exponential"],
         "min_samples_split" : list(np.linspace(2, 60, num = 20, dtype = int))}
             #"lambda" : list(np.arange(0.5, 8, step = 0.05)),
             #"alpha" : list(np.arange(0.5, 8, step = 0.05)),
             #"colsample_bytree" : np.arange(0.2, 0.9, step = 0.1)}
             #"n_estimators" : list(np.arange(10,80, step = 10))}


Find optimal model via evolutionary algorithm

In [19]:
tpot_classifier = TPOTClassifier(generations= 2, population_size = 5, offspring_size = 2, mutation_rate = 0.9, crossover_rate = 0.1,
                                 verbosity= 3, early_stop = 12, n_jobs = 7, random_state = 123,
                                 config_dict =
                                 {'sklearn.ensemble.GradientBoostingClassifier': params}, 
                                 cv = 10, scoring = 'roc_auc')

tpot_classifier.fit(X_train,y_train) 

1 operators have been imported by TPOT.


Version 0.11.6.post3 of tpot is outdated. Version 0.11.7 was released Wednesday January 06, 2021.


Optimization Progress:   0%|          | 0/9 [00:00<?, ?pipeline/s]

RuntimeError: A pipeline has not yet been optimized. Please call fit() first.