# Model construction

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

#os.chdir("C:/Users/TonyG/Documents/GitHub/bads/kaggle")
os.chdir("C:/Users/erin-/Documents/bads/kaggle")
data = pd.read_pickle('./data/known_cleaned_w_dummies')
X = data.drop("return", axis = 1)
y = data["return"]
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98928 entries, 1 to 100000
Columns: 189 entries, item_id to user_state_Thuringia
dtypes: float64(3), int64(4), uint8(182)
memory usage: 23.2 MB


### For Preliminary version only: Create test and train sets based on the known dataset via random splitting

In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 314)

## 1. Logit Model with Elastic Net Penality 

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98928 entries, 1 to 100000
Columns: 190 entries, item_id to user_state_Thuringia
dtypes: bool(1), float64(3), int64(4), uint8(182)
memory usage: 23.3 MB


In [4]:
%%script false --no-raise-error
y, X = data["return"], data.drop(axis = 1, labels = "return")
model_logit = sm.OLS(y, X)


Couldn't find program: 'false'


## 2. Boosted Trees 

### 2.1 with Random Search CV

In [6]:
import xgboost as xgb
#dtrain_data = xgb.DMatrix(X_train, label = y_train)

param_grid = {"max_depth" : np.arange(1,20),
             "eta" : stats.uniform(0.1, 0.8),
             "gamma" : stats.uniform(0.05, 3),
             "lambda" : stats.uniform(0, 5),
             "alpha" : stats.uniform(0, 5),
             "colsample_bytree" : np.arange(0.2, 1, step = 0.1),
             "subsample" : np.arange(0.5, 1, step = 0.1),
             "n_estimators" : np.arange(10, 50, step = 5)}
    
gbm = xgb.XGBClassifier(objective = "binary:logistic", 
             num_parallel_tree = 1, num_boost_round = 80, early_stopping_rounds = 10)
metric = "roc_auc"
n = 100
fold = 10

In [7]:
randomized_auc = RandomizedSearchCV(estimator = gbm, n_iter = n, cv = fold, scoring = metric,
                                    param_distributions = param_grid, verbose = 1, random_state = 123,
                                   n_jobs = 7)
randomized_auc.fit(X,y)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed: 25.6min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 212.9min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 531.7min
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed: 1036.1min
[Parallel(n_jobs=7)]: Done 1000 out of 1000 | elapsed: 1305.5min finished


RandomizedSearchCV(cv=10,
                   estimator=XGBClassifier(early_stopping_rounds=10,
                                           num_boost_round=80,
                                           num_parallel_tree=10),
                   n_iter=100, n_jobs=7,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000218A218ABC8>,
                                        'colsample_bytree': array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'eta': <scipy.stats._distn_infrastructure.rv_frozen obje...
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002189873D708>,
                                        'lambda': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000218A218A708>,
                                        'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
     

In [8]:
print("Best Parameters:", randomized_auc.best_params_)
print("Best Score:", randomized_auc.best_score_)

Best Parameters: {'alpha': 2.7630903067955797, 'colsample_bytree': 0.4000000000000001, 'eta': 0.3615745486583717, 'gamma': 0.5881705239639977, 'lambda': 2.3340493873098933, 'max_depth': 2, 'n_estimators': 15, 'subsample': 0.5}
Best Score: 0.6747625886800404


masked_array(data=[3.595751550773865, 2.005087783306018,
                   3.158960088435252, 2.4604238839617114,
                   1.1725643759114308, 0.7704112092342336,
                   3.1197647589605557, 4.927798928053525,
                   2.167091196133339, 2.283739584286279,
                   1.5748322294534307, 3.617081790949774,
                   3.4776476438545547, 4.208349984563581,
                   1.7795743285872978, 1.2042794886181225,
                   3.3078216833312184, 1.7713233779583926,
                   0.01344032287160346, 4.918154424558616,
                   0.8053450721460742, 2.73881786314427,
                   2.3693592750229744, 4.055195684974514,
                   2.511848237228068, 2.9133554393070233,
                   2.203214842049366, 2.426984129042013,
                   4.6106433051801545, 3.588787811397788,
                   3.72390327570883, 2.5565869574909694,
                   1.7205596263776406, 2.0864553045167207,
              

In [None]:
score_50 = randomized_auc.best_score_

In [9]:
from sklearn.metrics import roc_auc_score
preds = randomized_auc.predict(X_test)

roc_auc_score(y_test, preds)

0.6198670210605478

In [10]:
data_u = pd.read_pickle('./data/unknown_cleaned_w_dummies')

In [11]:
preds = randomized_auc.predict_proba(data_u)[:, 1]
predict_unknown = pd.Series(preds, index=data_u["item_id"].index, name='return')
predict_unknown.to_csv("first_pred.csv")

### 2.2 with Bayesion optimization CV

In [4]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 123)

dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)

Define function to use Bayesian optimization to optimize a given function on a given dataset based on given parameters

In [41]:
def xgb_evaluate(max_depth, gamma, eta, colsample_bytree, lam, alph, est):
    params = {'eval_metric': 'auc',
              'max_depth': int(max_depth),
              'eta': eta,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree,
              'objective' : 'binary:logistic',
              'n_estimators' :  int(est),
              'lambda' : lam,
              'alph' : alph}
    # Used around 1000 boosting rounds in the full model
    cv_result = xgb.cv(params, dtrain, num_boost_round = 150, nfold = 10, early_stopping_rounds = 30)
    return cv_result['test-auc-mean'].iloc[-1]

In [43]:
xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (3, 200), 
                                             'gamma': (0, 8),
                                             'colsample_bytree': (0.3, 0.9),
                                             'eta' : (0.001, 0.3),
                                             'lam' : (0.1, 8),
                                             'alph' : (0.1, 3),
                                             'est' : (10, 80)})
# Optimally needs quite a few more initiation points and number of iterations
xgb_bo.maximize(init_points = 5, n_iter = 50)

|   iter    |  target   |   alph    | colsam... |    est    |    eta    |   gamma   |    lam    | max_depth |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7489  [0m | [0m 1.815   [0m | [0m 0.6438  [0m | [0m 57.98   [0m | [0m 0.232   [0m | [0m 5.491   [0m | [0m 1.942   [0m | [0m 137.8   [0m |
| [95m 2       [0m | [95m 0.754   [0m | [95m 0.1493  [0m | [95m 0.5076  [0m | [95m 73.66   [0m | [95m 0.114   [0m | [95m 1.752   [0m | [95m 3.129   [0m | [95m 49.96   [0m |
| [0m 3       [0m | [0m 0.7527  [0m | [0m 1.169   [0m | [0m 0.3318  [0m | [0m 54.17   [0m | [0m 0.107   [0m | [0m 1.027   [0m | [0m 1.221   [0m | [0m 15.95   [0m |
| [95m 4       [0m | [95m 0.7609  [0m | [95m 1.249   [0m | [95m 0.3493  [0m | [95m 64.56   [0m | [95m 0.07483 [0m | [95m 2.277   [0m | [95m 3.76    [0m | [95m 78.19   [0m |
| [0m 5       [0m | [0m 0.7527 

KeyboardInterrupt: 

Extract best parameters

In [5]:
params = xgb_bo.max
#params['max_depth'] = int(params['max_depth'])
params["params"]["max_depth"] = int(params["params"]["max_depth"])
#params["params"]["n_estimators"] = int(params["params"]["est"])
params["params"]['objective'] = 'binary:logistic'
params["params"].pop("est")
params

NameError: name 'xgb_bo' is not defined

In [10]:
params =  {'alpha': 0.5793037571735223,

  'colsample_bytree': 0.3,

  'eta': 0.11,

  'gamma': 0.0,

  'lambda': 6.468522967337128,

  'max_depth': 77,

  'objective': 'binary:logistic'}

 Train model with optimal parameters and calculate auc for test set

In [11]:
model_opt = xgb.train(params, dtrain, num_boost_round = 400)

preds = model_opt.predict(dtest)
roc_auc_score(y_test, preds)



0.7745377371183007

In [12]:
data_u = pd.read_pickle('./data/unknown_cleaned_w_dummies')

In [13]:
preds = model_opt.predict(xgb.DMatrix(data_u))
predict_unknown = pd.Series(preds, index=data_u["item_id"].index, name='return')
predict_unknown.to_csv("sixth_pred.csv")

### 2.3 with a genetic algorithm (doesn't work because of data formatting)

In [37]:
from tpot import TPOTClassifier
import numpy as np
import pandas as pd
from scipy import stats

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 123)
X_train = X_train.fillna(-999)
y_train = y_train.fillna(-999)

# Define Hyperparams to tune
params = {"max_depth" : list(np.linspace(10,200, dtype = int)), 
         "learning_rate" : list(np.arange(0.01, 0.8, step = 0.05)),
         "max_features" : ["auto", "sqrt", "log2"],
         #"ccp_alpha" : list(np.arange(0.01, 0.5, step = 0.05)),
         #"loss" : ["deviance", "exponential"],
         "min_samples_split" : list(np.linspace(2, 60, num = 20, dtype = int))}
             #"lambda" : list(np.arange(0.5, 8, step = 0.05)),
             #"alpha" : list(np.arange(0.5, 8, step = 0.05)),
             #"colsample_bytree" : np.arange(0.2, 0.9, step = 0.1)}
             #"n_estimators" : list(np.arange(10,80, step = 10))}


Find optimal model via evolutionary algorithm

In [38]:
tpot_classifier = TPOTClassifier(generations= 2, population_size = 5, offspring_size = 2, mutation_rate = 0.9, crossover_rate = 0.1,
                                 verbosity= 2, early_stop = 12, n_jobs = 7, random_state = 123,
                                 config_dict =
                                 {'sklearn.ensemble.GradientBoostingClassifier': params}, 
                                 cv = 10, scoring = 'roc_auc')

tpot_classifier.fit(X_train,y_train) 

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=9.0), HTML(value='')))


Generation 1 - Current best internal CV score: -inf


RuntimeError: There was an error in the TPOT optimization process. This could be because the data was not formatted properly, or because data for a regression problem was provided to the TPOTClassifier object. Please make sure you passed the data to TPOT correctly. If you enabled PyTorch estimators, please check the data requirements in the online documentation: https://epistasislab.github.io/tpot/using/