# Model construction

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

os.chdir("C:/Users/TonyG/Documents/GitHub/bads/kaggle")
data = pd.read_pickle('./data/known_cleaned_w_dummies')
X = data.drop("return", axis = 1)
y = data["return"]
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98928 entries, 1 to 100000
Columns: 189 entries, item_id to user_state_Thuringia
dtypes: float64(3), int64(4), uint8(182)
memory usage: 23.2 MB


### For Preliminary version only: Create test and train sets based on the known dataset via random splitting

In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

## 1. Logit Model with Elastic Net Penality 

In [3]:
%%script false --no-raise-error
y, X = data["return"], data.drop(axis = 1, labels = "return")
model_logit = sm.OLS(y, X)


Couldn't find program: 'false'


## 2. Boosted Trees

In [6]:
import xgboost as xgb
#dtrain_data = xgb.DMatrix(X_train, label = y_train)

param_grid = {"max_depth" : np.arange(1,20),
             "eta" : stats.uniform(0.1, 0.8),
             "gamma" : stats.uniform(0.05, 3),
             "lambda" : stats.uniform(0, 5),
             "alpha" : stats.uniform(0, 5),
             "colsample_bytree" : np.arange(0.2, 1, step = 0.1),
             "subsample" : np.arange(0.5, 1, step = 0.1),
             "n_estimators" : np.arange(10, 50, step = 5)}
    
gbm = xgb.XGBClassifier(objective = "binary:logistic", 
             num_parallel_tree = 1, num_boost_round = 80, early_stopping_rounds = 10)
metric = "roc_auc"
n = 100
fold = 10

In [7]:
randomized_auc = RandomizedSearchCV(estimator = gbm, n_iter = n, cv = fold, scoring = metric,
                                    param_distributions = param_grid, verbose = 1, random_state = 123,
                                   n_jobs = 7)
randomized_auc.fit(X,y)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed: 25.6min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 212.9min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 531.7min
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed: 1036.1min
[Parallel(n_jobs=7)]: Done 1000 out of 1000 | elapsed: 1305.5min finished


RandomizedSearchCV(cv=10,
                   estimator=XGBClassifier(early_stopping_rounds=10,
                                           num_boost_round=80,
                                           num_parallel_tree=10),
                   n_iter=100, n_jobs=7,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000218A218ABC8>,
                                        'colsample_bytree': array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'eta': <scipy.stats._distn_infrastructure.rv_frozen obje...
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002189873D708>,
                                        'lambda': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000218A218A708>,
                                        'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
     

In [8]:
print("Best Parameters:", randomized_auc.best_params_)
print("Best Score:", randomized_auc.best_score_)
randomized_auc.cv_results_["param_lambda"]

Best Parameters: {'alpha': 2.7630903067955797, 'colsample_bytree': 0.4000000000000001, 'eta': 0.3615745486583717, 'gamma': 0.5881705239639977, 'lambda': 2.3340493873098933, 'max_depth': 2, 'n_estimators': 15, 'subsample': 0.5}
Best Score: 0.6747625886800404


masked_array(data=[3.595751550773865, 2.005087783306018,
                   3.158960088435252, 2.4604238839617114,
                   1.1725643759114308, 0.7704112092342336,
                   3.1197647589605557, 4.927798928053525,
                   2.167091196133339, 2.283739584286279,
                   1.5748322294534307, 3.617081790949774,
                   3.4776476438545547, 4.208349984563581,
                   1.7795743285872978, 1.2042794886181225,
                   3.3078216833312184, 1.7713233779583926,
                   0.01344032287160346, 4.918154424558616,
                   0.8053450721460742, 2.73881786314427,
                   2.3693592750229744, 4.055195684974514,
                   2.511848237228068, 2.9133554393070233,
                   2.203214842049366, 2.426984129042013,
                   4.6106433051801545, 3.588787811397788,
                   3.72390327570883, 2.5565869574909694,
                   1.7205596263776406, 2.0864553045167207,
              

In [None]:
score_50 = randomized_auc.best_score_

In [9]:
from sklearn.metrics import roc_auc_score
preds = randomized_auc.predict(X_test)

roc_auc_score(y_test, preds)

0.6198670210605478

In [10]:
data_u = pd.read_pickle('./data/unknown_cleaned_w_dummies')

In [11]:
preds = randomized_auc.predict_proba(data_u)[:, 1]
predict_unknown = pd.Series(preds, index=data_u["item_id"].index, name='return')
predict_unknown.to_csv("first_pred.csv")

In [12]:
predict_unknown.shape

(50000,)