# Model construction

In [73]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

os.chdir("C:/Users/TonyG/Documents/GitHub/bads/kaggle")
data = pd.read_pickle('./data/known_cleaned_w_dummies')
X = data.drop("return", axis = 1)
y = data["return"]
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98928 entries, 1 to 100000
Columns: 189 entries, item_id to user_state_Thuringia
dtypes: float64(3), int64(4), uint8(182)
memory usage: 23.2 MB


### For Preliminary version only: Create test and train sets based on the known dataset via random splitting

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

## 1. Logit Model with Elastic Net Penality 

In [None]:
%%script false --no-raise-error
y, X = data["return"], data.drop(axis = 1, labels = "return")
model_logit = sm.OLS(y, X)


## 2. Boosted Trees

In [75]:
import xgboost as xgb
dtrain_data = xgb.DMatrix(X_train, label = y_train)

param_grid = {"max_depth" : np.arange(1,20),
             "eta" : stats.uniform(0.1, 0.8),
             "gamma" : stats.uniform(0.05, 3),
             "lambda" : stats.uniform(0,5),
             "alpha" : stats.uniform(0, 5),
             "colsample_bytree" : np.arange(0.2, 1, step = 0.1),
             "subsample" : np.arange(0.5, 1, step = 0.1),
             "n_estimators" : np.arange(10, 50, step = 5)}
    
gbm = xgb.XGBClassifier(objective = "binary:logistic", 
             num_parallel_tree = 1, num_boost_round = 50, early_stopping_rounds = 10)
metric = "roc_auc"
n = 10
fold = 10

In [76]:
randomized_auc = RandomizedSearchCV(estimator = gbm, n_iter = n, cv = fold, scoring = metric,
                                    param_distributions = param_grid, verbose = 1, random_state = 123,
                                   n_jobs = 7)
randomized_auc.fit(X_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  2.0min
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:  7.8min finished


RandomizedSearchCV(cv=10,
                   estimator=XGBClassifier(early_stopping_rounds=10,
                                           num_boost_round=50,
                                           num_parallel_tree=1),
                   n_jobs=7,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000211021444C8>,
                                        'colsample_bytree': array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'eta': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002...
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000211020E9508>,
                                        'lambda': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000211020E9408>,
                                        'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
    

In [77]:
print("Best Parameters:", randomized_auc.best_params_)
print("Best Score:", randomized_auc.best_score_)
randomized_auc.cv_results_["param_lambda"]

Best Parameters: {'alpha': 3.7518935238342417, 'colsample_bytree': 0.6000000000000001, 'eta': 0.8553280145631037, 'gamma': 1.5555100276530098, 'lambda': 3.1197647589605557, 'max_depth': 17, 'n_estimators': 40, 'subsample': 0.8999999999999999}
Best Score: 0.735215462077861


masked_array(data=[3.595751550773865, 2.005087783306018,
                   3.158960088435252, 2.4604238839617114,
                   1.1725643759114308, 0.7704112092342336,
                   3.1197647589605557, 4.927798928053525,
                   2.167091196133339, 2.283739584286279],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object)

In [None]:
score_50 = randomized_auc.best_score_

In [80]:
from sklearn.metrics import roc_auc_score
preds = randomized_auc.predict(X_test)

roc_auc_score(y_test, preds)

0.6627405275011877

In [95]:
data_u = pd.read_pickle('./data/unknown_cleaned_w_dummies')

In [96]:
preds = randomized_auc.predict_proba(data_u)[:, 1]
predict_unknown = pd.Series(preds, index=data_u["item_id"].index, name='return')
predict_unknown.to_csv("first_pred.csv")

In [97]:
predict_unknown.shape

(50000,)

(49491, 189)