In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')

In [2]:
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
#XGBoost steps
# Label encode
# sklearn cv
# hp tuning
from sklearn.preprocessing import LabelEncoder
def make_train(data):
    train = data
    for col in train.select_dtypes(include='object'):
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
    
    return train

In [4]:
train = make_train(data)

In [5]:
%%time
X = train.drop(['id', 'target'], axis=1)
y = train.target
# data_dmatrix = xgb.DMatrix(data=X,label=y)
# params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
#                 'max_depth': 5, 'alpha': 10}

# cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
#                     num_boost_round=1000,early_stopping_rounds=200,metrics="rmse", as_pandas=True, seed=42)

Wall time: 3min 6s


In [6]:
# cv_results.tail()

# cv_results['test-rmse-mean'].min()

### Hyperopt CV for XGBoost

In [62]:
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval

def hyperopt_train(params):
    model = xgb.XGBRegressor(**params)
    return -1 * cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", n_jobs=-1, cv=5).mean()

best = 0
def objective(params):
    global best
    rmse = hyperopt_train(params)
    if rmse < best:
        best = rmse
        print('new best: ', best, params)
    return {'loss': rmse, 'status': STATUS_OK}

In [63]:
%%time
space = {
    'objective': 'reg:linear',
    'n_estimators': hp.choice('n_estimators', range(100, 500)),
    'max_depth': hp.choice('max_depth', range(5, 10)),
    'gamma': hp.quniform('gamma', 0, 0.5, 0.01),
    'tree_method': 'gpu_hist',
    'random_seed': 42
}

trials = Trials()
best_params = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=5,
            trials=trials
           )
params = space_eval(space, trials.argmin)
print("Best Params: ", params)
print("Best RMSE: ", best)

100%|███████████████████████████████████████████████████| 5/5 [01:52<00:00, 22.52s/trial, best loss: 0.8571415219557454]
Best Params:  {'gamma': 0.32, 'max_depth': 2, 'n_estimators': 46}
Best RMSE:  0
Wall time: 1min 52s


### 450 estimators cuz idk how to set early stopping on XGBRegressor.fit()

In [19]:
model = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 450, seed=42)

In [20]:
test = make_train(test)
test_X = test.drop(['id'], axis=1)

def predict_test(model, X, y):
    model.fit(X, y)
    pred = model.predict(test_X)
#     pred.to_csv('test_pred.csv')
    return pred

In [21]:
%%time
pred = predict_test(model, X, y)
pred_df = pd.DataFrame(test['id'], columns=['id'])
pred_df['target'] = pred
pred_df.to_csv('xgb_results.csv', index=False)

Wall time: 1min 3s


In [10]:
# !kaggle competitions submit -c tabular-playground-series-feb-2021 -f xgb_results.csv -m "xgb minimal processing"

Successfully submitted to Tabular Playground Series - Feb 2021



  0%|          | 0.00/3.29M [00:00<?, ?B/s]
  0%|          | 8.00k/3.29M [00:00<00:57, 60.2kB/s]
 11%|#         | 368k/3.29M [00:00<00:35, 85.3kB/s] 
 30%|###       | 1.00M/3.29M [00:00<00:19, 121kB/s]
 54%|#####3    | 1.77M/3.29M [00:00<00:09, 172kB/s]
 66%|######5   | 2.16M/3.29M [00:02<00:06, 179kB/s]
100%|##########| 3.29M/3.29M [00:05<00:00, 669kB/s]
