In [1]:
import numpy as np
import pandas as pd
from gboosting import XGBTreeClassifier, XGBTreeRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import time

### Binary classification

In [9]:
def parse_monks(filename):
    """Load a MONK's problem file and one-hot encode its six attributes.

    The file is read as space-separated values without a header. The last
    column is dropped, column 0 is used as the class label and columns 1-6
    are treated as categorical attributes.

    Each attribute is encoded against its full, fixed domain rather than
    against the values that happen to occur in ``filename``. This keeps the
    feature matrix at 17 columns in the same order for every file, so a
    train file and a test file always produce aligned matrices even when one
    of them lacks some attribute value.

    Parameters
    ----------
    filename : str or path-like
        Path to the MONK's data file.

    Returns
    -------
    X : numpy.ndarray of int, shape (n_samples, 17)
        One-hot encoded attributes.
    y : numpy.ndarray, shape (n_samples,)
        Class labels taken from column 0.

    Raises
    ------
    ValueError
        If an attribute column contains a value outside its known domain.
    """
    # Value domain of each attribute column in the MONK's problems.
    attribute_levels = {
        1: [1, 2, 3],
        2: [1, 2, 3],
        3: [1, 2],
        4: [1, 2, 3],
        5: [1, 2, 3, 4],
        6: [1, 2],
    }

    data = pd.read_csv(filename, sep=' ', header=None, skipinitialspace=True)
    data = data.iloc[:, :-1]  # discard the trailing column
    y = data[0].values

    features = data.drop(0, axis=1)
    for column, levels in attribute_levels.items():
        in_domain = features[column].isin(levels)
        if not in_domain.all():
            # Casting to a categorical dtype would silently turn these into
            # NaN (an all-zero encoding), so fail loudly instead.
            unexpected = features.loc[~in_domain, column].unique().tolist()
            raise ValueError(
                f"Unexpected value(s) {unexpected} in attribute column {column} "
                f"of {filename!r}; expected one of {levels}"
            )

    # A categorical dtype with explicit categories makes get_dummies emit one
    # column per level, whether or not the level occurs in this file.
    features = features.astype(
        {column: pd.CategoricalDtype(categories=levels)
         for column, levels in attribute_levels.items()}
    )
    X = pd.get_dummies(features, columns=list(attribute_levels)).values.astype(int)
    return X, y

#### Monk 1

In [10]:
# Parse the MONK-1 train file first, then the test file.
(X_tr, y_tr), (X_ts, y_ts) = map(
    parse_monks,
    ('datasets/Monks/monks-1.train', 'datasets/Monks/monks-1.test'),
)

In [11]:
# Fit the project's XGBTreeClassifier (imported from `gboosting`) on MONK-1.
my_gbm = XGBTreeClassifier(
    n_estimators=100,
    max_depth=4,
    eta=0.5,
    lmbda=0.9,
    gamma=0.0,
)
my_gbm.fit(X_tr, y_tr, verbose=True)

# Report accuracy on the train split, then on the test split.
for tag, feats, target in (('TRAIN\t|', X_tr, y_tr), ('TEST\t|', X_ts, y_ts)):
    print(tag, accuracy_score(target, my_gbm.predict(feats)))

100%|██████████| 100/100 [00:00<00:00, 137.63it/s]

TRAIN	| 1.0
TEST	| 1.0





In [12]:
# Reference run: xgboost's own classifier on MONK-1, using the same numeric
# settings as the XGBTreeClassifier cell for this dataset.
official_xgb = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.5,
    reg_lambda=0.9,
    gamma=0.0,
)
official_xgb.fit(X_tr, y_tr)

# Report accuracy on the train split, then on the test split.
for tag, feats, target in (('TRAIN\t|', X_tr, y_tr), ('TEST\t|', X_ts, y_ts)):
    print(tag, accuracy_score(target, official_xgb.predict(feats)))

TRAIN	| 1.0
TEST	| 1.0


#### Monk 2

In [13]:
# Parse the MONK-2 train file first, then the test file.
(X_tr, y_tr), (X_ts, y_ts) = map(
    parse_monks,
    ('datasets/Monks/monks-2.train', 'datasets/Monks/monks-2.test'),
)

In [19]:
# Fit the project's XGBTreeClassifier (imported from `gboosting`) on MONK-2.
my_gbm = XGBTreeClassifier(
    n_estimators=1000,
    max_depth=8,
    eta=0.1,
    lmbda=0.5,
    gamma=0.0,
)
my_gbm.fit(X_tr, y_tr, verbose=True)

# Report accuracy on the train split, then on the test split.
for tag, feats, target in (('TRAIN\t|', X_tr, y_tr), ('TEST\t|', X_ts, y_ts)):
    print(tag, accuracy_score(target, my_gbm.predict(feats)))

  2%|▏         | 22/1000 [00:01<00:46, 20.88it/s]

In [10]:
# Reference run: xgboost's own classifier on MONK-2.
# NOTE(review): n_estimators=20 / learning_rate=0.5 here differ from the
# XGBTreeClassifier cell for MONK-2 (1000 / 0.1) - confirm this is intended.
official_xgb = xgb.XGBClassifier(
    n_estimators=20,
    max_depth=8,
    learning_rate=0.5,
    reg_lambda=0.5,
    gamma=0.0,
)
official_xgb.fit(X_tr, y_tr)

# Report accuracy on the train split, then on the test split.
for tag, feats, target in (('TRAIN\t|', X_tr, y_tr), ('TEST\t|', X_ts, y_ts)):
    print(tag, accuracy_score(target, official_xgb.predict(feats)))

TRAIN	| 1.0
TEST	| 0.7962962962962963


## Higgs

In [2]:
# Read the first 11000 rows of the header-less Higgs file.
higgs = pd.read_csv('./datasets/HIGGS.csv.gz', header=None, nrows=11000)

# Column 0 is passed as the target, every other column as a feature;
# 80% of the rows go to the training split.
X_tr, X_ts, y_tr, y_ts = train_test_split(
    higgs.drop(columns=0),
    higgs[0],
    train_size=0.8,
    random_state=0,
)

In [3]:
# Sanity check: (rows, columns) of the loaded Higgs sample.
higgs.shape

(11000, 29)

In [4]:
# Baseline: xgboost's classifier on the Higgs sample. No tree_method is
# passed, so the library's default is used.
xgbcls = xgb.XGBClassifier(n_estimators=300,
                           max_depth=4,
                           learning_rate=0.1,
                           reg_lambda=0.1,
                           gamma=0.1,
                           )
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; unlike time.time() it is unaffected by system clock updates.
start = time.perf_counter()
xgbcls.fit(X_tr, y_tr)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)

print('TRAIN\t|', accuracy_score(y_tr, xgbcls.predict(X_tr)))
print('TEST\t|', accuracy_score(y_ts, xgbcls.predict(X_ts)))

0.5426208972930908
TRAIN	| 0.8893181818181818
TEST	| 0.7063636363636364


In [5]:
# Same settings as the previous xgboost cell, but with tree_method='hist'.
xgbcls = xgb.XGBClassifier(n_estimators=300,
                           max_depth=4,
                           learning_rate=0.1,
                           reg_lambda=0.1,
                           gamma=0.1,
                           tree_method='hist'
                           )
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; unlike time.time() it is unaffected by system clock updates.
start = time.perf_counter()
xgbcls.fit(X_tr, y_tr)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)

print('TRAIN\t|', accuracy_score(y_tr, xgbcls.predict(X_tr)))
print('TEST\t|', accuracy_score(y_ts, xgbcls.predict(X_ts)))

0.4938797950744629
TRAIN	| 0.8893181818181818
TEST	| 0.7063636363636364


In [6]:
# Project implementation (XGBTreeClassifier from `gboosting`) on the Higgs
# sample with algorithm='exact'.
cls = XGBTreeClassifier(n_estimators=300,
                        algorithm='exact',
                        max_depth=8,
                        row_subsample=0.1,
                        eta=0.1,
                        lmbda=0.1,
                        gamma=0.1,
                        )

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is unaffected by system clock updates.
start = time.perf_counter()
# The estimator is given plain numpy arrays (.values), not pandas objects.
cls.fit(X_tr.values, y_tr.values, verbose=True, thresh=1e-5)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)

print('TRAIN\t|', accuracy_score(y_tr, cls.predict(X_tr.values)))
print('TEST\t|', accuracy_score(y_ts, cls.predict(X_ts.values)))

 91%|█████████ | 273/300 [11:13<01:06,  2.47s/it]


673.8626136779785
TRAIN	| 0.9364772727272728
TEST	| 0.6518181818181819


In [9]:
# Project implementation (XGBTreeClassifier from `gboosting`) on the Higgs
# sample with algorithm='approx' and epsilon=0.1.
# NOTE(review): the saved output of this cell is
# "ValueError: Unknown algorithm type: approx". The accepted `algorithm`
# names are defined in `gboosting` and not visible here - TODO confirm the
# correct value before re-running.
cls = XGBTreeClassifier(n_estimators=300,
                        algorithm='approx',
                        epsilon=0.1,
                        max_depth=8,
                        row_subsample=0.1,
                        eta=0.1,
                        lmbda=0.1,
                        gamma=0.1,
                        )

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is unaffected by system clock updates.
start = time.perf_counter()
# The estimator is given plain numpy arrays (.values), not pandas objects.
cls.fit(X_tr.values, y_tr.values, verbose=True, thresh=1e-5)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)

print('TRAIN\t|', accuracy_score(y_tr, cls.predict(X_tr.values)))
print('TEST\t|', accuracy_score(y_ts, cls.predict(X_ts.values)))


[A

ValueError: Unknown algorithm type: approx

## Criteo

In [7]:
# Read the first 100000 Criteo rows, replace missing values with 0 and keep a
# random subset of 6000 rows. The sample is seeded so that re-running the
# notebook selects the same rows, and the chain avoids in-place mutation so
# the cell is idempotent.
criteo = (
    pd.read_csv('./datasets/criteo.gz', nrows=100000)
    .fillna(0)
    .sample(6000, random_state=0)
)

# One-hot encode the categorical features: the columns from position 13 up to
# (but excluding) the last one. drop_first=True omits one level per feature.
criteo = pd.get_dummies(criteo, columns=criteo.columns[13:-1], drop_first=True)

# Stratified 80/20 split on the 'label' column.
X_tr, X_ts, y_tr, y_ts = train_test_split(
    criteo.drop(columns=['label']),
    criteo['label'],
    stratify=criteo['label'],
    train_size=0.8,
    random_state=0,
)

In [8]:
# Inspect the column labels of the one-hot encoded Criteo frame.
criteo.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '38_fe56b763', '38_fe913524', '38_ff2edb72', '38_ff34bde9',
       '38_ff7dd6cd', '38_ff86d5e0', '38_ffda9301', '38_ffe5d2de',
       '38_ffef2a68', '38_fffa8e76'],
      dtype='object', length=31512)

In [None]:
# Baseline: xgboost's classifier on the Criteo sample. No tree_method is
# passed, so the library's default is used.
xgbcls = xgb.XGBClassifier(n_estimators=300,
                           max_depth=8,
                           learning_rate=0.1,
                           reg_lambda=0.5,
                           gamma=0.5,
                           )
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; unlike time.time() it is unaffected by system clock updates.
start = time.perf_counter()
xgbcls.fit(X_tr, y_tr)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)

print('TRAIN\t|', accuracy_score(y_tr, xgbcls.predict(X_tr)))
print('TEST\t|', accuracy_score(y_ts, xgbcls.predict(X_ts)))

0.06778019269307455
TRAIN	| 0.934375
TEST	| 0.7991666666666667


In [None]:
# Same settings as the previous xgboost cell, but with tree_method='hist'.
xgbcls = xgb.XGBClassifier(n_estimators=300,
                           max_depth=8,
                           learning_rate=0.1,
                           reg_lambda=0.5,
                           gamma=0.5,
                           tree_method='hist'
                           )
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; unlike time.time() it is unaffected by system clock updates.
start = time.perf_counter()
xgbcls.fit(X_tr, y_tr)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)

print('TRAIN\t|', accuracy_score(y_tr, xgbcls.predict(X_tr)))
print('TEST\t|', accuracy_score(y_ts, xgbcls.predict(X_ts)))

0.2636940638224284
TRAIN	| 0.9370833333333334
TEST	| 0.78


In [None]:
# Project implementation (XGBTreeClassifier from `gboosting`) on the Criteo
# sample with algorithm='exact'.
# NOTE(review): the saved output prints 28.56 while its progress bar shows
# 2:22:43 elapsed at 152/300 - the output looks like it came from an earlier
# version of this cell; re-run to refresh it.
cls = XGBTreeClassifier(n_estimators=300,
                        algorithm='exact',
                        max_depth=4,
                        row_subsample=0.1,
                        eta=0.1,
                        lmbda=0.5,
                        gamma=0.5,
                        )

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is unaffected by system clock updates,
# which matters for a fit that runs for hours.
start = time.perf_counter()
# The estimator is given plain numpy arrays (.values), not pandas objects.
cls.fit(X_tr.values, y_tr.values, verbose=True)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)
print('TRAIN\t|', accuracy_score(y_tr, cls.predict(X_tr.values)))
print('TEST\t|', accuracy_score(y_ts, cls.predict(X_ts.values)))

 51%|█████     | 152/300 [2:22:43<2:18:57, 56.34s/it]


28.564884440104166
TRAIN	| 0.88375
TEST	| 0.7475


In [None]:
# Project implementation (XGBTreeClassifier from `gboosting`) on the Criteo
# sample with algorithm='approx' and epsilon=0.1.
# NOTE(review): the saved output prints 20.90 while its progress bar shows a
# total of 1:44:24 - the output looks like it came from an earlier version of
# this cell; re-run to refresh it.
cls = XGBTreeClassifier(n_estimators=300,
                        algorithm='approx',
                        eta=0.1,
                        max_depth=4,
                        row_subsample=0.1,
                        epsilon=0.1,
                        lmbda=0.5,
                        gamma=0.5,
                        )

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is unaffected by system clock updates,
# which matters for a fit that runs for hours.
start = time.perf_counter()
# The estimator is given plain numpy arrays (.values), not pandas objects.
cls.fit(X_tr.values, y_tr.values, verbose=True)
end = time.perf_counter()

# Elapsed fit time in seconds.
print(end - start)
print('TRAIN\t|', accuracy_score(y_tr, cls.predict(X_tr.values)))
print('TEST\t|', accuracy_score(y_ts, cls.predict(X_ts.values)))

100%|██████████| 300/300 [1:44:24<00:00, 20.88s/it]


20.90076855023702
TRAIN	| 0.8425
TEST	| 0.7633333333333333
