In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from tqdm.notebook import trange, tqdm
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
def create_classification_dataset(n_features=30, n_samples=30000):
    X, y = make_classification(n_features=n_features, n_samples=n_samples)
    features = pd.DataFrame(X)
    for i in range(n_features):
        features = features.rename(columns={i:f'x{i}'})
    target = pd.DataFrame(y).rename(columns={0:'y_true'})
    df = pd.concat((features, target), axis=1)
    return df
df = create_classification_dataset()
feature_names = df.drop('y_true', axis=1).columns.tolist()
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,y_true
0,-0.499515,0.405372,1.299035,0.140961,0.468191,0.038780,1.338528,1.062459,-0.819829,-1.062697,...,-1.021530,1.041727,-0.589672,-0.437354,-1.260741,-1.636807,-0.243491,0.744196,-1.078240,0
1,-2.201899,-0.596634,0.736183,0.135850,0.739103,-0.198157,0.197092,0.210643,-0.365994,1.159906,...,-1.103324,0.063150,-0.505031,1.075885,0.161563,-0.340813,1.106273,-0.479424,-0.186984,1
2,-0.547775,0.555207,1.977821,-2.138625,-0.119406,0.096043,-0.298527,1.607904,-0.691452,1.341169,...,-0.667844,-0.128295,-0.236466,1.304514,-1.652713,0.032829,1.136653,-0.418767,0.629500,1
3,-1.203121,1.593411,-0.006512,-1.289521,-0.159286,-0.816935,2.088056,-1.049304,-0.035831,0.110080,...,0.468345,0.868917,-0.229116,-1.122440,1.569966,0.654019,-0.313876,0.192480,-0.261461,0
4,-0.632963,0.725198,-1.016835,-0.507126,0.471448,-1.410136,0.538082,-1.758508,-0.846035,0.184156,...,0.050622,-1.227718,-0.952063,-0.606980,0.062510,-1.606780,-0.052147,0.422932,1.289834,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,1.438694,-0.803608,0.335663,1.055872,-0.105136,1.031858,-1.867890,-1.043008,1.630294,0.715266,...,0.608681,-1.057433,-2.182554,0.542308,-0.249896,1.342107,2.154113,-0.583993,0.259971,0
29996,-0.407924,1.991230,-1.099367,0.458537,0.550347,1.004145,0.140107,-0.399847,-0.582279,-0.945972,...,0.846935,0.438068,-0.615290,-0.260688,-0.643107,0.737364,-0.691257,1.009570,0.883594,1
29997,1.402627,0.105576,-1.894657,-1.631187,0.520548,-0.430787,-0.857767,-1.899752,0.127136,-0.694511,...,-1.496023,0.522655,0.553650,0.089491,1.100389,0.681255,-0.499845,-0.145279,-0.323935,0
29998,1.396841,0.000258,0.008980,1.855927,-0.406378,1.169250,0.204624,0.414210,-1.301170,0.620415,...,1.052435,0.544630,0.620063,-0.364079,1.130360,0.981009,-0.079696,0.138851,-1.127079,0


In [3]:
kfold= KFold(n_splits=10,random_state=42,shuffle=True) #kfold cross validation, 10-subsets
X_train, X_test, y_train, y_test = train_test_split(df[feature_names], df['y_true'], test_size=0.3,random_state=17)

In [4]:
train = pd.concat((X_train, y_train), axis=1)

In [5]:
test = pd.concat((X_test, y_test), axis=1)

In [6]:
# LEARNING ON TRAIN DATA

In [7]:
list_of_trees = []
iterations = 100
learning_rate = 0.001
for i in trange(iterations, desc='LEARNING PROGRESS'):
    if i == 0:
        train[f'y_pred_{i}'] = train['y_true'].mean()

    train[f'gradient_{i}'] = -(train['y_true'] - train[f'y_pred_{i}'])

    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(train[feature_names], train[f'gradient_{i}'])
    list_of_trees.append(tree)

    train[f'tree_{i+1}_pred'] = tree.predict(train[feature_names])

    train[f'y_pred_{i+1}'] = train[f'y_pred_{i}'] - learning_rate*train[f'tree_{i+1}_pred']

# checking metrics
accuracy = accuracy_score(train['y_true'], (train[f'y_pred_0'] > 0.5).astype('int64'))
roc_auc = roc_auc_score(train['y_true'], train[f'y_pred_0'])
print(f'initial accuracy: {accuracy}')
print(f'initial roc_auc: {roc_auc}')
print('========= next step ========')
accuracy = accuracy_score(train['y_true'], (train[f'y_pred_{iterations}'] > 0.5).astype('int64'))
roc_auc = roc_auc_score(train['y_true'], train[f'y_pred_{iterations}'])
print(f'final accuracy: {accuracy}')
print(f'final roc_auc: {roc_auc}')

LEARNING PROGRESS:   0%|          | 0/100 [00:00<?, ?it/s]

initial accuracy: 0.5020476190476191
initial roc_auc: 0.5
final accuracy: 0.8838095238095238
final roc_auc: 0.8965253122476403


In [8]:
# VALIDATION ON TEST SET

In [9]:
for i in trange(iterations, desc='TEST VALIDATION PROGRESS'):
    if i == 0:
        test[f'y_pred_{i}'] = test['y_true'].mean()

    test[f'gradient_{i}'] = -(test['y_true'] - test[f'y_pred_{i}'])

    test[f'tree_{i+1}_pred'] = list_of_trees[i].predict(test[feature_names])

    test[f'y_pred_{i+1}'] = test[f'y_pred_{i}'] - learning_rate*test[f'tree_{i+1}_pred']

# checking metrics
accuracy = accuracy_score(test['y_true'], (test[f'y_pred_0'] > 0.5).astype('int64'))
roc_auc = roc_auc_score(test['y_true'], test[f'y_pred_0'])
print(f'initial accuracy: {accuracy}')
print(f'initial roc_auc: {roc_auc}')
print('========= next step ========')
accuracy = accuracy_score(test['y_true'], (test[f'y_pred_{iterations}'] > 0.5).astype('int64'))
roc_auc = roc_auc_score(test['y_true'], test[f'y_pred_{iterations}'])
print(f'final accuracy: {accuracy}')
print(f'final roc_auc: {roc_auc}')

TEST VALIDATION PROGRESS:   0%|          | 0/100 [00:00<?, ?it/s]

initial accuracy: 0.5036666666666667
initial roc_auc: 0.5
final accuracy: 0.8786666666666667
final roc_auc: 0.8922617863252004


In [10]:
# FITTING REAL GRADIENT BOOSTING MODEL

In [55]:
xg_reg = xgb.XGBClassifier(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 1, alpha = 10, n_estimators = 1000)

xg_reg.fit(X_train, y_train)

preds = xg_reg.predict(X_test)
preds_prob = xg_reg.predict_proba(X_test)

In [56]:
# checking metrics
accuracy = accuracy_score(test['y_true'], preds)
roc_auc = roc_auc_score(test['y_true'], preds_prob[:,1])
print(f'final accuracy: {accuracy}')
print(f'final roc_auc: {roc_auc}')

final accuracy: 0.9096666666666666
final roc_auc: 0.9590188825463257
