# Gradient Boosting

In [None]:
import numpy as np
np.random.seed(10)

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

In [None]:
# Synthetic classification data: 8000 samples with 20 features each.
X, y = make_classification(n_features=20, n_samples=8000)

# Hold out half of the data for testing, then reserve 30% of the remaining
# half for validation (used below to choose the number of boosting stages).
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.5)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_rest, y_rest, test_size=0.3
)

In [None]:
# Choose the number of weak learners (boosting stages) on the validation set.
#
# Boosting is additive: the first k stages of a larger ensemble are (up to
# random tie-breaking while growing the trees) the model that fitting with
# n_estimators=k would produce. So instead of refitting from scratch for each
# candidate, fit once with the largest candidate and score the intermediate
# stages with staged_predict.
step = 10
max_n_estimator = 490  # largest candidate on the 10, 20, ..., 490 grid
best_n_estimator = step
best_acc = 0.0

grd = GradientBoostingClassifier(n_estimators=max_n_estimator)
grd.fit(X_train, y_train)

# staged_predict yields the predicted labels after stage 1, 2, ..., in order.
for n_estimator, preds in enumerate(grd.staged_predict(X_valid), start=1):
    if n_estimator % step:
        continue  # only score the candidates that lie on the grid
    accuracy = np.sum(preds == y_valid) / len(y_valid)
    print('accuracy = ' + str(accuracy) + ' n_estimator = ' + str(n_estimator))
    # Strict '>' keeps the smallest ensemble among equally accurate ones.
    if accuracy > best_acc:
        best_n_estimator = n_estimator
        best_acc = accuracy

In [None]:
# Refit using the number of stages selected on the validation set, then
# gather every test-set output the following cells rely on.
grd = GradientBoostingClassifier(n_estimators=best_n_estimator).fit(X_train, y_train)

# Hard class labels and raw decision-function scores.
predictions = grd.predict(X_test)
y_score = grd.decision_function(X_test)

# The second predict_proba column (class grd.classes_[1]) feeds the ROC
# curve; the thresholds returned by roc_curve are not needed.
prob_pred = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, prob_pred)

In [None]:
# Test-set accuracy: share of held-out samples whose predicted label
# matches the true label.
n_correct = np.sum(predictions == y_test)
print(n_correct / len(y_test))

In [None]:
# Raw decision-function score of the first test sample.
# For a gradient-boosted tree ensemble this is the additive raw prediction of
# the trees, not a distance to a separating hyperplane (there is none). With
# the default log-loss objective the score is on the log-odds scale: a
# positive value favours class 1 and a negative value class 0.
print(y_score[0])

In [None]:
# Draw the ROC curve of the boosted model on figure 1.
fig = plt.figure(1)
ax = fig.gca()

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr_grd, tpr_grd, label='GBT')

ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
ax.legend(loc='best')

plt.show()