In [1]:
from os import getcwd
import pandas
import math
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt

# 1.

df = pandas.read_csv(getcwd() + "/LWs/LW_7/data/gbm-data.csv")
y = df['Activity'].values
X = df.loc[:, 'D1':'D1776'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)



In [28]:
# 2.

def sigmoid(y_pred):
    return 1.0 / (1.0 + math.exp(-y_pred))

def log_loss_results(model, X, y):
    results = []
    for pred in model.staged_decision_function(X):
        results.append(log_loss(y, [sigmoid(y_pred) for y_pred in pred]))
    return results

def plot_loss(learning_rate, test_loss, train_loss):
    plt.figure()
    plt.plot(test_loss, 'm', linewidth=2)
    plt.plot(train_loss, 'c', linewidth=2)
    plt.xlabel("iterations")
    plt.ylabel("loss")
    plt.legend(['test', 'train'])
    plt.title("learning rate  " + str(learning_rate))
    plt.savefig(getcwd() + '/LWs/LW_7/plots/' + str(learning_rate) + '.png')
    plt.show()
    min_loss_value = min(test_loss)
    min_loss_index = test_loss.index(min_loss_value)
    return min_loss_value, min_loss_index


def model_test(learning_rate):
    model = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=250,
                                       verbose=True, random_state=241)
    model.fit(X_train, y_train)
    train_loss = log_loss_results(model, X_train, y_train)
    test_loss = log_loss_results(model, X_test, y_test)
    return plot_loss(learning_rate, test_loss, train_loss)

min_loss_results = {}
for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    print("learning rate " + str(learning_rate), end="\n\n\n")
    min_loss_results[learning_rate] = model_test(learning_rate)

learning rate 1


      Iter       Train Loss   Remaining Time 
         1           1.0190           16.79s


         2           0.9192           17.48s
         3           0.8272           17.23s
         4           0.7834           17.26s


         5           0.7109           17.10s
         6           0.6368           18.28s


         7           0.5797           19.15s
         8           0.5610           19.69s
         9           0.5185           19.41s


        10           0.4984           19.33s


        20           0.1999           16.69s


        30           0.1313           15.34s


        40           0.0790           13.54s


        50           0.0511           12.30s


        60           0.0352           11.53s


        70           0.0245           10.93s


        80           0.0162           10.13s


        90           0.0114            9.40s


       100           0.0077            8.70s


       200           0.0004            2.45s


learning rate 0.5


      Iter       Train Loss   Remaining Time 


         1           1.1255           33.19s
         2           1.0035           29.17s


         3           0.9386           27.84s
         4           0.8844           25.50s
         5           0.8381           23.49s
         6           0.7995           21.79s


         7           0.7559           21.16s
         8           0.7205           20.61s
         9           0.6958           19.69s
        10           0.6725           18.88s


        20           0.4672           14.87s


        30           0.3179           13.18s


        40           0.2274           12.10s


        50           0.1774           11.28s


        60           0.1394           10.74s


        70           0.1050           10.08s


        80           0.0805            9.48s


        90           0.0650            8.82s


       100           0.0511            8.25s


       200           0.0058            2.63s


learning rate 0.3


      Iter       Train Loss   Remaining Time 
         1           1.2095           18.54s


         2           1.1006           20.14s
         3           1.0240           19.90s
         4           0.9729           18.66s
         5           0.9387           17.40s


         6           0.8948           18.00s
         7           0.8621           17.35s
         8           0.8360           16.67s
         9           0.8171           16.18s


        10           0.7883           15.82s


        20           0.6164           13.56s


        30           0.4933           12.64s


        40           0.4248           11.88s


        50           0.3345           11.25s


        60           0.2760           10.90s


        70           0.2263           10.41s


        80           0.1971            9.69s


        90           0.1693            8.99s


       100           0.1388            8.37s


       200           0.0294            2.70s


learning rate 0.2


      Iter       Train Loss   Remaining Time 


         1           1.2613           34.81s
         2           1.1715           33.03s


         3           1.1009           29.23s
         4           1.0529           28.14s
         5           1.0130           27.01s


         6           0.9740           26.09s
         7           0.9475           25.29s


         8           0.9197           26.02s
         9           0.8979           25.93s


        10           0.8730           26.31s


        20           0.7207           23.77s


        30           0.6055           20.64s


        40           0.5244           18.25s


        50           0.4501           16.42s


        60           0.3908           15.02s


        70           0.3372           14.07s


        80           0.3009           13.25s


        90           0.2603           12.27s


       100           0.2327           11.75s


       200           0.0835            3.78s


learning rate 0.1


      Iter       Train Loss   Remaining Time 
         1           1.3199           25.49s


         2           1.2645           24.78s
         3           1.2170           22.90s
         4           1.1775           21.66s


         5           1.1404           21.21s
         6           1.1106           21.36s
         7           1.0844           21.14s


         8           1.0617           21.09s
         9           1.0411           20.83s
        10           1.0223           20.40s


        20           0.8864           17.66s


        30           0.7844           15.68s


        40           0.7176           14.36s


        50           0.6590           13.24s


        60           0.6120           12.48s


        70           0.5599           11.69s


        80           0.5242           10.92s


        90           0.4829           10.21s


       100           0.4473            9.65s


       200           0.2379            3.12s


In [29]:
# 3. Переобучение. Для лучшего рез-тата необходимо сокращать число итераций (Искл. : learning rate 0.001)

In [30]:
# 4.

min_loss_value, min_loss_index = min_loss_results[0.2]
print('loss index: {}, loss value: {:0.2f}'.format(min_loss_index, min_loss_value))

loss index: 36, loss value: 0.53


In [31]:
# 5.

model = RandomForestClassifier(n_estimators=min_loss_index, random_state=241)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
test_loss = log_loss(y_test, y_pred)
print("test loss:", test_loss, sep="\t")

test loss:	0.5413812861804069
