In [1]:
from os import getcwd
import pandas
import math
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt

# 1.

df = pandas.read_csv(getcwd() + "/LWs/LW_7/data/gbm-data.csv")
y = df['Activity'].values
X = df.loc[:, 'D1':'D1776'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)



In [2]:
# 2.

def sigmoid(y_pred):
    return 1.0 / (1.0 + math.exp(-y_pred))

def log_loss_results(model, X, y):
    results = []
    for pred in model.staged_decision_function(X):
        results.append(log_loss(y, [sigmoid(y_pred) for y_pred in pred]))
    return results

def plot_loss(learning_rate, test_loss, train_loss):
    plt.figure()
    plt.plot(test_loss, 'm', linewidth=2)
    plt.plot(train_loss, 'c', linewidth=2)
    plt.xlabel("iterations")
    plt.ylabel("loss")
    plt.legend(['test', 'train'])
    plt.title("learning rate  " + str(learning_rate))
    plt.savefig(getcwd() + '/LWs/LW_7/plots/' + str(learning_rate) + '.png')
    plt.show()
    min_loss_value = min(test_loss)
    min_loss_index = test_loss.index(min_loss_value)
    return min_loss_value, min_loss_index


def model_test(learning_rate):
    model = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=250,
                                       verbose=True, random_state=241)
    model.fit(X_train, y_train)
    train_loss = log_loss_results(model, X_train, y_train)
    test_loss = log_loss_results(model, X_test, y_test)
    return plot_loss(learning_rate, test_loss, train_loss)

min_loss_results = {}
for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    print("learning rate " + str(learning_rate), end="\n\n\n")
    min_loss_results[learning_rate] = model_test(learning_rate)

learning rate 1


      Iter       Train Loss   Remaining Time 
         1           1.0190           20.15s


         2           0.9192           18.23s
         3           0.8272           18.87s
         4           0.7834           20.13s


         5           0.7109           22.48s
         6           0.6368           24.43s


         7           0.5797           25.58s
         8           0.5610           25.69s


         9           0.5185           25.81s
        10           0.4984           25.63s


        20           0.1999           24.99s


        30           0.1313           20.78s


        40           0.0790           17.61s


        50           0.0511           15.46s


        60           0.0352           13.92s


        70           0.0245           12.52s


        80           0.0162           11.49s


        90           0.0114           10.43s


       100           0.0077            9.60s


       200           0.0004            2.51s


learning rate 0.5


      Iter       Train Loss   Remaining Time 
         1           1.1255           16.33s


         2           1.0035           20.55s
         3           0.9386           21.24s
         4           0.8844           18.82s


         5           0.8381           19.66s
         6           0.7995           20.21s
         7           0.7559           20.44s


         8           0.7205           20.66s
         9           0.6958           20.67s
        10           0.6725           20.56s


        20           0.4672           20.47s


        30           0.3179           17.93s


        40           0.2274           16.35s


        50           0.1774           14.32s


        60           0.1394           12.88s


        70           0.1050           11.69s


        80           0.0805           10.63s


        90           0.0650            9.68s


       100           0.0511            8.86s


       200           0.0058            2.57s


learning rate 0.3


      Iter       Train Loss   Remaining Time 
         1           1.2095           25.74s


         2           1.1006           25.00s
         3           1.0240           25.89s
         4           0.9729           23.97s


         5           0.9387           22.75s
         6           0.8948           23.06s
         7           0.8621           21.68s


         8           0.8360           20.27s
         9           0.8171           19.38s
        10           0.7883           18.44s


        20           0.6164           14.11s


        30           0.4933           12.52s


        40           0.4248           11.41s


        50           0.3345           10.62s


        60           0.2760            9.86s


        70           0.2263            9.24s


        80           0.1971            8.58s


        90           0.1693            8.06s


       100           0.1388            7.57s


       200           0.0294            2.41s


learning rate 0.2


      Iter       Train Loss   Remaining Time 
         1           1.2613           19.68s


         2           1.1715           21.61s
         3           1.1009           21.58s
         4           1.0529           20.44s


         5           1.0130           20.05s
         6           0.9740           19.50s
         7           0.9475           18.13s
         8           0.9197           17.47s


         9           0.8979           16.79s
        10           0.8730           16.82s


        20           0.7207           13.74s


        30           0.6055           12.14s


        40           0.5244           11.10s


        50           0.4501           10.28s


        60           0.3908            9.61s


        70           0.3372            9.05s


        80           0.3009            8.47s


        90           0.2603            7.95s


       100           0.2327            7.40s


       200           0.0835            2.38s


learning rate 0.1


      Iter       Train Loss   Remaining Time 
         1           1.3199           20.45s


         2           1.2645           21.63s
         3           1.2170           21.45s
         4           1.1775           20.83s


         5           1.1404           20.20s
         6           1.1106           19.81s
         7           1.0844           19.01s
         8           1.0617           18.22s


         9           1.0411           18.23s
        10           1.0223           17.95s


        20           0.8864           14.90s


        30           0.7844           12.94s


        40           0.7176           11.76s


        50           0.6590           10.77s


        60           0.6120           10.00s


        70           0.5599            9.32s


        80           0.5242            8.68s


        90           0.4829            8.09s


       100           0.4473            7.52s


       200           0.2379            2.38s


In [3]:
# 3. Переобучение. Для лучшего рез-тата необходимо сокращать число итераций (/ Ис

In [4]:
# 4.

min_loss_value, min_loss_index = min_loss_results[0.2]
print('loss index: {}, loss value: {:0.2f}'.format(min_loss_index, min_loss_value))

loss index: 36, loss value: 0.53


In [5]:
# 5.

model = RandomForestClassifier(n_estimators=min_loss_index, random_state=241)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
test_loss = log_loss(y_test, y_pred)
print("test loss:", test_loss, sep="\t")

test loss:	0.5413812861804069
