# Otto Group Product Classification Challenge using nolearn/lasagne

This short notebook is meant to help you get started with nolearn and lasagne in order to train a neural net and make a submission to the Otto Group Product Classification Challenge.

* [Otto Group Product Classification Challenge](https://www.kaggle.com/c/otto-group-product-classification-challenge)
* [Get the notebook from the Otto Group repository](https://github.com/ottogroup)
* [Nolearn repository](https://github.com/dnouri/nolearn)
* [Lasagne repository](https://github.com/benanne/Lasagne)
* [A nolearn/lasagne tutorial for convolutional nets](http://danielnouri.org/notes/2014/12/17/using-convolutional-neural-nets-to-detect-facial-keypoints-tutorial/)

## Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import theano

In [2]:
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum, sgd, adagrad
from nolearn.lasagne import NeuralNet

from sklearn.cross_validation import train_test_split
import random

# Cross validation

## Log loss

In [7]:
# Read the sample submission file. Neither sample_sub nor sample_sub_df is
# referenced again in the visible cells of this notebook.
sample_sub = "submissions/sampleSubmission.csv"
sample_sub_df = pd.read_csv(sample_sub)

def normalize(row, epsilon=1e-15):
    """Rescale one row of scores to sum to 1, then bound every entry.

    Parameters
    ----------
    row : array-like of numbers (e.g. a numpy array or a pandas Series).
    epsilon : float
        Entries are forced into the interval [epsilon, 1 - epsilon] so that
        a later logarithm never sees 0.

    Returns
    -------
    The rescaled and bounded row, same container type as ``row / sum``.
    """
    scaled = row / np.sum(row)
    # Lower bound first, then upper bound -- same order as before.
    return np.minimum(1 - epsilon, np.maximum(epsilon, scaled))
    
def logloss_mc(y_true, y_probs):
    """Mean negative log-probability assigned to the true class.

    Parameters
    ----------
    y_true : iterable of str
        Labels of the form ``"<prefix>_<k>"``; the integer ``k`` after the
        first underscore is used as a 1-based column number.
    y_probs : pandas.DataFrame
        One row per entry of ``y_true``; column ``k - 1`` (by position)
        holds the score for class ``k``.

    Returns
    -------
    float
        Average of ``-log(p_true)`` over all rows, where each row has first
        been passed through ``normalize``.
    """
    bounded = y_probs.apply(normalize, axis=1)

    losses = [
        - np.log(bounded.iloc[position, int(label.split("_")[1]) - 1])
        for position, label in enumerate(y_true)
    ]

    return np.mean(losses)
        

# Manually split data and keep forever

In [10]:
# Raw training CSV kept under its own name; df_man is not read again in the
# visible cells of this notebook.
df_man = pd.read_csv("data/train.csv")

## Load data

In [4]:
def load_train_data_non_lasagne(df, train_size=0.8, percentage=1, standardize=False):
    """Randomly sub-sample a training frame and split it into train/validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'`` and ``'target'``; every other
        column is treated as a feature.
    train_size : float
        Forwarded to ``train_test_split``.
    percentage : float
        Fraction of the rows of ``df`` to sample (without replacement)
        before splitting.
    standardize : bool
        When true, features are passed row-wise through ``log_normalize``
        and then through a freshly fitted ``StandardScaler``.

    Returns
    -------
    tuple
        ``(X_train, X_valid, y_train, y_valid)``; the two label series are
        cast to ``str``.
    """
    if standardize:
        # NOTE(review): log_normalize is not defined in the visible part of
        # this notebook -- confirm it exists before using standardize=True.
        X = df.drop(['id', 'target'], axis=1).apply(func=log_normalize, axis=1)
        X = StandardScaler().fit_transform(X)
        # Keep the caller's row labels: the two assignments below align on
        # the index, so a fresh 0..n-1 index would mis-assign (or NaN-fill)
        # 'id' and 'target' whenever df is not default-indexed.
        X = pd.DataFrame(X, index=df.index)
        X.loc[:, 'id'] = df.loc[:, 'id']
        X.loc[:, 'target'] = df.loc[:, 'target']
        df = X

    num_samples = int(len(df) * percentage)

    # random.sample needs a real sequence on current Python versions, so the
    # index labels are materialised as a list first.
    sample_rows = random.sample(list(df.index), num_samples)

    # sample_rows holds index labels, so select by label (.ix is gone from
    # current pandas releases).
    df_sampled = df.loc[sample_rows]

    X_train, X_valid, y_train, y_valid = train_test_split(df_sampled.drop(['id', 'target'], axis=1),
                                                          df_sampled.target,
                                                          train_size=train_size)

    return (X_train, X_valid,
            y_train.astype(str), y_valid.astype(str))

## Utility functions

## Get random rows

In [5]:
# Split the full training CSV with the function's defaults (train_size=0.8,
# percentage=1, standardize=False). The sampling is random and no seed is set
# in the visible cells, so the split differs between runs.
X_train, X_valid, y_train, y_valid = load_train_data_non_lasagne(pd.read_csv("data/train.csv"))

In [29]:
def load_train_data_cross_validation(X_train, y_train):
    """Turn an already-split training set into network-ready arrays.

    Parameters
    ----------
    X_train : object with ``astype`` (array or DataFrame) holding features.
    y_train : labels accepted by ``LabelEncoder.fit_transform``.

    Returns
    -------
    tuple
        ``(X, y, encoder, scaler)``: features cast to float32 and passed
        through a newly fitted ``StandardScaler``, labels encoded by a newly
        fitted ``LabelEncoder`` and cast to int32, plus both fitted objects
        so the same transforms can be reused later.
    """
    features = X_train.astype(np.float32)

    encoder = LabelEncoder()
    y = encoder.fit_transform(y_train).astype(np.int32)

    scaler = StandardScaler()
    X = scaler.fit_transform(features)

    return X, y, encoder, scaler

In [30]:
def load_test_data_cross_validation(X_valid, scaler):
    """Scale a held-out feature set and generate row ids for it.

    Parameters
    ----------
    X_valid : object with ``astype`` and ``len`` (array or DataFrame).
    scaler : object with a ``transform`` method (already fitted).

    Returns
    -------
    tuple
        ``(X_valid, ids)``: the features cast to float32 and passed through
        ``scaler.transform``, and the strings ``"1" .. "n"`` where ``n`` is
        the number of rows in ``X_valid``.
    """
    # Size the ids from the argument itself; the previous version read the
    # notebook-level y_valid, which fails (or silently mismatches) whenever
    # that global is missing or belongs to a different split.
    ids = np.arange(1, len(X_valid) + 1).astype(str)
    X_valid = scaler.transform(X_valid.astype(np.float32))
    return X_valid, ids

In [31]:
def load_train_data(path):
    """Read a training CSV and return shuffled, scaled features and labels.

    The first column is skipped, the last column is taken as the label and
    everything in between is treated as a feature.

    Parameters
    ----------
    path : CSV location passed to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(X, y, encoder, scaler)``: float32 features after a newly fitted
        ``StandardScaler``, int32 labels from a newly fitted
        ``LabelEncoder``, and both fitted objects.
    """
    table = pd.read_csv(path).values.copy()
    # Rows are shuffled in place with numpy's global random state.
    np.random.shuffle(table)

    features = table[:, 1:-1].astype(np.float32)
    labels = table[:, -1]

    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)

    scaler = StandardScaler()
    X = scaler.fit_transform(features)

    return X, y, encoder, scaler

In [32]:
def load_test_data(path, scaler):
    """Read a test CSV and return scaled features plus the id column.

    Parameters
    ----------
    path : CSV location passed to ``pandas.read_csv``.
    scaler : object with a ``transform`` method (already fitted).

    Returns
    -------
    tuple
        ``(X, ids)``: every column after the first, cast to float32 and
        passed through ``scaler.transform``; and the first column converted
        to strings.
    """
    table = pd.read_csv(path).values.copy()
    features = table[:, 1:].astype(np.float32)
    ids = table[:, 0].astype(str)
    return scaler.transform(features), ids

In [33]:
def make_submission(clf, X_test, ids, encoder, name='my_neural_net_submission.csv'):
    """Write class probabilities for ``X_test`` to a CSV submission file.

    Parameters
    ----------
    clf : object with ``predict_proba(X_test)`` returning one row of
        probabilities per sample (rows must support ``tolist()``).
    X_test : feature matrix handed to ``clf.predict_proba``.
    ids : iterable of row identifiers, one per row of ``X_test``.
    encoder : object whose ``classes_`` holds the class names (strings) used
        as column headers.
    name : str
        Output path; the file is overwritten.

    The file has a header ``id,<class names...>`` followed by one line per
    sample. A confirmation message is printed when done.
    """
    y_prob = clf.predict_proba(X_test)
    with open(name, 'w') as f:
        f.write('id,')
        f.write(','.join(encoder.classes_))
        f.write('\n')
        # row_id instead of `id` so the builtin is not shadowed.
        for row_id, probs in zip(ids, y_prob):
            # Build a real list: on Python 3 map() returns an iterator, and
            # list + map raises TypeError.
            fields = [str(row_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(name))

In [34]:
def float32(k):
    """Return ``k`` converted to a float32 numpy array (0-d for a scalar).

    Uses ``asarray(...).astype(...)`` rather than ``np.cast``, which is no
    longer available in NumPy 2.0; the result type and copy behaviour are the
    same as before.
    """
    return np.asarray(k).astype(np.float32)

class AdjustVariable(object):
    """Epoch callback that moves a shared variable linearly from start to stop.

    Parameters
    ----------
    name : str
        Attribute name on the network object; the attribute must expose
        ``set_value`` (e.g. a theano shared variable).
    start, stop : float
        Value used at the first epoch and at epoch ``nn.max_epochs``.
    """

    def __init__(self, name, start=0.03, stop=0.001):
        self.name = name
        self.start, self.stop = start, stop
        # Schedule is built lazily because max_epochs is only known once the
        # callback is handed the network.
        self.ls = None

    def __call__(self, nn, train_history):
        """Set ``nn.<name>`` to the scheduled value for the latest epoch.

        ``train_history[-1]['epoch']`` is read as a 1-based epoch number.
        """
        if self.ls is None:
            self.ls = np.linspace(self.start, self.stop, nn.max_epochs)

        epoch = train_history[-1]['epoch']
        # Hold the final value once the epoch counter runs past the schedule
        # instead of raising IndexError.
        index = min(epoch - 1, len(self.ls) - 1)
        new_value = float32(self.ls[index])
        getattr(nn, self.name).set_value(new_value)

class EarlyStopping(object):
    """Epoch callback that aborts training when validation loss stalls.

    The best ``valid_loss`` seen so far is remembered together with its epoch
    and a snapshot of the parameter values. Once more than ``patience``
    epochs have passed since that best epoch, the snapshot is loaded back
    into the network and ``StopIteration`` is raised.
    """

    def __init__(self, patience=100):
        self.patience = patience
        self.best_valid = np.inf
        self.best_valid_epoch = 0
        self.best_weights = None

    def __call__(self, nn, train_history):
        """Inspect the latest history entry; may raise ``StopIteration``."""
        latest = train_history[-1]
        loss = latest['valid_loss']
        epoch = latest['epoch']

        if loss < self.best_valid:
            # New best: record it and snapshot every parameter value.
            self.best_valid, self.best_valid_epoch = loss, epoch
            self.best_weights = [param.get_value() for param in nn.get_all_params()]
            return

        if epoch <= self.best_valid_epoch + self.patience:
            # Still within the patience window.
            return

        print("Early stopping.")
        print("Best valid loss was {:.6f} at epoch {}.".format(
            self.best_valid, self.best_valid_epoch))
        nn.load_weights_from(self.best_weights)
        raise StopIteration()

# Perform cross validation

In [35]:
# Encode the labels and standardize the features of the training split;
# encoder and scaler are kept so the held-out split can reuse them.
X, y, encoder, scaler = load_train_data_cross_validation(X_train, y_train)

In [36]:
# Network dimensions: one output unit per label seen by the encoder and one
# input unit per feature column of X.
num_classes = len(encoder.classes_)
num_features = X.shape[1]

In [37]:
# "Test" data in this section is the held-out validation split, scaled with
# the scaler fitted on the training split; ids are generated row numbers.
X_test, ids = load_test_data_cross_validation(X_valid, scaler)

## Train Neural Net

In [84]:
# Layer stack as (name, class) pairs: input, dropout on the input, three
# dense layers each followed by dropout, then the output layer. The names
# are the prefixes of the per-layer keyword arguments in the NeuralNet call
# below (e.g. dense0_num_units, dropout0_p).
layers0 = [('input', InputLayer),
           ('dropout', DropoutLayer),
           ('dense0', DenseLayer),
           ('dropout0', DropoutLayer),
           ('dense1', DenseLayer),
           ('dropout1', DropoutLayer),
           ('dense2', DenseLayer),
           ('dropout2', DropoutLayer),
           ('output', DenseLayer)]

In [None]:
# Network configuration for the layer stack in layers0. Keyword arguments of
# the form <layer name>_<parameter> configure the layer with that name.
net2 = NeuralNet(layers=layers0,
                 
                 input_shape=(None, num_features),
                 
                 # Dropout probability rises from 0.1 on the input to 0.4
                 # before the output layer.
                 dropout_p=0.1,
                 
                 dense0_num_units=512,
                 dropout0_p=0.2,
                 
                 dense1_num_units=1024,
                 dropout1_p=0.3,
                 
                 dense2_num_units=512,
                 dropout2_p=0.4,
                 
                 output_num_units=num_classes,
                 output_nonlinearity=softmax,
                 
                 update=nesterov_momentum,
                 #update=sgd,
                 # Learning rate and momentum are theano shared variables so
                 # the AdjustVariable callbacks can change them via set_value.
                 update_learning_rate=theano.shared(float32(0.02)),
                 update_momentum=theano.shared(float32(0.9)),


                # The start values below repeat the initial shared values above.
                # NOTE(review): patience=200 is larger than max_epochs=100, so
                # EarlyStopping presumably cannot trigger within one fit() call
                # -- confirm this is intended.
                on_epoch_finished=[
                    AdjustVariable('update_learning_rate', start=0.02, stop=0.001),
                    AdjustVariable('update_momentum', start=0.9, stop=0.9999),
                    EarlyStopping(patience=200),
        ],
                 
                 eval_size=0.2,
                 verbose=1,
                 max_epochs=100)


In [None]:
# Train on the scaled training split (net2 was configured with eval_size=0.2).
net2.fit(X, y)

## Prepare Submission File

In [65]:
# Write the network's class probabilities for the held-out split to disk so
# they can be scored and blended below.
make_submission(net2, X_test, ids, encoder, "cross_validation1.csv")

Wrote submission to file cross_validation1.csv.


In [66]:
# Reload the predictions just written, dropping the first column (the id
# column written by make_submission).
y_probs = pd.read_csv("cross_validation1.csv").iloc[:,1:]

In [75]:
# Multi-class log loss of the neural net on the held-out split.
logloss_mc(y_valid, y_probs)

0.52981008250282702

# Polish submission file

In [341]:
def polish(row, threshold=0.01):
    """Zero out, in place, every entry of ``row`` that is below ``threshold``.

    Parameters
    ----------
    row : numpy array, pandas Series or list of numbers.
    threshold : float
        Entries strictly smaller than this are replaced by 0.

    Returns
    -------
    The same ``row`` object, modified in place.
    """
    if isinstance(row, list):
        # Plain lists do not support boolean-mask assignment.
        row[:] = [0 if x < threshold else x for x in row]
        return row

    # A boolean mask is always positional, so the result does not depend on
    # the Series' labels (integer row[i] on a label-indexed Series is either
    # label-based or a deprecated positional fallback).
    row[np.asarray(row) < threshold] = 0
    return row

In [343]:
# y_probs = y_probs.apply(polish, axis=1)

# Gradient Boosting

In [69]:
import graphlab as gl

In [70]:
# Raw train/test CSVs; in the visible cells only their column names are used
# (to label the GraphLab frames in the next cell).
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [None]:
# Load the data

# Training frame: the scaled training-split features X with the original
# feature column names (train.csv columns minus the first and last), plus
# the integer-encoded labels y as 'target'.
graph_train = pd.DataFrame(X, columns=df_train.columns[1:-1])
graph_train['target'] = y

train = gl.SFrame(graph_train)

# "Test" frame: X_test is the scaled held-out split from the cross-validation
# section, labelled with the test.csv feature column names.
graph_test = pd.DataFrame(X_test, columns=df_test.columns[1:])

test = gl.SFrame(graph_test)
# `sample` is not used again in the visible cells.
sample = gl.SFrame.read_csv('submissions/sampleSubmission.csv')

# Train a boosted-trees classifier on the training frame.
# NOTE(review): validation_set=None presumably turns off GraphLab's own
# validation split -- confirm against the GraphLab documentation.
m = gl.boosted_trees_classifier.create(dataset = train,
                                       target='target',
                                       max_iterations=100,
                                       max_depth = 10,
                                       row_subsample = 0.86,
                                       column_subsample = 0.78,
                                       min_loss_reduction = 1.05,
                                       min_child_weight = 5,
                                       validation_set = None)
 
# Build a submission-shaped table of class probabilities.
# k=9 presumably requests one probability per class -- confirm it matches
# num_classes.
preds = m.predict_topk(test, output_type='probability', k=9)
# Shift ids by one; presumably predict_topk numbers rows from 0 while the ids
# generated for the held-out split start at 1 -- confirm.
preds['id'] = preds['id'].astype(int) + 1
# Presumably reshapes the (id, class, probability) rows into one row per id
# with one column per class -- confirm against the GraphLab SFrame docs.
preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '')
preds = preds.sort('id')

preds.save("graphlab_crazy_cross.csv", format = 'csv')


PROGRESS: Finished parsing file /home/pold/Documents/Radboud/otto-find-ich-gut/submissions/sampleSubmission.csv
PROGRESS: Parsing completed. Parsed 100 lines in 0.14 secs.
------------------------------------------------------PROGRESS:     18   8.679e-01      455.24s
PROGRESS:     19   8.698e-01      466.31s
PROGRESS:     20   8.720e-01      502.06s


In [72]:
# Reload the GraphLab predictions, dropping the first column (presumably the
# id column -- confirm against the saved CSV).
crazy_graphlab = pd.read_csv("graphlab_crazy_cross.csv").iloc[:,1:]

In [74]:
# Multi-class log loss of the boosted-trees model on the held-out split.
logloss_mc(y_valid, crazy_graphlab)

0.54265855159833221

In [89]:
# Fixed blend of the two prediction tables: 10% boosted trees, 90% neural net.
# .values replaces DataFrame.as_matrix(), which current pandas releases no
# longer provide; both return the same underlying array.
absolute_crazy = pd.DataFrame(0.1 * crazy_graphlab.values + 0.9 * y_probs.values)

In [95]:
# Sweep the blend weight p (share given to the boosted-trees predictions)
# from 0 to 1 in steps of 0.1 and print the held-out log loss for each.
# print(...) with a single pre-formatted argument produces the same text on
# Python 2 and Python 3; .values replaces the removed DataFrame.as_matrix().
for p in np.linspace(0, 1, 11):
    print("p is:  {}".format(p))
    combined = pd.DataFrame(p * crazy_graphlab.values +
                            (1 - p) * y_probs.values)
    print(logloss_mc(y_valid, combined))
    print("")

p is:  0.0
0.529810082503

p is:  0.1
0.51785465833

p is:  0.2
0.513138476566

p is:  0.3
0.510888234464

p is:  0.4
0.510467995433

p is:  0.5
0.511619023379

p is:  0.6
0.514251064076

p is:  0.7
0.518390332656

p is:  0.8
0.524185774811

p is:  0.9
0.531982907178

p is:  1.0
0.542658551598



In [88]:
# Log loss of the fixed 0.1 / 0.9 blend on the held-out split.
logloss_mc(y_valid, absolute_crazy)

0.51785465833004651

In [83]:
# Display the blended predictions.
# NOTE(review): the saved output below is a numpy array with entries above 1,
# while the visible definition builds a DataFrame -- this output presumably
# comes from an earlier version of the cell (execution count 83 < 89).
absolute_crazy

array([[ 0.00879195,  1.20103565,  0.6545801 , ...,  0.01923468,
         0.04363248,  0.0081767 ],
       [ 0.02719556,  0.00502506,  0.00359961, ...,  0.02017262,
         0.33962073,  0.0124155 ],
       [ 0.13292915,  0.01942159,  0.00791098, ...,  0.0147196 ,
         1.65915004,  0.05366536],
       ..., 
       [ 0.10482673,  1.33743742,  0.11034114, ...,  0.08099743,
         0.06426095,  0.13559176],
       [ 0.14020137,  0.12392503,  0.09462741, ...,  0.13060482,
         0.32985689,  0.30654467],
       [ 0.4304514 ,  0.02294892,  0.02172343, ...,  0.20669216,
         0.2282868 ,  0.07542733]])