# Otto Group Product Classification Challenge using nolearn/lasagne

This short notebook is meant to help you get started with nolearn and lasagne in order to train a neural net and make a submission to the Otto Group Product Classification Challenge.

* [Otto Group Product Classification Challenge](https://www.kaggle.com/c/otto-group-product-classification-challenge)
* [Get the notebook from the Otto Group repository](https://github.com/ottogroup)
* [Nolearn repository](https://github.com/dnouri/nolearn)
* [Lasagne repository](https://github.com/benanne/Lasagne)
* [A nolearn/lasagne tutorial for convolutional nets](http://danielnouri.org/notes/2014/12/17/using-convolutional-neural-nets-to-detect-facial-keypoints-tutorial/)

## Imports

In [30]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import theano

In [163]:
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet

from sklearn.cross_validation import train_test_split
import random

In [166]:
# Seed both global generators: random.sample() draws from Python's `random`,
# while np.random.shuffle() and scikit-learn's train_test_split() (when no
# random_state is given) draw from NumPy's global generator.
random.seed(42)
np.random.seed(42)

In [174]:
# Scratch check of the seeded generator; note that this draw advances the global RNG state.
random.randint(1,5)

4

# Cross validation

## Log loss

In [336]:
# Load Kaggle's sample submission file.
# NOTE(review): sample_sub_df is not referenced again in the visible cells.
sample_sub = "submissions/sampleSubmission.csv"
sample_sub_df = pd.read_csv(sample_sub)

def normalize(row, epsilon=1e-15):
    """Rescale ``row`` so it sums to one, then clip it into [epsilon, 1 - epsilon].

    The clipping happens after the rescaling, so the returned values are only
    approximately normalised when an entry had to be clipped.
    """
    shares = row / np.sum(row)
    return np.clip(shares, epsilon, 1 - epsilon)
    
def logloss_mc(y_true, y_probs):
    """Multi-class log loss of the predicted probabilities.

    y_true  -- iterable of labels of the form "<prefix>_<k>" (k is 1-based).
    y_probs -- DataFrame of per-class probabilities, one row per label; the
               probability for class k is read from column position k - 1.

    Every row is passed through ``normalize`` before the loss is taken.
    """
    normalized = y_probs.apply(normalize, axis=1)

    losses = [
        -np.log(normalized.iloc[position, int(label.split("_")[1]) - 1])
        for position, label in enumerate(y_true)
    ]

    return np.mean(losses)
        

## Load data

In [189]:
def load_train_data_non_lasagne(df, train_size=0.8, percentage=1, standardize=False):
    """Sample rows from ``df`` and split them into training and validation parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column, a ``target`` column and the feature columns.
    train_size : float
        Passed through to ``train_test_split``.
    percentage : float
        Fraction of the rows of ``df`` that is sampled (without replacement)
        before splitting.
    standardize : bool
        If true, the features are passed row-wise through ``log_normalize`` and
        then through a freshly fitted ``StandardScaler`` before sampling.

    Returns
    -------
    tuple
        ``(X_train, X_valid, y_train, y_valid)`` with both target parts cast to ``str``.
    """
    if standardize:
        # NOTE(review): log_normalize is not defined in the visible notebook cells;
        # this branch raises NameError unless it is provided elsewhere.
        features = df.drop(['id', 'target'], axis=1).apply(func=log_normalize, axis=1)
        # Keep the caller's index so the id/target assignments below line up
        # row-for-row even when df does not carry a default 0..n-1 index.
        scaled = pd.DataFrame(StandardScaler().fit_transform(features), index=df.index)
        scaled.loc[:, 'id'] = df.loc[:, 'id']
        scaled.loc[:, 'target'] = df.loc[:, 'target']
        df = scaled

    num_samples = int(len(df) * percentage)

    # random.sample() on Python 3 only accepts real sequences, and a pandas
    # Index is not one, so materialise the labels as a list first.
    sample_rows = random.sample(list(df.index), num_samples)

    # .loc replaces the .ix indexer, which has been removed from pandas.
    df_sampled = df.loc[sample_rows]

    X_train, X_valid, y_train, y_valid = train_test_split(
        df_sampled.drop(['id', 'target'], axis=1),
        df_sampled.target,
        train_size=train_size)

    return (X_train, X_valid,
            y_train.astype(str), y_valid.astype(str))

## Utility functions

In [182]:
## Get random rows

In [193]:
# Raw training data.
# NOTE(review): the next cell reads the same CSV again instead of reusing df.
df = pd.read_csv("data/train.csv")

In [262]:
# Hold-out split of the training file using the function's defaults
# (train_size=0.8, percentage=1, standardize=False).
X_train, X_valid, y_train, y_valid = load_train_data_non_lasagne(pd.read_csv("data/train.csv"))

In [265]:
# Number of rows in the validation part of the split.
len(y_valid)

12376

In [266]:
def load_train_data_cross_validation(X_train, y_train):
    """Prepare an already-split training part for the network.

    Features are cast to float32 and standardised with a newly fitted
    StandardScaler; labels are mapped to int32 codes by a newly fitted
    LabelEncoder.

    Returns (X, y, encoder, scaler) so the same encoder and scaler can be
    reused on the validation part.
    """
    features = X_train.astype(np.float32)

    encoder = LabelEncoder()
    y = encoder.fit_transform(y_train).astype(np.int32)

    scaler = StandardScaler()
    X = scaler.fit_transform(features)

    return X, y, encoder, scaler

In [274]:
def load_test_data_cross_validation(X_valid, scaler):
    """Prepare a validation part so it can be fed to ``make_submission``.

    Parameters
    ----------
    X_valid : array-like with ``astype`` and ``len``
        Validation features.
    scaler : object with a ``transform`` method
        The scaler that was fitted on the training part.

    Returns
    -------
    (X_valid, ids)
        The float32 features after ``scaler.transform`` and synthetic ids
        "1".."n" (as strings), one per row of ``X_valid``.
    """
    X_valid = X_valid.astype(np.float32)
    # Size the ids from the argument itself rather than from the notebook
    # global y_valid, so the function also works for other inputs.
    ids = np.arange(1, len(X_valid) + 1).astype(str)
    X_valid = scaler.transform(X_valid)
    return X_valid, ids

In [198]:
def load_train_data(path):
    """Read the training CSV at ``path`` and prepare it for the network.

    The rows are shuffled in place with NumPy's global generator. The first
    column is dropped, the last column is taken as the label and everything
    in between is used as float32 features.

    Returns (X, y, encoder, scaler): standardised features, int32 label codes,
    and the fitted LabelEncoder / StandardScaler.
    """
    rows = pd.read_csv(path).values.copy()
    np.random.shuffle(rows)

    features = rows[:, 1:-1].astype(np.float32)
    labels = rows[:, -1]

    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)

    scaler = StandardScaler()
    X = scaler.fit_transform(features)

    return X, y, encoder, scaler

In [199]:
def load_test_data(path, scaler):
    """Read the test CSV at ``path`` and scale it with an already fitted scaler.

    The first column is returned as string ids; the remaining columns are cast
    to float32 and passed through ``scaler.transform``.

    Returns (X, ids).
    """
    rows = pd.read_csv(path).values.copy()
    features = rows[:, 1:].astype(np.float32)
    ids = rows[:, 0].astype(str)
    return scaler.transform(features), ids

In [200]:
def make_submission(clf, X_test, ids, encoder, name='my_neural_net_submission.csv'):
    """Write the class probabilities predicted by ``clf`` to a CSV file.

    Parameters
    ----------
    clf : object with ``predict_proba``
        Fitted classifier.
    X_test : array-like
        Features handed to ``clf.predict_proba``.
    ids : iterable
        One id per row of ``X_test``; written as the first column.
    encoder : object with ``classes_``
        Supplies the class names used as column headers.
    name : str
        Path of the file to write.
    """
    y_prob = clf.predict_proba(X_test)
    with open(name, 'w') as f:
        f.write('id,')
        f.write(','.join(encoder.classes_))
        f.write('\n')
        for row_id, probs in zip(ids, y_prob):
            # Build a real list: on Python 3 map() returns an iterator, which
            # cannot be concatenated to a list.
            fields = [str(row_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(name))

In [201]:
def float32(k):
    """Return ``k`` converted to a NumPy float32 array (0-d for scalars)."""
    # np.cast was removed in NumPy 2.0; np.asarray with an explicit dtype is
    # the documented replacement and also works on older releases.
    return np.asarray(k, dtype=np.float32)

class AdjustVariable(object):
    """Epoch callback that moves a shared variable linearly from ``start`` to ``stop``.

    ``name`` is the attribute on the net that holds the variable; the variable
    must offer ``set_value``. The schedule has one value per epoch and is
    built on the first call from ``nn.max_epochs``.
    """

    def __init__(self, name, start=0.03, stop=0.001):
        self.name = name
        self.start = start
        self.stop = stop
        # Filled in lazily because max_epochs is only known once the net calls us.
        self.ls = None

    def __call__(self, nn, train_history):
        schedule = self.ls
        if schedule is None:
            schedule = self.ls = np.linspace(self.start, self.stop, nn.max_epochs)

        # Epoch numbers in the history are 1-based, the schedule is 0-based.
        finished_epoch = train_history[-1]['epoch']
        new_value = float32(schedule[finished_epoch - 1])
        getattr(nn, self.name).set_value(new_value)

class EarlyStopping(object):
    """Epoch callback that stops training once the validation loss stalls.

    The best validation loss, its epoch and a snapshot of the weights are
    remembered. When more than ``patience`` epochs have passed since the best
    epoch without an improvement, the snapshot is loaded back into the net and
    StopIteration is raised.
    """

    def __init__(self, patience=100):
        self.patience = patience
        self.best_valid = np.inf
        self.best_valid_epoch = 0
        self.best_weights = None

    def __call__(self, nn, train_history):
        latest = train_history[-1]
        current_valid = latest['valid_loss']
        current_epoch = latest['epoch']

        if current_valid < self.best_valid:
            # New best: remember where we are and snapshot the weights.
            self.best_valid = current_valid
            self.best_valid_epoch = current_epoch
            self.best_weights = [w.get_value() for w in nn.get_all_params()]
            return

        if current_epoch <= self.best_valid_epoch + self.patience:
            # No improvement, but still within the patience window.
            return

        print("Early stopping.")
        print("Best valid loss was {:.6f} at epoch {}.".format(
            self.best_valid, self.best_valid_epoch))
        nn.load_weights_from(self.best_weights)
        raise StopIteration()

## Load Data

In [202]:
# Full training set: shuffled, label-encoded and standardised by load_train_data.
X, y, encoder, scaler = load_train_data('data/train.csv')

In [203]:
# Number of feature columns per row.
len(X[0])

93

In [204]:
# Test features, transformed with the scaler that was fitted on the training data.
X_test, ids = load_test_data('data/test.csv', scaler)

In [205]:
# Output and input sizes of the network, derived from the loaded data.
num_classes = len(encoder.classes_)
num_features = X.shape[1]

# Perform cross validation

In [275]:
# NOTE: rebinds X, y, encoder and scaler to the hold-out training part
# (X_train / y_train); any values previously stored under these names are replaced.
X, y, encoder, scaler = load_train_data_cross_validation(X_train, y_train)

In [355]:
# NOTE: rebinds X_test and ids to the validation part (X_valid) so that the
# predictions can be scored locally against y_valid.
X_test, ids = load_test_data_cross_validation(X_valid, scaler)

## Train Neural Net

In [278]:
# Network topology as (name, layer class) pairs: input -> dense -> dropout -> dense -> output.
# The names are the prefixes of the per-layer keyword arguments given to NeuralNet
# (e.g. dense0_num_units, dropout_p).
layers0 = [('input', InputLayer),
           ('dense0', DenseLayer),
           ('dropout', DropoutLayer),
           ('dense1', DenseLayer),
           ('output', DenseLayer)]

In [279]:
# Two dense layers of 300 units with 50% dropout in between and a softmax output.
net2 = NeuralNet(
    layers=layers0,

    # layer parameters
    input_shape=(None, num_features),
    dense0_num_units=300,
    dropout_p=0.5,
    dense1_num_units=300,
    output_num_units=num_classes,
    output_nonlinearity=softmax,

    # optimization method: Nesterov momentum; learning rate and momentum are
    # shared variables so the callbacks below can change them between epochs
    update=nesterov_momentum,
    update_learning_rate=theano.shared(float32(0.05)),
    update_momentum=theano.shared(float32(0.8)),

    on_epoch_finished=[
        AdjustVariable('update_learning_rate', start=0.05, stop=0.00001),
        AdjustVariable('update_momentum', start=0.8, stop=0.9999),
        # NOTE(review): patience (200) is larger than max_epochs (10) below,
        # so this callback cannot trigger with the current settings.
        EarlyStopping(patience=200),
    ],

    eval_size=0.2,
    verbose=1,
    max_epochs=10,
)


In [280]:
# Train on whatever X / y currently hold; eval_size=0.2 is set on net2 above.
net2.fit(X, y)

  InputLayer        	(None, 93)          	produces      93 outputs
  DenseLayer        	(None, 300)         	produces     300 outputs
  DropoutLayer      	(None, 300)         	produces     300 outputs
  DenseLayer        	(None, 300)         	produces     300 outputs
  DenseLayer        	(None, 9)           	produces       9 outputs

 Epoch  |  Train loss  |  Valid loss  |  Train / Val  |  Valid acc  |  Dur
--------|--------------|--------------|---------------|-------------|-------
     1  |  [94m  0.818763[0m  |  [32m  0.643108[0m  |     1.273134  |     74.71%  |  8.6s
     2  |  [94m  0.658205[0m  |  [32m  0.598454[0m  |     1.099842  |     76.88%  |  8.8s
     3  |  [94m  0.622278[0m  |  [32m  0.582854[0m  |     1.067640  |     77.42%  |  8.5s
     4  |  [94m  0.598871[0m  |  [32m  0.570920[0m  |     1.048958  |     77.60%  |  8.5s
     5  |  [94m  0.587366[0m  |  [32m  0.557764[0m  |     1.053072  |     78.21%  |  8.3s
     6  |  [94m  0.575259[0m  |  [32m  

NeuralNet(X_tensor_type=<function matrix at 0x7fc9ed37f140>,
     batch_iterator_test=<nolearn.lasagne.BatchIterator object at 0x7fc9e9f3cd10>,
     batch_iterator_train=<nolearn.lasagne.BatchIterator object at 0x7fc9e9f3ccd0>,
     dense0_num_units=300, dense1_num_units=300, dropout_p=0.5,
     eval_size=0.2, input_shape=(None, 93),
     layers=[('input', <class 'lasagne.layers.input.InputLayer'>), ('dense0', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout', <class 'lasagne.layers.noise.DropoutLayer'>), ('dense1', <class 'lasagne.layers.dense.DenseLayer'>), ('output', <class 'lasagne.layers.dense.DenseLayer'>)],
     loss=<function negative_log_likelihood at 0x7fc9ea861668>,
     max_epochs=10, more_params={},
     on_epoch_finished=[<__main__.AdjustVariable object at 0x7fc9dd7a4c10>, <__main__.AdjustVariable object at 0x7fc9dd7e34d0>, <__main__.EarlyStopping object at 0x7fc9dd7e3750>],
     on_training_finished=(),
     output_nonlinearity=<theano.tensor.nnet.nnet.Softmax objec

## Prepare Submission File

In [281]:
# Write the predicted class probabilities for X_test to cross_validation1.csv.
make_submission(net2, X_test, ids, encoder, "cross_validation1.csv")

Wrote submission to file cross_validation1.csv.


In [350]:
# Read the predictions back and drop the first (id) column, leaving one column per class.
y_probs = pd.read_csv("cross_validation1.csv").iloc[:,1:]

In [351]:
# Ground-truth labels of the validation part.
y_true = y_valid

In [354]:
# Local multi-class log loss; logloss_mc pairs labels and prediction rows by position.
logloss_mc(y_true, y_probs)

0.70197506680714838

# Polish submission file

In [341]:
def polish(row, threshold=0.01):
    """Set every entry of ``row`` that is below ``threshold`` to 0, in place.

    Parameters
    ----------
    row : list, numpy array or pandas Series
        The values to clean up; the object is modified and returned.
    threshold : float
        Entries strictly smaller than this are replaced by 0.

    Returns
    -------
    The same ``row`` object.
    """
    # For pandas objects write through .iloc: the loop counter is a position,
    # and plain row[i] on a label-indexed Series relies on a deprecated
    # positional fallback.
    target = row.iloc if hasattr(row, 'iloc') else row

    for i, x in enumerate(row):
        if x < threshold:
            target[i] = 0

    return row

In [343]:
# Zero out probabilities below polish's default threshold (0.01), row by row.
y_probs = y_probs.apply(polish, axis=1)

In [348]:
# Preview the first rows of the prediction frame.
y_probs.head()

Unnamed: 0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,0.003692817,1.918713e-14,8.015072e-15,1.195318e-12,1.257551e-13,6.9e-05,1.43851e-07,2.203745e-09,0.8658806
1,7.462026e-06,0.6189851,0.002689075,0.0001289277,5.38896e-06,0.017242,6.305716e-05,7.709063e-06,8.720888e-06
2,4.631211e-06,0.4855975,0.06016642,0.0007889919,8.745908e-07,0.0,1.344715e-05,3.622999e-06,0.0004265114
3,1.222972e-14,1.089753e-14,6.575821e-15,2.454846e-14,3.360689e-18,0.999796,1.431133e-12,9.558473e-09,6.313265e-12
4,2.05927e-05,2.503575e-13,4.841479e-14,4.741045e-13,1.49883e-12,2e-05,3.547771e-11,8.205999e-08,0.9814084


In [353]:
# Element-wise square of every probability.
# NOTE: this rebinds y_probs, so re-running the cell squares the values again.
y_probs = y_probs * y_probs

# Gradient Boosted Trees

In [359]:
import graphlab as gl

In [361]:
# Load the data
# Builds GraphLab SFrames from the arrays currently bound to X, y and X_test.

graph_df = pd.DataFrame(X)
graph_df['target'] = y

train = gl.SFrame(graph_df)
test = gl.SFrame(X_test)
sample = gl.SFrame.read_csv('submissions/sampleSubmission.csv')

# NOTE(review): graph_df is built from X plus a 'target' column only; no 'id'
# column is added in this cell, so this delete looks like it would fail -- confirm.
del train['id']

# Train a model
# Boosted trees classifier on the 'target' column; validation_set=None is passed explicitly.
m = gl.boosted_trees_classifier.create(dataset = train,
                                       target='target',
                                       max_iterations=100,
                                       max_depth = 10,
                                       row_subsample = 0.86,
                                       column_subsample = 0.78,
                                       min_loss_reduction = 1.05,
                                       min_child_weight = 4,
                                       validation_set = None)
 
# Make submission
# k=9 requests a probability for each of the nine classes per row.
preds = m.predict_topk(test, output_type='probability', k=9)
# presumably 'id' is a 0-based row number that is shifted to 1-based here -- verify against the GraphLab docs
preds['id'] = preds['id'].astype(int) + 1
preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '')
preds = preds.sort('id')
 
# NOTE(review): this compares against the row count of the sample submission; if
# X_test holds the validation part rather than the Kaggle test set, the counts differ.
assert sample.num_rows() == preds.num_rows()

preds.save("graphlab_crazy_cross.csv", format = 'csv')


[ERROR] Fatal error. The unity_server process cannot be started. There may have been an issue during installation of graphlab-create. Please uninstall graphlab-create and reinstall it, looking for errors that may occur during installation. If the problem persists, please contact support@dato.com.


AssertionError: Cannot connect to GraphLab Server