# Massive scale regression: (TODO not yet finished)
**Warning:** this dataset will occupy 80GB on your computer. Check that the download location (currently '../data') is appropriate for data this size. You will also need about 20GB of RAM to run this code. 

The taxi data set consists of 1.21 billion yellow taxi journeys in New York. We got the data from http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml

The processing was as follows:
- We extracted the following features: time of day; day of the week; day of the month; month; pick-up latitude and longitude; drop-off latitude and longitude; travel distance; journey time (the target)
- We discarded journeys that are shorter than 10 s or longer than 5 h, or that start/end outside the New York region, which we take to be within a squared distance of $5^\circ$ of the centre of New York
- As we read in the data we calculated $\sum x$ and $\sum x^2$. These are in the file `taxi_data_stats.p`. We use these for normalizing the data. In the paper we normalise the outputs and restore the scaling, but here we use a mean function and set the variance accordingly. 
- We shuffled the entire data set (we used a machine with 224GB of memory to do this) and then split the data into 101 files each with $10^7$ lines. We use the first 100 chunks for training and the final chunk for testing.

To use this data set manageably on a standard machine we read in two chunks at a time, the second loading asynchronously while the first chunk is used for training. We have a special `DataHolder` class for this.





In [1]:
import sys
sys.path.append('../src')

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline 

from GPflow.likelihoods import Gaussian
from GPflow.kernels import RBF, White
from GPflow.mean_functions import Constant, Zero
from GPflow.svgp import SVGP
from GPflow.param import DataHolder, Parentable

from scipy.cluster.vq import kmeans2
from get_data import get_taxi_data, get_taxi_stats

from threading import Thread
from Queue import Queue

from dgp import DGP
import time

In [3]:
def wrapper(func, arg, queue):
    """Call ``func(arg)`` and hand the result over through ``queue``.

    Intended as a thread target: the return value of ``func`` would otherwise
    be lost, so it is pushed onto ``queue`` for the caller to collect.

    Args:
        func: callable taking a single positional argument.
        arg: the argument passed to ``func``.
        queue: any object with a ``put`` method (e.g. a ``Queue``).
    """
    result = func(arg)
    queue.put(result)

class TaxiData(DataHolder):
    """Minibatch feeder that streams the taxi data one chunk at a time.

    A single chunk is kept in memory and sliced into consecutive minibatches
    while the following chunk is fetched on a background thread. Each row
    holds the input columns followed by one final column that is passed
    through untouched; the input columns are shifted by ``X_mean`` and scaled
    by ``X_std`` (from ``get_taxi_stats``) right before a minibatch is fed.
    """

    def __init__(self, minibatch_size=10000):
        """Load chunk 0, start prefetching chunk 1 and read the input stats.

        Args:
            minibatch_size: number of rows fed per call to
                ``update_feed_dict``.
        """
        # Only Parentable is initialised; DataHolder.__init__ is bypassed
        # (presumably because it expects an in-memory array -- confirm).
        Parentable.__init__(self)

        # Minibatch window bookkeeping.
        self._shape = [minibatch_size, 10]
        self.minibatch_size = minibatch_size
        self.counter = 0        # row offset of the next minibatch in the chunk
        self.chunk_counter = 0  # index of the chunk currently being prefetched

        # Size of the training stream and of each chunk of it.
        self.num_data = int(10 ** 9)
        self.chunk_size = int(10 ** 7)
        self.num_chunks = self.num_data // self.chunk_size

        # First chunk is read synchronously, its successor in the background.
        self.current_chunk = get_taxi_data(0)
        self.chunk_counter += 1
        self.start_get_chunk(self.chunk_counter)

        self.X_mean, self.X_std = get_taxi_stats()

    def start_get_chunk(self, i):
        """Start loading chunk ``i`` on a new thread.

        The loaded chunk is delivered through a fresh queue stored in
        ``self.next_chunk_queued``.
        """
        self.next_chunk_queued = Queue()
        loader = Thread(target=wrapper,
                        args=(get_taxi_data, i, self.next_chunk_queued))
        loader.start()

    def get_chunk(self, i):
        """Synchronously load chunk ``i`` and return it with whitened inputs."""
        raw = get_taxi_data(i)
        return self.whiten_X(raw)

    def whiten_X(self, data):
        """Return ``data`` with every column but the last standardised.

        The last column is appended unchanged.
        """
        inputs = data[:, :-1]
        last_column = data[:, -1:]
        whitened = (inputs - self.X_mean) / self.X_std
        return np.concatenate([whitened, last_column], axis=1)

    def _get_type(self):
        """dtype used for the placeholder."""
        return np.float64

    def make_tf_array(self):
        """Create the placeholder that minibatches are fed into.

        The number of rows is left unspecified; the number of columns comes
        from ``self._shape``.
        """
        num_columns = self._shape[1]
        self._tf_array = tf.placeholder(dtype=self._get_type(),
                                        shape=[None, num_columns],
                                        name=self.name)

    @property
    def value(self):
        """Not available: the data are never held as one array."""
        raise NotImplementedError

    @property
    def size(self):
        """Number of elements in one minibatch."""
        return np.prod(self.shape)

    @property
    def shape(self):
        """``[minibatch_size, number of columns]``."""
        return self._shape

    def __str__(self, prepend='Data:'):
        """Short description; the data themselves are not printed."""
        pieces = [prepend, '\033[1m', self.name, '\033[0m',
                  '\n data not printed!']
        return ''.join(pieces)

    def update_feed_dict(self, key_dict, feed_dict):
        """Place the next whitened minibatch into ``feed_dict``.

        When the current chunk cannot supply a full minibatch, the prefetched
        chunk is swapped in (blocking until it has arrived), loading of the
        one after it is started, and reading restarts from row 0.

        Args:
            key_dict: mapping from this object to its feed key.
            feed_dict: dictionary that receives the minibatch.
        """
        batch = self.minibatch_size

        if self.counter + batch > self.chunk_size:
            self.current_chunk = self.next_chunk_queued.get()
            # Chunk indices wrap around so training can cycle indefinitely.
            self.chunk_counter = (self.chunk_counter + 1) % self.num_chunks
            self.start_get_chunk(self.chunk_counter)
            self.counter = 0

        first = self.counter
        last = first + batch
        self.counter = last

        minibatch = self.current_chunk[first:last, :]
        feed_dict[key_dict[self]] = self.whiten_X(minibatch)



In [4]:
# Set up the streaming training data, then load chunk 101 (inputs whitened by
# get_chunk) and split it into inputs and the final column.
taxi_data = TaxiData()
test_data = taxi_data.get_chunk(101)
Xs = test_data[:, :-1]
Ys = test_data[:, -1:]

Downloading file: taxi_data_shuffled_101.csv
Downloaded file: taxi_data_shuffled_101.csv


We'll use the first $10^6$ points from the first chunk for k-means.

In [5]:
# The model only ever sees whitened inputs (TaxiData whitens every minibatch
# and the test chunk), so the inducing points must be initialised in that same
# whitened input space rather than from the raw columns.
Z_init_inputs = taxi_data.whiten_X(taxi_data.current_chunk[:int(1e6)])[:, :-1]
Z = kmeans2(Z_init_inputs, 100, minit='points')[0]

To create a single-layer model we need to slightly modify the base SVGP.

In [None]:
class MassiveDataSVGP(SVGP):
    """SVGP that takes its training data from a streaming dataholder.

    The base class is constructed with dummy 1x1 arrays for X and Y; these are
    removed again and X and Y are instead sliced from ``dataholder`` each time
    the likelihood is built.
    """

    def __init__(self, dataholder, kernel, likelihood, Z, q_diag=False,
                 whiten=True, num_latent=1, mean_function=None):
        """
        Args:
            dataholder: object providing the data; must expose ``num_data``
                and support slicing into input columns and a final target
                column when the likelihood is built.
            kernel, likelihood, Z, q_diag, whiten, num_latent: forwarded
                unchanged to ``SVGP.__init__``.
            mean_function: mean function forwarded to ``SVGP.__init__``.
                ``None`` (the default) means a fresh ``Zero()``.
        """
        # A fresh Zero() per model instead of one instance shared through the
        # default argument.
        if mean_function is None:
            mean_function = Zero()

        # mean_function must be forwarded explicitly, otherwise the base class
        # silently falls back to its own default and the argument is ignored.
        SVGP.__init__(self, np.zeros((1, 1)), np.zeros((1, 1)), kernel,
                      likelihood, Z, mean_function=mean_function,
                      q_diag=q_diag, whiten=whiten, num_latent=num_latent)

        # Drop the placeholder data registered by the base constructor.
        del self.X
        del self.Y

        self.dataholder = dataholder
        # Override the data count taken from the dummy arrays.
        self.num_data = dataholder.num_data

    def build_likelihood(self):
        """Bind X and Y to the dataholder columns, then defer to SVGP."""
        self.X = self.dataholder[:, :-1]
        self.Y = self.dataholder[:, -1, None]
        return SVGP.build_likelihood(self)
        
# Empirical mean and standard deviation of the last column (the target) of the
# chunk currently in memory.
first_chunk_targets = taxi_data.current_chunk[:, -1]
Y_mean = np.mean(first_chunk_targets)
Y_std = np.std(first_chunk_targets)

# Single-layer model: RBF kernel over the 9 inputs, Gaussian likelihood, and a
# constant mean function set to the empirical target mean.
m_sgp = MassiveDataSVGP(
    taxi_data,
    RBF(9),
    Gaussian(),
    Z.copy(),
    whiten=True,
    q_diag=False,
    mean_function=Constant(Y_mean),
)

# Initialise the likelihood variance at the empirical target variance.
m_sgp.likelihood.variance = Y_std ** 2



In [None]:
# Run 10000 Adam iterations with learning rate 0.01.
adam = tf.train.AdamOptimizer(0.01)
m_sgp.optimize(adam, maxiter=10000)

# Report the objective after training.
final_log_likelihood = m_sgp.compute_log_likelihood()
print(final_log_likelihood)

  result = np.log(np.exp(y - self._lower) - np.ones(1, np_float_type))


Downloading file: taxi_data_shuffled_6.csv
Downloaded file: taxi_data_shuffled_6.csv
Downloading file: taxi_data_shuffled_7.csv
