# Datasets for experimenting yourself

In [1]:
import urllib2 # import urllib.request as urllib2 in Python3
import requests, io, os, StringIO
import numpy as np
import tarfile, zipfile, gzip


def unzip_from_UCI(UCI_url, dest=''):
    """
    Downloads and unpacks datasets from UCI in zip format
    """
    response = requests.get(UCI_url)
    compressed_file = io.BytesIO(response.content)
    z = zipfile.ZipFile(compressed_file)
    print ('Extracting in %s' %  os.getcwd()+'\\'+dest)
    for name in z.namelist():
        if '.csv' in name:
            print ('\tunzipping %s' %name)
            z.extract(name, path=os.getcwd()+'\\'+dest)

def gzip_from_UCI(UCI_url, dest=''):
    """
    Downloads and unpacks datasets from UCI in gzip format
    """
    response = urllib2.urlopen(UCI_url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    filename = UCI_url.split('/')[-1][:-3]
    with open(os.getcwd()+'\\'+filename, 'wb') as outfile:
        outfile.write(decompressed_file.read())
    print ('File %s decompressed' % filename)
            
def targzip_from_UCI(UCI_url, dest='.'):
    """
    Downloads and unpacks datasets from UCI in tar.gz format
    """
    response = urllib2.urlopen(UCI_url)
    compressed_file = StringIO.StringIO(response.read())
    tar = tarfile.open(mode="r:gz", fileobj = compressed_file)
    tar.extractall(path=dest)
    datasets = tar.getnames()
    for dataset in datasets:
        size = os.path.getsize(dest+'\\'+dataset)
        print ('File %s is %i bytes' % (dataset,size))
    tar.close()

def load_matrix(UCI_url):
    """
    Downloads datasets from UCI in matrix form
    """
    return np.loadtxt(urllib2.urlopen(UCI_url))

In [2]:
import os
print "Current directory is: \"%s\"" % (os.getcwd())

Current directory is: "C:\scisoft\WinPython-64bit-2.7.9.4\notebooks\Packt - Large Scale"


In [3]:
import zlib
from random import shuffle, seed

def ram_shuffle(filename_in, filename_out, header=True, random_seed=0):
    with open(filename_in, 'rb') as f:
        zlines = [zlib.compress(line, 9) for line in f]
        if header:
            first_row = zlines.pop(0)
    seed(random_seed)
    shuffle(zlines)
    with open(filename_out, 'wb') as f:
        if header:
            f.write(zlib.decompress(first_row))
        for zline in zlines:
            f.write(zlib.decompress(zline))

###Bike Sharing Dataset Data Set

In [8]:
UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
unzip_from_UCI(UCI_url, dest='bikesharing')

Extracting in C:\scisoft\WinPython-64bit-2.7.9.4\notebooks\Packt - Large Scale\bikesharing
	unzipping day.csv
	unzipping hour.csv


###Covertype Data Set 

In [10]:
UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
gzip_from_UCI(UCI_url)

File covtype.data decompressed


In [3]:
import os
from random import seed
local_path = os.getcwd()
source = 'covtype.data'
ram_shuffle(filename_in=local_path+'\\'+source, \
                   filename_out=local_path+'\\shuffled_covtype.data', header=False)

#Non-linear & faster with Vowpal Wabbit 

###Useful functions

In [1]:
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def apply_log(x): 
    return np.log(x + 1.0)

def apply_exp(x): 
    return np.exp(x) - 1.0

###Useful dataset examples

In [37]:
with open('house_dataset','wb') as W:
    W.write("0 | price:.23 sqft:.25 age:.05 2006\n")
    W.write("1 2 'second_house | price:.18 sqft:.15 age:.35 1976\n")
    W.write("0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\n")

with open('house_dataset','rb') as R:
    for line in R:
        print line.strip()

0 | price:.23 sqft:.25 age:.05 2006
1 2 'second_house | price:.18 sqft:.15 age:.35 1976
0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924


###A way to call VW from Python

In [2]:
import subprocess

def execute_vw(parameters):
    execution = subprocess.Popen('vw '+parameters, shell=True, stderr=subprocess.PIPE)
    line = ""
    history = ""
    while True:
        out = execution.stderr.read(1)
        history += out
        if out == '' and execution.poll() != None:
            print '------------ COMPLETED ------------\n'
            break
        if out != '':
            line += out
            if '\n' in line[-2:]:
                print line[:-2]
                line = ''
    return history.split('\r\n')


params = "house_dataset"
results = execute_vw(params)

Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = house_dataset
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0   0.0000   0.0000        5
0.666667 1.000000            2            3.0   1.0000   0.0000        5

finished run
number of examples per pass = 3
passes used = 1
weighted example sum = 4.000000
weighted label sum = 2.000000
average loss = 0.750000
best constant = 0.500000
best constant's loss = 0.250000
total feature number = 15
------------ COMPLETED ------------



###Processing examples

In [2]:
import csv

def vw_convert(origin_file, target_file, binary_features, numeric_features, target, transform_target=lambda(x):x,
               separator=',', classification=True, multiclass=False, fieldnames= None, header=True, sparse=True):
    """
    Reads a online style stream and returns a generator of normalized feature vectors
    
    Parameters
    ‐‐‐‐‐‐‐‐‐‐
    original_file = the csv file you are taken the data from 
    target file = the file to stream from
    binary_features = the list of qualitative features to consider
    numeric_features = the list of numeric features to consider
    target = the label of the response variable
    transform_target = a function transforming the response
    separator = the field separator character
    classification = a Boolean indicating if it is classification
    multiclass =  a Boolean indicating if it is multiclass classification
    fieldnames = the fields' labels (can be ommitted and read from file)
    header = a boolean indicating if the original file has an header
    sparse = if a sparse vector is to be returned from the generator
    """
    with open(target_file, 'wb') as W:
        with open(origin_file, 'rb') as R:
            iterator = csv.DictReader(R, fieldnames, delimiter=separator)
            for n, row in enumerate(iterator):
                if not header or n>0:
                # DATA PROCESSING
                    response = transform_target(float(row[target]))
                    if classification and not multiclass:
                            if response == 0:
                                stream_row = '-1 '
                            else:
                                stream_row = '1 '
                    else:
                        stream_row = str(response)+' '
                    quantitative = list()
                    qualitative  = list()
                    for k,v in row.iteritems():
                        if k in binary_features:
                            qualitative.append(str(k)+'_'+str(v)+':1')
                        else:
                            if k in numeric_features and (float(v)!=0 or not sparse):
                                quantitative.append(str(k)+':'+str(v))
                    if quantitative:
                        stream_row += '|n '+' '.join(quantitative)
                    if qualitative:
                        stream_row += '|q ' + ' '.join(qualitative)
                    W.write(stream_row+'\n')

###Examples with toys datasets

In [210]:
import numpy as np
from sklearn.datasets import load_iris, load_boston
from random import seed
iris = load_iris()
seed(2)
re_order = np.random.permutation(len(iris.target))
with open('iris_versicolor.vw','wb') as W1:
    for k in re_order:
        y = iris.target[k]
        X = iris.values()[1][k,:]
        features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])
        target = '1' if y==1 else '-1'
        W1.write(target+features+'\n')

In [203]:
boston = load_boston()
seed(2)
re_order = np.random.permutation(len(boston.target))
with open('boston.vw','wb') as W1:
     for k in re_order:
        y = boston.target[k]
        X = boston.data[k,:]
        features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])
        W1.write(str(y)+features+'\n')

###Binary Iris

In [197]:
params = '--ksvm --l2 0.000001 --reprocess 2 -b 18 --kernel rbf --bandwidth=0.1 -p iris_bin.test -d iris_versicolor.vw'
results = execute_vw(params)

using l2 regularization = 1e-006
predictions = iris_bin.test
Lambda = 1e-006
Kernel = rbf
bandwidth = 0.1
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = iris_versicolor.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0  -1.0000   0.0000        5
0.960606 0.921212            2            2.0  -1.0000  -0.0788        5
1.030685 1.100763            4            4.0  -1.0000  -0.7865        5
0.790707 0.550729            8            8.0  -1.0000  -0.3755        5
0.647808 0.504909           16           16.0  -1.0000  -1.2473        5
0.477695 0.307582           32           32.0   1.0000   0.8621        5
0.319804 0.161914           64           64.0  -1.0000  -1.7015        5
0.272695 0.225585          128          128.0  -1.0000  -1.3150        5

finished run
number of ex

In [198]:
import numpy as np
def sigmoid(x):
    return 1. / (1. + np.exp(-x))

accuracy = 0
with open('iris_bin.test', 'rb') as R:
    with open('iris_versicolor.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if (n+1) % 10==0:
                predicted = float(line.strip())
                y = float(example.split('|')[0])
                accuracy += np.sign(predicted)==np.sign(y)
                holdouts += 1            
print 'holdout accuracy: %0.3f' % ((accuracy / holdouts)**0.5)

holdout accuracy: 0.966


###Boston

In [211]:
params = 'boston.vw -f boston.model --loss_function squared -k --cache_file cache_train.vw --passes=20 --nn 5 --dropout'
results = execute_vw(params)

final_regressor = boston.model
using dropout for neural network training
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache_train.vw
Reading datafile = boston.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
2500.000000 2500.000000            1            1.0  50.0000   0.0000        4
1570.433136 640.866272            2            2.0  26.4000   1.0847        3
945.682968 320.932800            4            4.0  21.0000   3.4834        3
738.617393 531.551817            8            8.0  35.4000   6.9177        4
559.106543 379.595694           16           16.0  23.1000   6.6911        3
362.538769 165.970995           32           32.0  16.7000  12.2397        3
301.716126 240.893483           64           64.0  19.7000  12.3789        3
236.351873 170.987621          128          128.0  16.1000  15.3

In [212]:
params = '-t boston.vw -i boston.model -k --cache_file cache_test.vw -p boston.test'
results = execute_vw(params)

only testing
predictions = boston.test
using dropout for neural network testing
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = cache_test.vw
Reading datafile = boston.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
922.607483 922.607483            1            1.0  50.0000  19.6255        4
464.302045 5.996608            2            2.0  26.4000  23.9512        3
253.949617 43.597188            4            4.0  21.0000  21.2530        3
175.713928 97.478239            8            8.0  35.4000  25.5958        4
130.466937 85.219947           16           16.0  15.2000  15.8726        3
79.291346 28.115755           32           32.0  15.6000  19.7057        4
85.270478 91.249610           64           64.0  22.8000  20.4866        3
83.265921 81.261364          128          128.0  20.8000  18.1267        3
70.838572 58.411224

In [214]:
val_rmse = 0
with open('boston.test', 'rb') as R:
    with open('boston.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if (n+1) % 10==0:
                predicted = float(line.strip())
                y = float(example.split('|')[0])
                val_rmse += (predicted - y)**2
                holdouts += 1            
print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)

holdout RMSE: 7.010


###Bike sharing

In [6]:
import os
local_path = os.getcwd()
b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']
n_vars = ['hum', 'temp', 'atemp', 'windspeed']
source = '\\bikesharing\\hour.csv'
origin = target_file=local_path+'\\'+source
target = target_file=local_path+'\\'+'bike.vw'
vw_convert(origin, target, binary_features=b_vars, numeric_features=n_vars, target = 'cnt', transform_target=apply_log,
               separator=',', classification=False, multiclass=False, fieldnames= None, header=True)

In [45]:
params = 'bike.vw -f regression.model -k --cache_file cache_train.vw --passes=1000 --hash strings --holdout_after 16000'
results = execute_vw(params)

final_regressor = regression.model
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache_train.vw
Reading datafile = bike.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
8.027098 8.027098            1            1.0   2.8332   0.0000       12
7.243733 6.460369            2            2.0   3.7136   1.1718       12
4.184013 1.124293            4            4.0   2.6391   2.4762       12
2.709537 1.235061            8            8.0   1.3863   1.5636       12
2.265795 1.822052           16           16.0   4.7095   3.7598       13
1.325281 0.384768           32           32.0   2.1972   1.5774       13
1.350559 1.375836           64           64.0   5.0626   3.8186       13
1.395717 1.440876          128          128.0   4.2195   4.0547       13
1.165618 0.935518          256          256.0   2.0794   3.348

In [47]:
params = '-t bike.vw -i regression.model -k --cache_file cache_test.vw -p pred.test'
results = execute_vw(params)

only testing
predictions = pred.test
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = cache_test.vw
Reading datafile = bike.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.127379 0.127379            1            1.0   2.8332   3.1901       12
0.751745 1.376112            2            2.0   3.7136   2.5405       12
1.210345 1.668944            4            4.0   2.6391   1.5334       12
2.774795 4.339245            8            8.0   1.3863   4.3803       12
2.276018 1.777242           16           16.0   4.7095   4.8526       13
2.179675 2.083333           32           32.0   2.1972   4.6568       13
1.411963 0.644251           64           64.0   5.0626   5.1554       13
0.836451 0.260938          128          128.0   4.2195   4.6608       13
0.677186 0.517921          256          256.0   2.0794   2.8816       13
0.600932 0.5

In [10]:
val_rmse = 0
val_rmsle = 0
with open('pred.test', 'rb') as R:
    with open('bike.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if n > 16000:
                predicted = float(line.strip())
                y_log = float(example.split('|')[0])
                y = apply_exp(y_log)
                val_rmse += (apply_exp(predicted) - y)**2
                val_rmsle += (predicted - y_log)**2
                holdouts += 1
            
print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)
print 'holdout RMSLE: %0.3f' % ((val_rmsle / holdouts)**0.5)


holdout RMSE: 135.306
holdout RMSLE: 0.845


###Covertype

In [8]:
import os
local_path = os.getcwd()
n_vars = ['var_'+'0'*int(j<10)+str(j) for j in range(54)]
source = 'shuffled_covtype.data'
origin = target_file=local_path+'\\'+source
target = target_file=local_path+'\\'+'covtype.vw'
vw_convert(origin, target, binary_features=list(), fieldnames= n_vars+['covertype'], numeric_features=n_vars,
    target = 'covertype', separator=',', classification=True, multiclass=True, header=False, sparse=False)

In [20]:
params = 'covtype.vw --ect 7 -f multiclass.model -k --cache_file cache_train.vw --passes=2 -l 1.0 --cubic nnn'
results = execute_vw(params)

creating cubic features for triples: nnn 
final_regressor = multiclass.model
Num weight bits = 18
learning rate = 1
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache_train.vw
Reading datafile = covtype.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0        1        1      377
0.000000 0.000000            2            2.0        1        1      377
0.250000 0.500000            4            4.0        2        1      377
0.375000 0.500000            8            8.0        1        2      377
0.437500 0.500000           16           16.0        2        1      231
0.531250 0.625000           32           32.0        1        2      377
0.546875 0.562500           64           64.0        2        1      377
0.500000 0.453125          128          128.0        1        1      377
0.519531 0.539063  

In [21]:
params = '-t covtype.vw -i multiclass.model -k --cache_file cache_test.vw -p covertype.test'
results = execute_vw(params)

creating cubic features for triples: nnn 
only testing
predictions = covertype.test
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = cache_test.vw
Reading datafile = covtype.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0        1        1      377
0.000000 0.000000            2            2.0        1        1      377
0.000000 0.000000            4            4.0        2        2      377
0.000000 0.000000            8            8.0        1        1      377
0.187500 0.375000           16           16.0        1        2      377
0.156250 0.125000           32           32.0        3        3      377
0.156250 0.156250           64           64.0        2        1      377
0.218750 0.281250          128          128.0        2        2      377
0.222656 0.226563          256     

In [8]:
accuracy = 0
with open('covertype.test', 'rb') as R:
    with open('covtype.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if (n+1) % 10==0:
                predicted = float(line.strip())
                y = float(example.split('|')[0])
                accuracy += predicted ==y
                holdouts += 1
print 'holdout accuracy: %0.3f' % (accuracy / holdouts)

holdout accuracy: 0.769
