# 3 Fast Learning SVM

## Preparing datasets

In [1]:
import os
print "Current directory is: \"%s\"" % (os.getcwd())

Current directory is: "C:\scisoft\WinPython-64bit-2.7.9.4\notebooks\Packt - Large Scale CHECK"


In [2]:
import urllib2 # import urllib.request as urllib2 in Python3
import requests, io, os, StringIO
import numpy as np
import tarfile, zipfile, gzip

def unzip_from_UCI(UCI_url, dest=''):
    """
    Downloads and unpacks datasets from UCI in zip format
    """
    response = requests.get(UCI_url)
    compressed_file = io.BytesIO(response.content)
    z = zipfile.ZipFile(compressed_file)
    print ('Extracting in %s' %  os.getcwd()+'\\'+dest)
    for name in z.namelist():
        if '.csv' in name:
            print ('\tunzipping %s' %name)
            z.extract(name, path=os.getcwd()+'\\'+dest)

def gzip_from_UCI(UCI_url, dest=''):
    """
    Downloads and unpacks datasets from UCI in gzip format
    """
    response = urllib2.urlopen(UCI_url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    filename = UCI_url.split('/')[-1][:-3]
    with open(os.getcwd()+'\\'+filename, 'wb') as outfile:
        outfile.write(decompressed_file.read())
    print ('File %s decompressed' % filename)
            
def targzip_from_UCI(UCI_url, dest='.'):
    """
    Downloads and unpacks datasets from UCI in tar.gz format
    """
    response = urllib2.urlopen(UCI_url)
    compressed_file = StringIO.StringIO(response.read())
    tar = tarfile.open(mode="r:gz", fileobj = compressed_file)
    tar.extractall(path=dest)
    datasets = tar.getnames()
    for dataset in datasets:
        size = os.path.getsize(dest+'\\'+dataset)
        print ('File %s is %i bytes' % (dataset,size))
    tar.close()

def load_matrix(UCI_url):
    """
    Downloads datasets from UCI in matrix form
    """
    return np.loadtxt(urllib2.urlopen(UCI_url))

###Bike Sharing Dataset Data Set

In [3]:
UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
unzip_from_UCI(UCI_url, dest='bikesharing')

Extracting in C:\scisoft\WinPython-64bit-2.7.9.4\notebooks\Packt - Large Scale CHECK\bikesharing
	unzipping day.csv
	unzipping hour.csv


###Covertype Data Set 

In [4]:
UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
gzip_from_UCI(UCI_url)

File covtype.data decompressed


## Understanding Scikit-learn SVM implementation

In [5]:
from sklearn import datasets
iris = datasets.load_iris()
X_i, y_i = iris.data, iris.target

In [6]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
import numpy as np
h_class = SVC(kernel='rbf', C=1.0, gamma=0.7, random_state=101)
scores = cross_val_score(h_class, X_i, y_i, cv=20, scoring='accuracy')
print 'Accuracy: %0.3f' % np.mean(scores)

Accuracy: 0.969


In [7]:
h_class.fit(X_i,y_i)
print h_class.support_

[ 13  14  15  22  24  41  44  50  52  56  60  62  63  66  68  70  72  76
  77  83  84  85  98 100 106 110 114 117 118 119 121 123 126 127 129 131
 133 134 138 141 146 149]


In [8]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
boston = load_boston()
shuffled = np.random.permutation(boston.target.size)
X_b = scaler.fit_transform(boston.data[shuffled,:])
y_b = boston.target[shuffled]

In [9]:
from sklearn.svm import SVR
from sklearn.cross_validation import cross_val_score
h_regr = SVR(kernel='rbf', C=20.0, gamma=0.001, epsilon=1.0)
scores = cross_val_score(h_regr, X_b, y_b, cv=20, scoring='mean_squared_error')
print 'Mean Squared Error: %0.3f' % abs(np.mean(scores))

Mean Squared Error: 28.218


## Pursuing non linear SVM by sub-sampling

In [10]:
from random import seed, randint
SAMPLE_COUNT = 5000
TEST_COUNT   = 20000
seed(0) # allows repeatable results
sample = list()
test_sample = list()
for index, line in enumerate(open('covtype.data','rb')):
    if index < SAMPLE_COUNT:
        sample.append(line)
    else:
        r = randint(0, index)
        if r < SAMPLE_COUNT:
            sample[r] = line
        else:
            k = randint(0, index)
            if k < TEST_COUNT:
                if len(test_sample) < TEST_COUNT:
                    test_sample.append(line)
                else:
                    test_sample[k] = line

In [11]:
import numpy as np
from sklearn.preprocessing import StandardScaler
for n,line in enumerate(sample):
        sample[n] = map(float,line.strip().split(','))
y = np.array(sample)[:,-1]
scaling = StandardScaler()
X = scaling.fit_transform(np.array(sample)[:,:-1])

In [12]:
for n,line in enumerate(test_sample):
        test_sample[n] = map(float,line.strip().split(','))
yt = np.array(test_sample)[:,-1]
Xt = scaling.transform(np.array(test_sample)[:,:-1])

In [13]:
from sklearn.svm import SVC
h = SVC(kernel='rbf', C=250.0, gamma=0.0025, random_state=101)
h.fit(X,y)
prediction = h.predict(Xt)
from sklearn.metrics import accuracy_score
print accuracy_score(yt, prediction)

0.75205


## Achieving SVM at scale with SGD

In [14]:
import csv, time, os
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix

def explore(target_file, separator=',', fieldnames= None, binary_features=list(), numeric_features=list(), max_rows=20000):
    """
    Generate from an online style stream a DictVectorizer and a MinMaxScaler.
    
    Parameters
    ‐‐‐‐‐‐‐‐‐‐
    target file = the file to stream from
    separator = the field separator character
    fieldnames = the fields' labels (can be ommitted and read from file)
    binary_features = the list of qualitative features to consider
    numeric_features = the list of numeric futures to consider
    max_rows = the number of rows to be read from the stream (can be None)
    """
    features = dict()
    min_max  = dict()
    vectorizer = DictVectorizer(sparse=False)
    scaler = MinMaxScaler()
    with open(target_file, 'rb') as R:
        iterator = csv.DictReader(R, fieldnames, delimiter=separator)
        for n, row in enumerate(iterator):
            # DATA EXPLORATION
            for k,v in row.iteritems():
                if k in binary_features:
                    if k+'_'+v not in features:
                        features[k+'_'+v]=0
                elif k in numeric_features:
                    v = float(v)
                    if k not in features:
                        features[k]=0
                        min_max[k] = [v,v]
                    else:
                        if v < min_max[k][0]:
                            min_max[k][0]= v
                        elif v > min_max[k][1]:
                            min_max[k][1]= v
                else:
                    pass # ignore the feature
            if max_rows and n > max_rows:
                break
    vectorizer.fit([features])
    A = vectorizer.transform([{f:0 if f not in min_max else min_max[f][0] for f in vectorizer.feature_names_},
{f:1 if f not in min_max else min_max[f][1] for f in vectorizer.feature_names_}])
    scaler.fit(A)
    return vectorizer, scaler

In [15]:
def pull_examples(target_file, vectorizer, binary_features, numeric_features, target, min_max=None, separator=',', 
fieldnames=None, sparse=True):
    """
    Reads a online style stream and returns a generator of normalized feature vectors
    
    Parameters
    ‐‐‐‐‐‐‐‐‐‐
    target file = the file to stream from
    vectorizer = a DictVectorizer object
    binary_features = the list of qualitative features to consider
    numeric_features = the list of numeric features to consider
    target = the label of the response variable
    min_max = a MinMaxScaler object, can be omitted leaving None
    separator = the field separator character
    fieldnames = the fields' labels (can be ommitted and read from file)
    sparse = if a sparse vector is to be returned from the generator
    """
    with open(target_file, 'rb') as R:
        iterator = csv.DictReader(R, fieldnames, delimiter=separator)
        for n, row in enumerate(iterator):
            # DATA PROCESSING
            stream_row = {}
            response = np.array([float(row[target])])
            for k,v in row.iteritems():
                if k in binary_features:
                    stream_row[k+'_'+v]=1.0 
                else:
                    if k in numeric_features:
                        stream_row[k]=float(v)
            if min_max:
                features = min_max.transform(vectorizer.transform([stream_row]))
            else:
                features = vectorizer.transform([stream_row])
            if sparse:
                yield(csr_matrix(features), response, n)
            else:
                yield(features, response, n)

In [16]:
source = '\\bikesharing\\hour.csv'
local_path = os.getcwd()
b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']
n_vars = ['hum', 'temp', 'atemp', 'windspeed']
std_row, min_max = explore(target_file=local_path+'\\'+source, binary_features=b_vars, numeric_features=n_vars)
print 'Features: '
for f,mv,mx in zip(std_row.feature_names_, min_max.data_min_, min_max.data_max_):
    print '%s:[%0.2f,%0.2f] ' % (f,mv,mx)

Features: 
atemp:[0.00,1.00] 
holiday_0:[0.00,1.00] 
holiday_1:[0.00,1.00] 
hr_0:[0.00,1.00] 
hr_1:[0.00,1.00] 
hr_10:[0.00,1.00] 
hr_11:[0.00,1.00] 
hr_12:[0.00,1.00] 
hr_13:[0.00,1.00] 
hr_14:[0.00,1.00] 
hr_15:[0.00,1.00] 
hr_16:[0.00,1.00] 
hr_17:[0.00,1.00] 
hr_18:[0.00,1.00] 
hr_19:[0.00,1.00] 
hr_2:[0.00,1.00] 
hr_20:[0.00,1.00] 
hr_21:[0.00,1.00] 
hr_22:[0.00,1.00] 
hr_23:[0.00,1.00] 
hr_3:[0.00,1.00] 
hr_4:[0.00,1.00] 
hr_5:[0.00,1.00] 
hr_6:[0.00,1.00] 
hr_7:[0.00,1.00] 
hr_8:[0.00,1.00] 
hr_9:[0.00,1.00] 
hum:[0.00,1.00] 
mnth_1:[0.00,1.00] 
mnth_10:[0.00,1.00] 
mnth_11:[0.00,1.00] 
mnth_12:[0.00,1.00] 
mnth_2:[0.00,1.00] 
mnth_3:[0.00,1.00] 
mnth_4:[0.00,1.00] 
mnth_5:[0.00,1.00] 
mnth_6:[0.00,1.00] 
mnth_7:[0.00,1.00] 
mnth_8:[0.00,1.00] 
mnth_9:[0.00,1.00] 
season_1:[0.00,1.00] 
season_2:[0.00,1.00] 
season_3:[0.00,1.00] 
season_4:[0.00,1.00] 
temp:[0.02,1.00] 
weathersit_1:[0.00,1.00] 
weathersit_2:[0.00,1.00] 
weathersit_3:[0.00,1.00] 
weathersit_4:[0.00,1.00] 
weekday_

In [17]:
from sklearn.linear_model import SGDRegressor
SGD = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001, penalty=None, random_state=1, average=True)
val_rmse = 0
val_rmsle = 0
predictions_start = 16000

def apply_log(x): return np.log(x + 1.0)
def apply_exp(x): return np.exp(x) - 1.0

for x,y,n in pull_examples(target_file=local_path+'\\'+source, 
                           vectorizer=std_row, min_max=min_max,
                           binary_features=b_vars, numeric_features=n_vars, target='cnt'):
    y_log = apply_log(y)
    # MACHINE LEARNING
    if (n+1) >= predictions_start:
        # HOLDOUT AFTER N PHASE
        predicted = SGD.predict(x)
        val_rmse += (apply_exp(predicted) - y)**2
        val_rmsle += (predicted - y_log)**2
        if (n-predictions_start+1) % 250 == 0 and (n+1) > predictions_start:
            print n,
            print '%s holdout RMSE: %0.3f' % (time.strftime('%X'), (val_rmse / float(n-predictions_start+1))**0.5),
            print 'holdout RMSLE: %0.3f' % ((val_rmsle / float(n-predictions_start+1))**0.5)
    else:
        # LEARNING PHASE
        SGD.partial_fit(x, y_log)
print '%s FINAL holdout RMSE: %0.3f' % (time.strftime('%X'), (val_rmse / float(n-predictions_start+1))**0.5)
print '%s FINAL holdout RMSLE: %0.3f' % (time.strftime('%X'), (val_rmsle / float(n-predictions_start+1))**0.5)

16249 09:42:40 holdout RMSE: 276.604 holdout RMSLE: 1.796
16499 09:42:40 holdout RMSE: 250.419 holdout RMSLE: 1.706
16749 09:42:41 holdout RMSE: 250.639 holdout RMSLE: 1.694
16999 09:42:41 holdout RMSE: 249.561 holdout RMSLE: 1.702
17249 09:42:41 holdout RMSE: 234.840 holdout RMSLE: 1.640
09:42:41 FINAL holdout RMSE: 224.404
09:42:41 FINAL holdout RMSLE: 1.594


In [18]:
source = 'shuffled_covtype.data'
local_path = os.getcwd()
n_vars = ['var_'+'0'*int(j<10)+str(j) for j in range(54)]
std_row, min_max = explore(target_file=local_path+'\\'+source, binary_features=list(), 
                  fieldnames= n_vars+['covertype'], numeric_features=n_vars, max_rows=50000)
print 'Features: '
for f,mv,mx in zip(std_row.feature_names_, min_max.data_min_, min_max.data_max_):
    print '%s:[%0.2f,%0.2f] ' % (f,mv,mx)

Features: 
var_00:[1872.00,3846.00] 
var_01:[0.00,360.00] 
var_02:[0.00,55.00] 
var_03:[0.00,1369.00] 
var_04:[-163.00,599.00] 
var_05:[0.00,7117.00] 
var_06:[0.00,254.00] 
var_07:[78.00,254.00] 
var_08:[0.00,252.00] 
var_09:[0.00,7172.00] 
var_10:[0.00,1.00] 
var_11:[0.00,1.00] 
var_12:[0.00,1.00] 
var_13:[0.00,1.00] 
var_14:[0.00,1.00] 
var_15:[0.00,1.00] 
var_16:[0.00,1.00] 
var_17:[0.00,1.00] 
var_18:[0.00,1.00] 
var_19:[0.00,1.00] 
var_20:[0.00,1.00] 
var_21:[0.00,1.00] 
var_22:[0.00,1.00] 
var_23:[0.00,1.00] 
var_24:[0.00,1.00] 
var_25:[0.00,1.00] 
var_26:[0.00,1.00] 
var_27:[0.00,1.00] 
var_28:[0.00,0.00] 
var_29:[0.00,1.00] 
var_30:[0.00,1.00] 
var_31:[0.00,1.00] 
var_32:[0.00,1.00] 
var_33:[0.00,1.00] 
var_34:[0.00,1.00] 
var_35:[0.00,1.00] 
var_36:[0.00,1.00] 
var_37:[0.00,1.00] 
var_38:[0.00,1.00] 
var_39:[0.00,1.00] 
var_40:[0.00,1.00] 
var_41:[0.00,1.00] 
var_42:[0.00,1.00] 
var_43:[0.00,1.00] 
var_44:[0.00,1.00] 
var_45:[0.00,1.00] 
var_46:[0.00,1.00] 
var_47:[0.00,1.00] 

In [19]:
from sklearn.linear_model import SGDClassifier
SGD = SGDClassifier(loss='hinge', penalty=None, random_state=1, average=True)
accuracy = 0
accuracy_record = list()
predictions_start = 50
sample = 5000
early_stop = 50000
for x,y,n in pull_examples(target_file=local_path+'\\'+source, 
                           vectorizer=std_row,
                           min_max=min_max,
                           binary_features=list(), numeric_features=n_vars,
                           fieldnames= n_vars+['covertype'], target='covertype'):
    # LEARNING PHASE
    if n > predictions_start:
        accuracy += int(int(SGD.predict(x))==y[0])
        if n % sample == 0:
            accuracy_record.append(accuracy / float(sample))
            print '%s Progressive accuracy at example %i: %0.3f' % (time.strftime('%X'), n, np.mean(accuracy_record[-sample:]))
            accuracy = 0
    if early_stop and n >= early_stop:
            break
    SGD.partial_fit(x, y, classes=range(1,8))

09:42:57 Progressive accuracy at example 5000: 0.652
09:43:08 Progressive accuracy at example 10000: 0.672
09:43:18 Progressive accuracy at example 15000: 0.681
09:43:29 Progressive accuracy at example 20000: 0.687
09:43:39 Progressive accuracy at example 25000: 0.692
09:43:50 Progressive accuracy at example 30000: 0.695
09:44:01 Progressive accuracy at example 35000: 0.697
09:44:11 Progressive accuracy at example 40000: 0.699
09:44:22 Progressive accuracy at example 45000: 0.700
09:44:32 Progressive accuracy at example 50000: 0.702


## Including non-linearities in SGD

In [20]:
from sklearn.linear_model import SGDRegressor
from  sklearn.preprocessing import PolynomialFeatures

source = '\\bikesharing\\hour.csv'
local_path = os.getcwd()
b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']
n_vars = ['hum', 'temp', 'atemp', 'windspeed']
std_row, min_max = explore(target_file=local_path+'\\'+source, binary_features=b_vars, numeric_features=n_vars)

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
SGD = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001, penalty=None, random_state=1, average=True)

val_rmse = 0
val_rmsle = 0
predictions_start = 16000

def apply_log(x): return np.log(x + 1.0)
def apply_exp(x): return np.exp(x) - 1.0

for x,y,n in pull_examples(target_file=local_path+'\\'+source, 
                           vectorizer=std_row, min_max=min_max, sparse = False,
                           binary_features=b_vars, numeric_features=n_vars, target='cnt'):
    y_log = apply_log(y)
    # Extract only quantitative features and expand them
    num_index = [j for j, i in enumerate(std_row.feature_names_) if i in n_vars]
    x_poly = poly.fit_transform(x[:,num_index])[:,len(num_index):]
    new_x = np.concatenate((x, x_poly), axis=1)
    
    # MACHINE LEARNING
    if (n+1) >= predictions_start:
        # HOLDOUT AFTER N PHASE
        predicted = SGD.predict(new_x)
        val_rmse += (apply_exp(predicted) - y)**2
        val_rmsle += (predicted - y_log)**2
        if (n-predictions_start+1) % 250 == 0 and (n+1) > predictions_start:
            print n,
            print '%s holdout RMSE: %0.3f' % (time.strftime('%X'), (val_rmse / float(n-predictions_start+1))**0.5),
            print 'holdout RMSLE: %0.3f' % ((val_rmsle / float(n-predictions_start+1))**0.5)
    else:
        # LEARNING PHASE
        SGD.partial_fit(new_x, y_log)
print '%s FINAL holdout RMSE: %0.3f' % (time.strftime('%X'), (val_rmse / float(n-predictions_start+1))**0.5)
print '%s FINAL holdout RMSLE: %0.3f' % (time.strftime('%X'), (val_rmsle / float(n-predictions_start+1))**0.5)

16249 09:44:45 holdout RMSE: 269.222 holdout RMSLE: 1.627
16499 09:44:45 holdout RMSE: 243.626 holdout RMSLE: 1.550
16749 09:44:45 holdout RMSE: 244.336 holdout RMSLE: 1.554
16999 09:44:45 holdout RMSE: 243.608 holdout RMSLE: 1.570
17249 09:44:45 holdout RMSE: 229.319 holdout RMSLE: 1.519
09:44:45 FINAL holdout RMSE: 219.191
09:44:45 FINAL holdout RMSLE: 1.480


## Trying explicit high dimensional mappings

In [21]:
source = 'shuffled_covtype.data'
local_path = os.getcwd()
n_vars = ['var_'+str(j) for j in range(54)]
std_row, min_max = explore(target_file=local_path+'\\'+source, binary_features=list(), 
                  fieldnames= n_vars+['covertype'], numeric_features=n_vars, max_rows=50000)

from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler

SGD = SGDClassifier(loss='hinge', penalty=None, random_state=1, average=True)
rbf_feature = RBFSampler(gamma=0.5, n_components=300, random_state=0)
accuracy = 0
accuracy_record = list()
predictions_start = 50
sample = 5000
early_stop = 50000
for x,y,n in pull_examples(target_file=local_path+'\\'+source, 
                           vectorizer=std_row,
                           min_max=min_max,
                           binary_features=list(),
                           numeric_features=n_vars, 
                           fieldnames= n_vars+['covertype'], target='covertype', sparse=False):
    rbf_x = rbf_feature.fit_transform(x)
    # LEARNING PHASE
    if n > predictions_start:
        accuracy += int(int(SGD.predict(rbf_x))==y[0])
        if n % sample == 0:
            accuracy_record.append(accuracy / float(sample))
            print '%s Progressive accuracy at example %i: %0.3f' % (time.strftime('%X'), \
               n, np.mean(accuracy_record[-sample:]))
            accuracy = 0
    if early_stop and n >= early_stop:
            break
    SGD.partial_fit(rbf_x, y, classes=range(1,8))

09:45:07 Progressive accuracy at example 5000: 0.621
09:45:24 Progressive accuracy at example 10000: 0.653
09:45:41 Progressive accuracy at example 15000: 0.670
09:45:59 Progressive accuracy at example 20000: 0.681
09:46:17 Progressive accuracy at example 25000: 0.689
09:46:35 Progressive accuracy at example 30000: 0.695
09:46:53 Progressive accuracy at example 35000: 0.699
09:47:12 Progressive accuracy at example 40000: 0.704
09:47:30 Progressive accuracy at example 45000: 0.706
09:47:49 Progressive accuracy at example 50000: 0.709


## Hyperparameters tuning

In [22]:
from sklearn.linear_model import SGDRegressor
from sklearn.grid_search import ParameterSampler

source = '\\bikesharing\\hour.csv'
local_path = os.getcwd()
b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']
n_vars = ['hum', 'temp', 'atemp', 'windspeed']
std_row, min_max = explore(target_file=local_path+'\\'+source, binary_features=b_vars, numeric_features=n_vars)

val_rmse = 0
val_rmsle = 0
predictions_start = 16000
tmp_rsmle = 10**6

def apply_log(x): return np.log(x + 1.0)
def apply_exp(x): return np.exp(x) - 1.0

param_grid = {'penalty':['l1', 'l2'], 'alpha': 10.0**-np.arange(2,5)}
random_tests = 3
search_schedule = list(ParameterSampler(param_grid, n_iter=random_tests, random_state=5))
results = dict()

for search in search_schedule:
    SGD = SGDRegressor(loss='epsilon_insensitive', epsilon=0.001, penalty=None, random_state=1, average=True)
    params =SGD.get_params()
    new_params = {p:params[p] if p not in search else search[p] for p in params}
    SGD.set_params(**new_params)
    print str(search)[1:-1]
    for iterations in range(200):
        for x,y,n in pull_examples(target_file=local_path+'\\'+source, 
                                   vectorizer=std_row, min_max=min_max, sparse = False,
                                   binary_features=b_vars, numeric_features=n_vars, target='cnt'):
            y_log = apply_log(y)

            # MACHINE LEARNING
            if (n+1) >= predictions_start:
                # HOLDOUT AFTER N PHASE
                predicted = SGD.predict(x)
                val_rmse += (apply_exp(predicted) - y)**2
                val_rmsle += (predicted - y_log)**2
            else:
                # LEARNING PHASE
                SGD.partial_fit(x, y_log)

        examples = float(n-predictions_start+1) * (iterations+1)
        print_rmse = (val_rmse / examples)**0.5
        print_rmsle = (val_rmsle / examples)**0.5
        if iterations == 0:
            print 'Iteration %i - RMSE: %0.3f - RMSE: %0.3f' % (iterations+1, print_rmse, print_rmsle)
        if iterations > 0:
            if tmp_rmsle / print_rmsle <= 1.01:
                print 'Iteration %i - RMSE: %0.3f - RMSE: %0.3f\n' % (iterations+1, print_rmse, print_rmsle)
                results[str(search)]= {'rmse':float(print_rmse), 'rmsle':float(print_rmsle)}
                break
        tmp_rmsle = print_rmsle

'penalty': 'l2', 'alpha': 0.001
Iteration 1 - RMSE: 216.170 - RMSE: 1.440
Iteration 20 - RMSE: 152.175 - RMSE: 0.857

'penalty': 'l2', 'alpha': 0.0001
Iteration 1 - RMSE: 714.071 - RMSE: 4.096
Iteration 31 - RMSE: 184.677 - RMSE: 1.053

'penalty': 'l1', 'alpha': 0.01
Iteration 1 - RMSE: 1050.809 - RMSE: 6.044
Iteration 36 - RMSE: 225.036 - RMSE: 1.298



## Other alternatives for SVM fast learning

###Useful dataset examples

In [23]:
with open('house_dataset','wb') as W:
    W.write("0 | price:.23 sqft:.25 age:.05 2006\n")
    W.write("1 2 'second_house | price:.18 sqft:.15 age:.35 1976\n")
    W.write("0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\n")

with open('house_dataset','rb') as R:
    for line in R:
        print line.strip()

0 | price:.23 sqft:.25 age:.05 2006
1 2 'second_house | price:.18 sqft:.15 age:.35 1976
0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924


###A way to call VW from Python

In [24]:
import subprocess

def execute_vw(parameters):
    execution = subprocess.Popen('vw '+parameters, \
       shell=True, stderr=subprocess.PIPE)
    line = ""
    history = ""
    while True:
        out = execution.stderr.read(1)
        history += out
        if out == '' and execution.poll() != None:
            print '------------ COMPLETED ------------\n'
            break
        if out != '':
            line += out
            if '\n' in line[-2:]:
                print line[:-2]
                line = ''
    return history.split('\r\n')

In [25]:
params = "house_dataset"
results = execute_vw(params)

Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = house_dataset
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0   0.0000   0.0000        5
0.666667 1.000000            2            3.0   1.0000   0.0000        5

finished run
number of examples per pass = 3
passes used = 1
weighted example sum = 4.000000
weighted label sum = 2.000000
average loss = 0.750000
best constant = 0.500000
best constant's loss = 0.250000
total feature number = 15
------------ COMPLETED ------------



###Processing examples

In [26]:
import csv

def vw_convert(origin_file, target_file, binary_features, numeric_features, target, transform_target=lambda(x):x,
               separator=',', classification=True, multiclass=False, fieldnames= None, header=True, sparse=True):
    """
    Reads a online style stream and returns a generator of normalized feature vectors
    
    Parameters
    ‐‐‐‐‐‐‐‐‐‐
    original_file = the csv file you are taken the data from 
    target file = the file to stream from
    binary_features = the list of qualitative features to consider
    numeric_features = the list of numeric features to consider
    target = the label of the response variable
    transform_target = a function transforming the response
    separator = the field separator character
    classification = a Boolean indicating if it is classification
    multiclass =  a Boolean for multiclass classification
    fieldnames = the fields' labels (can be ommitted and read from file)
    header = a boolean indicating if the original file has an header
    sparse = if a sparse vector is to be returned from the generator
    """
    with open(target_file, 'wb') as W:
        with open(origin_file, 'rb') as R:
            iterator = csv.DictReader(R, fieldnames, delimiter=separator)
            for n, row in enumerate(iterator):
                if not header or n>0:
                # DATA PROCESSING
                    response = transform_target(float(row[target]))
                    if classification and not multiclass:
                            if response == 0:
                                stream_row = '-1 '
                            else:
                                stream_row = '1 '
                    else:
                        stream_row = str(response)+' '
                    quantitative = list()
                    qualitative  = list()
                    for k,v in row.iteritems():
                        if k in binary_features:
                            qualitative.append(str(k)+'_'+str(v)+':1')
                        else:
                            if k in numeric_features and (float(v)!=0 or not sparse):
                                quantitative.append(str(k)+':'+str(v))
                    if quantitative:
                        stream_row += '|n '+' '.join(quantitative)
                    if qualitative:
                        stream_row += '|q ' + ' '.join(qualitative)
                    W.write(stream_row+'\n')

###Examples with toys datasets

In [27]:
import numpy as np
from sklearn.datasets import load_iris, load_boston
from random import seed
iris = load_iris()
seed(2)
re_order = np.random.permutation(len(iris.target))
with open('iris_versicolor.vw','wb') as W1:
    for k in re_order:
        y = iris.target[k]
        X = iris.values()[1][k,:]
        features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])
        target = '1' if y==1 else '-1'
        W1.write(target+features+'\n')

In [28]:
boston = load_boston()
seed(2)
re_order = np.random.permutation(len(boston.target))
with open('boston.vw','wb') as W1:
     for k in re_order:
        y = boston.target[k]
        X = boston.data[k,:]
        features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])
        W1.write(str(y)+features+'\n')

###Binary Iris

In [29]:
params = '--ksvm --l2 0.000001 --reprocess 2 -b 18 --kernel rbf --bandwidth=0.1 -p iris_bin.test -d iris_versicolor.vw'
results = execute_vw(params)

accuracy = 0
with open('iris_bin.test', 'rb') as R:
    with open('iris_versicolor.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if (n+1) % 10==0:
                predicted = float(line.strip())
                y = float(example.split('|')[0])
                accuracy += np.sign(predicted)==np.sign(y)
                holdouts += 1            
print 'holdout accuracy: %0.3f' % ((accuracy / holdouts)**0.5)

using l2 regularization = 1e-006
predictions = iris_bin.test
Lambda = 1e-006
Kernel = rbf
bandwidth = 0.1
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = iris_versicolor.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0  -1.0000   0.0000        5
0.502494 0.004988            2            2.0  -1.0000  -0.9950        5
0.498283 0.494072            4            4.0  -1.0000  -0.9872        5
0.942084 1.385885            8            8.0   1.0000   0.2481        5
0.651197 0.360311           16           16.0  -1.0000  -0.2290        5
0.518579 0.385960           32           32.0   1.0000   0.0946        5
0.428142 0.337705           64           64.0   1.0000   2.0354        5
0.321869 0.215597          128          128.0  -1.0000  -2.1214        5

finished run
number of ex

###Boston dataset

In [30]:
params = 'boston.vw -f boston.model --loss_function squared -k --cache_file cache_train.vw --passes=20 --nn 5 --dropout'
results = execute_vw(params)
params = '-t boston.vw -i boston.model -k --cache_file cache_test.vw -p boston.test'
results = execute_vw(params)
val_rmse = 0
with open('boston.test', 'rb') as R:
    with open('boston.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if (n+1) % 10==0:
                predicted = float(line.strip())
                y = float(example.split('|')[0])
                val_rmse += (predicted - y)**2
                holdouts += 1            
print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)

final_regressor = boston.model
using dropout for neural network training
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache_train.vw
Reading datafile = boston.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
416.159973 416.159973            1            1.0  20.4000   0.0000        3
468.479584 520.799194            2            2.0  24.1000   1.2790        4
368.088074 267.696564            4            4.0  18.9000   3.6200        4
371.617607 375.147141            8            8.0  19.8000   7.9455        3
303.415839 235.214072           16           16.0  20.6000   7.1648        3
261.152876 218.889912           32           32.0  19.6000  12.2380        4
237.683406 214.213937           64           64.0  21.7000  12.8101        4
166.570917 95.458427          128          128.0  21.4000  14.5432 

## Faster bikesharing

In [31]:
import os
import numpy as np

def apply_log(x): 
    return np.log(x + 1.0)

def apply_exp(x): 
    return np.exp(x) - 1.0

local_path = os.getcwd()
b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']
n_vars = ['hum', 'temp', 'atemp', 'windspeed']
source = '\\bikesharing\\hour.csv'
origin = target_file=local_path+'\\'+source
target = target_file=local_path+'\\'+'bike.vw'
vw_convert(origin, target, binary_features=b_vars, numeric_features=n_vars, target = 'cnt', transform_target=apply_log,
               separator=',', classification=False, multiclass=False, fieldnames= None, header=True)

In [32]:
params = 'bike.vw -f regression.model -k --cache_file cache_train.vw --passes=100 --hash strings --holdout_after 16000'
results = execute_vw(params)

final_regressor = regression.model
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache_train.vw
Reading datafile = bike.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
13.790617 13.790617            1            1.0   3.7136   0.0000       12
9.376448 4.962280            2            2.0   3.4965   1.2689       12
5.311779 1.247110            4            4.0   0.6931   2.1128       12
2.844452 0.377125            8            8.0   2.1972   1.4237       12
2.372758 1.901063           16           16.0   4.5433   3.9948       13
1.423223 0.473688           32           32.0   3.0445   1.8111       13
1.383102 1.342980           64           64.0   4.7095   3.7816       12
1.387249 1.391397          128          128.0   4.2627   4.1522       13
1.165221 0.943192          256          256.0   1.9459   3.1

In [33]:
params = '-t bike.vw -i regression.model -k --cache_file cache_test.vw -p pred.test'
results = execute_vw(params)
val_rmse = 0
val_rmsle = 0
with open('pred.test', 'rb') as R:
    with open('bike.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if n > 16000:
                predicted = float(line.strip())
                y_log = float(example.split('|')[0])
                y = apply_exp(y_log)
                val_rmse += (apply_exp(predicted) - y)**2
                val_rmsle += (predicted - y_log)**2
                holdouts += 1
            
print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)
print 'holdout RMSLE: %0.3f' % ((val_rmsle / holdouts)**0.5)

only testing
predictions = pred.test
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = cache_test.vw
Reading datafile = bike.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.481086 1.481086            1            1.0   3.7136   2.4966       12
1.857857 2.234629            2            2.0   3.4965   2.0016       12
1.327467 0.797078            4            4.0   0.6931   1.2289       12
3.693974 6.060481            8            8.0   2.1972   5.0060       12
2.206792 0.719610           16           16.0   4.5433   5.0346       13
2.154553 2.102313           32           32.0   3.0445   4.6435       13
1.337130 0.519708           64           64.0   4.7095   4.7743       12
0.789608 0.242085          128          128.0   4.2627   4.5554       13
0.642582 0.495556          256          256.0   1.9459   2.2025       13
0.572553 0.5

## Covertype dataset crunched by VW

In [34]:
import os
local_path = os.getcwd()
n_vars = ['var_'+'0'*int(j<10)+str(j) for j in range(54)]
source = 'shuffled_covtype.data'
origin = target_file=local_path+'\\'+source
target = target_file=local_path+'\\'+'covtype.vw'
vw_convert(origin, target, binary_features=list(), fieldnames= n_vars+['covertype'], numeric_features=n_vars,
    target = 'covertype', separator=',', classification=True, multiclass=True, header=False, sparse=False)
params = 'covtype.vw --ect 7 -f multiclass.model -k --cache_file cache_train.vw --passes=2 -l 1.0 --cubic nnn'
results = execute_vw(params)

creating cubic features for triples: nnn 
final_regressor = multiclass.model
Num weight bits = 18
learning rate = 1
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache_train.vw
Reading datafile = covtype.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        2        1      377
1.000000 1.000000            2            2.0        1        2      377
0.750000 0.500000            4            4.0        2        2      377
0.750000 0.750000            8            8.0        6        1      377
0.687500 0.625000           16           16.0        2        2      377
0.531250 0.375000           32           32.0        2        3      377
0.484375 0.437500           64           64.0        1        1      377
0.468750 0.453125          128          128.0        2        1      377
0.464844 0.460938  

In [35]:
params = '-t covtype.vw -i multiclass.model -k --cache_file cache_test.vw -p covertype.test'
results = execute_vw(params)
accuracy = 0
with open('covertype.test', 'rb') as R:
    with open('covtype.vw', 'rb') as TRAIN:
        holdouts = 0.0
        for n,(line, example) in enumerate(zip(R,TRAIN)):
            if (n+1) % 10==0:
                predicted = float(line.strip())
                y = float(example.split('|')[0])
                accuracy += predicted ==y
                holdouts += 1
print 'holdout accuracy: %0.3f' % (accuracy / holdouts)

creating cubic features for triples: nnn 
only testing
predictions = covertype.test
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = cache_test.vw
Reading datafile = covtype.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0        2        2      377
0.500000 1.000000            2            2.0        1        2      377
0.250000 0.000000            4            4.0        2        2      377
0.125000 0.000000            8            8.0        6        6      377
0.250000 0.375000           16           16.0        2        1      377
0.187500 0.125000           32           32.0        1        2      377
0.265625 0.343750           64           64.0        1        1      377
0.226563 0.187500          128          128.0        2        2      377
0.265625 0.304688          256     