In [67]:
import pandas as pd
import numpy as np
import sys
import math
import os
from sklearn import datasets
from statwolfml import Builder
from statwolfml.builder import ExternalModel

In [68]:
from modules.local_linear_forest import LocalLinearForestRegressor, TestLiuk

In [69]:
from statwolfml.models.base_model import SupervisedModel

In [70]:
DATASET = 'syntetic-uni' # {'boston', 'syntetic-multi', 'syntetic-uni'}

In [71]:
def get_function_value(x):
    '''
    Equation 8 of LLF paper
    '''
    total = 0
    first = math.pow(20*(x['1'] - 0.5), 3) 
    total += first
    if num_features < 2: return total
    second = 0
    for j in range(2, 3 if num_features == 2 else 4):
        second += 10*x['{}'.format(j)]
    total += second
    if num_features < 4: return total
    third = 0
    for j in range(4, 5 if num_features == 4 else 6):
        third += 5*x['{}'.format(j)]
    total += third
    if num_features < 6: return total
    four = 0
    for j in range(6, num_features + 1):
        four += 2*x['{}'.format(j)]
    total += four
    return total

In [78]:
if DATASET == 'boston':    
    # Loading some example data
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    features = boston['feature_names']
    dataset = pd.DataFrame(X, columns=features)
    dataset['target'] = y
    label = 'target'
elif DATASET == 'syntetic-multi':
    num_features = 5
    num_samples = 1000
    X = pd.DataFrame(columns=[str(i+1) for i in range(0, num_features)])
    features = X.columns.values
    count = 0
    for i in range(0, num_samples):
        X.loc[count] = np.random.uniform(0, 1, num_features)
        count += 1
    X['y'] = None
    for index, x in X.iterrows():
        X.loc[index, 'y'] = get_function_value(x)
    label = 'y'
    dataset = X
elif DATASET == 'syntetic-uni':
    features = ['X']
    label = 'y'
    dataset = pd.DataFrame(np.arange(-1, +1, 0.005), columns=features)
    dataset[label] = None
    for index, row in dataset.iterrows():
        dataset.loc[index, label] = math.log(1+math.exp(6*row['X']))
else:
    print('## WARNING ##')
    
print('DATASET: {} \n features: {}, label: {}'.format(DATASET, features, label))

DATASET: syntetic-uni 
 features: ['X'], label: y


In [79]:
for_plot = dataset[[label]].copy()

In [80]:
for_plot.sort_values(label, inplace=True)

In [81]:
for_plot.reset_index(inplace=True)

In [82]:
from data_science.output import Output

In [83]:
Output(for_plot).build_plot().scatter().x(is_index=True).y(column=label).plot()

In [84]:
from data_science.modules import cross_validation

In [85]:
LLF = LocalLinearForestRegressor(max_depth=2, random_state=0, n_estimators=100)

In [86]:
TEST = TestLiuk(max_depth=2, random_state=0, n_estimators=100)

In [87]:
# statwolfml models
lasso = Builder('lasso').feature_names(features).labels([label]).build()
rf = Builder('random_forest_regressor').feature_names(features).labels([label]).build()
llf = ExternalModel(SupervisedModel(features, [label], LLF))
test = ExternalModel(SupervisedModel(features, [label], TEST))

In [88]:
models = {
    'lasso': lasso,
    'random_forest': rf,
    'llf': llf,
    'test': test
}
cv = cross_validation.create().kFold(dataframe=dataset, cv_type='reg', models=models, n_splits=5, shuffle=False, stratified=None)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

In [89]:
cv.build_plot().plot()

In [90]:
cv.meta()

{'n_splits': 5,
 'cv_type': 'reg',
 'labels': ['y'],
 'models': ['lasso', 'random_forest', 'llf', 'test'],
 'scoring': {'folds': {0: {'lasso': {'scoring': {'mean_absolute_error': 2.023809465274772,
      'mean_squared_error': 4.0958492847840064,
      'root_mean_squared_error': 2.0238204675276923,
      'r2': -91972.25236079948}},
    'random_forest': {'scoring': {'mean_absolute_error': 0.017251745577627098,
      'mean_squared_error': 0.0003421557737218175,
      'root_mean_squared_error': 0.018497453168526135,
      'r2': -6.683187816536282}},
    'llf': {'scoring': {'mean_absolute_error': 1.39867032134413,
      'mean_squared_error': 2.036332932853372,
      'root_mean_squared_error': 1.4270013780138309,
      'r2': -45725.3316352244}},
    'test': {'scoring': {'mean_absolute_error': 5.079133967654263,
      'mean_squared_error': 26.614066167457416,
      'root_mean_squared_error': 5.158882259507132,
      'r2': -597624.0720601551}}},
   1: {'lasso': {'scoring': {'mean_absolute_erro

In [114]:
cv_out = cv.dataframe()

In [115]:
cv_out_1 = cv_out.query('type == "pred"')

In [116]:
cv_out_2 = cv_out.query('type == "real"')

In [117]:
cv_out_2.head()

Unnamed: 0,lasso_y,random_forest_y,llf_y,test_y,type,fold
80,0.00247569,0.00247569,0.00247569,0.00247569,real,0
81,0.00255098,0.00255098,0.00255098,0.00255098,real,0
82,0.00262857,0.00262857,0.00262857,0.00262857,real,0
83,0.00270852,0.00270852,0.00270852,0.00270852,real,0
84,0.00279089,0.00279089,0.00279089,0.00279089,real,0


In [118]:
cv_out_1['real'] = cv_out_2['test_y'].values



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [119]:
cv_out_1.sort_values('real', inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [123]:
cv_out_1.reset_index(inplace=True)

In [124]:
cv_out_1

Unnamed: 0,index,lasso_y,random_forest_y,llf_y,test_y,type,fold,real
0,0,2.03393,0.0273754,-1.88344,-6.62555,pred,0,0.00247569
1,1,2.03393,0.0273754,-1.87091,-6.58614,pred,0,0.00255098
2,2,2.03393,0.0273754,-1.85839,-6.54674,pred,0,0.00262857
3,3,2.03393,0.0273754,-1.84586,-6.50733,pred,0,0.00270852
4,4,2.03393,0.0273754,-1.83333,-6.46793,pred,0,0.00279089
5,5,2.03393,0.0273754,-1.8208,-6.42852,pred,0,0.00287576
6,6,2.03393,0.0273754,-1.80827,-6.38911,pred,0,0.00296321
7,7,2.03393,0.0273754,-1.79574,-6.34971,pred,0,0.00305332
8,8,2.03393,0.0273754,-1.78321,-6.3103,pred,0,0.00314616
9,9,2.03393,0.0273754,-1.77068,-6.2709,pred,0,0.00324182


In [125]:
pb = Output(cv_out_1).build_plot()
pb.scatter().x(is_index=True).y(column='real').settings(settings={'name': 'real'})
pb.scatter().x(is_index=True).y(column='random_forest_y').settings(settings={'name': 'RF'})
pb.scatter().x(is_index=True).y(column='llf_y').settings(settings={'name': 'LLF'})
pb.apply_palette().plot()