Multi Target/Output Regression

In [1]:
#check scikit learn version
import sklearn
print(sklearn.__version__)

0.23.1


In [3]:
# example of multioutput regression test problem
from sklearn.datasets import make_regression
# create datasets
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2)
# summarize dataset
print(X, y)

[[-0.80010451  0.96150923  0.78041757 ... -1.18535205  1.74964536
  -0.99781203]
 [-0.97366497  0.49817924 -0.11582605 ... -0.22708001 -0.38461918
  -0.29350746]
 [ 0.06522037 -0.72740964 -0.17305963 ...  1.67720227  0.99231315
  -0.6082635 ]
 ...
 [ 0.23419489  0.55849441 -0.5979833  ...  0.26754105 -0.69902938
  -1.90897215]
 [ 0.40035719 -1.22880419 -0.14764946 ...  0.10849524  0.07273139
   2.41397311]
 [-0.38853852 -1.5223212   0.58391046 ... -1.08326314 -0.58932159
   0.66628631]] [[  -8.5395577   -22.02746201]
 [ -67.03125466  -23.27890206]
 [ -12.2828592    -9.31352121]
 ...
 [  16.53377847   -0.93830318]
 [-112.98157086    5.36213462]
 [ -99.75538684  -71.43242636]]


Linear Regression  

In [4]:
from sklearn.linear_model import LinearRegression

In [6]:
# define model
model = LinearRegression()
# fit model
model.fit(X, y)
# make a prediction
test_data = [[0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303, -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249]]
model.predict(test_data)

array([[ 4.03221859, 18.31832791]])

Decision Tree Regression

In [7]:
from sklearn.tree import DecisionTreeRegressor

In [8]:
# define model
model = DecisionTreeRegressor()
# fit model
model.fit(X, y)
# make a prediction
test_data = [[0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303, -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249]]
model.predict(test_data)

array([[27.32340437, 31.54205611]])

K-Nearest Neighbors

In [9]:
from sklearn.neighbors import KNeighborsRegressor

In [10]:
# define model
model = KNeighborsRegressor()
# fit model
model.fit(X, y)
# make a prediction
test_data = [[0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303, -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249]]
model.predict(test_data)

array([[11.58203011, 19.19999729]])

Random Forest Regressor

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
# define model
model = RandomForestRegressor()
# fit model
model.fit(X, y)
# make a prediction
test_data = [[0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303, -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249]]
model.predict(test_data)

array([[ 8.82125303, 20.83694901]])

Evaluate MOR with Cross Validation

In [26]:
# evaluate multioutput regression model with k-fold cross-validation
from numpy import absolute
from numpy import mean
from numpy import std
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
# create datasets
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2)
# define model
model = LinearRegression()
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
print(n_scores)

MAE: 0.000 (0.000)
[5.66308025e-14 6.07765921e-14 8.69439243e-14 7.17811227e-14
 4.27893138e-14]


Wrapper MOR/MTR
Not all regression algorithms support multioutput regression.
One example is the support vector machine, although for regression, it is referred to as support vector regression, or SVR.

The most obvious way to do this is to split a multioutput regression problem into multiple single-output regression problems.
For example, if a multioutput regression problem required the prediction of three values y1, y2 and y3 given an input X, then this could be partitioned into three single-output regression problems:

    Problem 1: Given X, predict y1.
    Problem 2: Given X, predict y2.
    Problem 3: Given X, predict y3.

There are two main approaches to implement this technique:
Direct Multioutput: Develop an independent model for each numerical value to be predicted.
Chained Multioutput: Develop a sequence of dependent models to match the number of numerical values to be predicted

In [21]:
# example of making a prediction with the direct multioutput regression model
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2)
# define base model
model = LinearSVR()
# define the direct multioutput wrapper model
wrapper = MultiOutputRegressor(model)
# fit the model on the whole dataset
wrapper.fit(X, y)
# make a single prediction
test_data = [[0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303, -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249]]
yhat = wrapper.predict(test_data)
# summarize the prediction
print('Predicted: %s' % yhat[0])

Predicted: [55.15287809 53.52992398]


In [22]:
# example of making a prediction with the chained multioutput regression model
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2)
# define base model
model = LinearSVR()
# define the chained multioutput wrapper model
wrapper = RegressorChain(model)
# fit the model on the whole dataset
wrapper.fit(X, y)
# make a single prediction
test_data = [0.21947749, 0.32948997, 0.81560036, 0.440956, -0.0606303, -0.29257894, -0.2820059, -0.00290545, 0.96402263, 0.04992249]
yhat = wrapper.predict([test_data])
# summarize the prediction
print('Predicted: %s' % yhat[0])

Predicted: [ 58.64400394 -28.9040808 ]


Cross Validation for SVR

In [23]:
# example of evaluating chained multioutput regression with an SVM model
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2)
# define base model
model = LinearSVR()
# define the chained multioutput wrapper model
wrapper = RegressorChain(model)
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(wrapper, X, y, scoring='neg_mean_absolute_error', cv=5)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance
print(n_scores)
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

[3.48890614e-05 8.89819652e-06 2.69749012e-05 1.24397664e-04
 6.66643593e-05]
MAE: 0.000 (0.000)
