# `rf_quantile` demo

Predict conditional quantiles of a variable using the `rf_quantile` function.

## Setup

In [1]:
import synthimpute as si
from sklearn import ensemble
import pandas as pd
import numpy as np

## Generate data

In [2]:
N = 1000
# Two independent standard-normal features, drawn in the order x1 then x2.
x = pd.DataFrame({col: np.random.randn(N) for col in ('x1', 'x2')})
# Construct example relationship: linear in x1, cubic in x2, plus
# standard-normal noise.
y = x['x1'] + np.power(x['x2'], 3) + np.random.randn(N)
# Fit a seeded forest; keeping the fit call as the last expression makes the
# notebook display the fitted estimator.
rf = ensemble.RandomForestRegressor(random_state=3)
rf.fit(x, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=3, verbose=0, warm_start=False)

In [None]:
def rf_impute(x_train, y_train, x_new, x_cols=None, random_state=None):
    """Impute labels from a training set to a new data set using
       random forests quantile regression.

    A random forest is fit on the training data, then a quantile is drawn
    uniformly at random for each row of x_new and passed to
    si.rf_quantile to produce that row's imputed label.

    Args:
        x_train: Training data.
        y_train: Training labels.
        x_new: New x data for which imputed labels are generated.
        x_cols: List of columns to use. When given, both x_train and x_new
            are subset to these columns, so both must support selection by
            column label (e.g. pandas DataFrames). If not provided, uses
            all columns from x_train (these must also be in x_new).
        random_state: Optional random seed passed to RandomForestRegressor and
            for uniform distribution of quantiles.

    Returns:
        Imputed labels for x_new, as returned by si.rf_quantile.
    """
    if x_cols is not None:
        # Restrict both data sets to the same features so the model is
        # trained and applied on identical columns.
        x_train = x_train[x_cols]
        x_new = x_new[x_cols]
    rf = ensemble.RandomForestRegressor(random_state=random_state)
    rf.fit(x_train, y_train)
    if random_state is not None:
        # NOTE: this reseeds NumPy's global random state.
        np.random.seed(random_state)
    # One uniform quantile per row to impute: the count must follow x_new,
    # not the training labels, since the two sets can differ in length.
    quantiles = np.random.rand(len(x_new))
    # Predict on x_new (the argument), not on any notebook-level variable.
    return si.rf_quantile(rf, x_new, quantiles)

In [19]:
# Construct (without fitting) a forest with random_state=10; as the cell's
# only expression, its repr is what the notebook displays.
ensemble.RandomForestRegressor(random_state=10)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=10, verbose=0, warm_start=False)

In [3]:
# Call rf_quantile with the forest fitted above, the feature frame x and
# quantile 0.5, then report how many values come back.
si.rf_quantile(rf, x, 0.5).size

1000