# PyTorch: Tabular Regression

*Predicting House Prices Using Municipal Data.*

![houses](../images/houses.png)

In [2]:
import aiqc
from aiqc import datum

---

## Example Data

This dataset is composed of:
    
* *Features* = municipal (crime, tax rate, pollution, education) data about neighborhoods.
* *Label* = median home price in that neighborhood.

Reference [Example Datasets](example_datasets.ipynb) for more information.

In [3]:
# Fetch the 'houses.csv' example dataset through aiqc's `datum` helper as a pandas DataFrame.
df = datum.to_pandas('houses.csv')

In [4]:
# Preview the first rows to check the column names and value ranges.
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


---

## Pipeline

Reference [High-Level API Docs](api_high_level.ipynb) for more information including how to work with non-tabular data.

In [5]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, RobustScaler, PowerTransformer

In [6]:
splitset = aiqc.Pipeline.Tabular(
    # Where the data comes from.
    df_or_path=df,
    dtype=None,

    # How the label column is prepared.
    label_column='price',
    label_interpolater=None,
    label_encoder=dict(sklearn_preprocess=StandardScaler(copy=False)),

    # How the feature columns are prepared.
    feature_cols_excluded='price',
    feature_interpolaters=None,
    feature_window=None,
    feature_encoders=[
        # Each encoder claims the columns whose dtype matches its filter;
        # columns left over are offered to the next encoder in the list.
        dict(dtypes=['float64'], sklearn_preprocess=RobustScaler(copy=False)),
        dict(dtypes=['int64'], sklearn_preprocess=OrdinalEncoder()),
    ],
    feature_reshape_indices=None,

    # Split sizes and the bin count used for stratification.
    size_test=0.12,
    size_validation=0.22,
    bin_count=4,
)


___/ featurecoder_index: 0 \_________

=> The column(s) below matched your filter(s) featurecoder filters.

['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'ptratio', 'lstat']

=> The remaining column(s) and dtype(s) are available for downstream featurecoder(s):
{'chas': 'int64', 'rad': 'int64', 'tax': 'int64'}


___/ featurecoder_index: 1 \_________

=> The column(s) below matched your filter(s) featurecoder filters.

['chas', 'rad', 'tax']

=> Done. All feature column(s) have featurecoder(s) associated with them.
No more FeatureCoders can be added to this Encoderset.



---

## Modeling

In [7]:
import torch
import torch.nn as nn
from torch import optim
import torchmetrics
from aiqc.utils.pytorch import fit

In [8]:
def fn_build(features_shape, label_shape, **hp):
    """Build a feed-forward network with two hidden blocks for regression.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.4),
    followed by a final Linear output layer with no activation.

    Args:
        features_shape: indexable; its first entry is used as the input
            width of the first Linear layer.
        label_shape: indexable; its first entry is used as the output
            width of the last Linear layer.
        **hp: hyperparameters. Must contain `neuron_count`, the width of
            both hidden layers.

    Returns:
        An `nn.Sequential` model.

    Raises:
        KeyError: if `neuron_count` is missing from `hp`.
    """
    # Just giving hyperparameter a shorter reference.
    nc = hp['neuron_count']

    model = nn.Sequential(
        nn.Linear(features_shape[0], nc),
        # Only `num_features` is passed: BatchNorm1d's second positional
        # argument is `eps`, so passing `nc` twice would set eps=nc and
        # swamp the variance term during normalization.
        nn.BatchNorm1d(nc),
        nn.ReLU(),
        nn.Dropout(p=0.4),

        nn.Linear(nc, nc),
        nn.BatchNorm1d(nc),
        nn.ReLU(),
        nn.Dropout(p=0.4),

        nn.Linear(nc, label_shape[0])
    )
    return model

In [9]:
def fn_train(model, loser, optimizer, samples_train, samples_evaluate, **hp):
    """Train `model` with the `fit` helper and hand back whatever it returns.

    Training always runs for 30 epochs with a batch size of 5, tracking
    mean squared error and R2 score. `**hp` is accepted so the signature
    matches the other model functions, but it is not read here.
    """
    tracked_metrics = [torchmetrics.MeanSquaredError(), torchmetrics.R2Score()]
    outcome = fit(
        model, loser, optimizer,
        samples_train, samples_evaluate,
        epochs=30, batch_size=5,
        metrics=tracked_metrics,
    )
    return outcome

The loss function (`fn_lose`) is optional: if it is left as `None`, a loss function is selected automatically based on `analysis_type`.

In [10]:
def fn_lose(**hp):
    """Return the loss module selected by the `loss_type` hyperparameter.

    Args:
        **hp: hyperparameters. Must contain `loss_type`, one of
            'mae' (mean absolute error) or 'mse' (mean squared error).

    Returns:
        `nn.L1Loss()` for 'mae', `nn.MSELoss()` for 'mse'.

    Raises:
        KeyError: if `loss_type` is missing from `hp`.
        ValueError: if `loss_type` is not a supported value. Without this
            check an unknown value would reach the return statement with
            no loss assigned and fail with an opaque UnboundLocalError.
    """
    loss_type = hp['loss_type']
    if loss_type == 'mae':
        return nn.L1Loss()  # mean absolute error
    if loss_type == 'mse':
        return nn.MSELoss()  # mean squared error
    raise ValueError(
        f"Unsupported loss_type {loss_type!r}; expected 'mae' or 'mse'."
    )

In [11]:
# Candidate values per hyperparameter; both model functions read these keys
# (`neuron_count` in fn_build, `loss_type` in fn_lose).
hyperparameters = {
    "neuron_count": [22, 24],
    "loss_type": ["mae", "mse"],
}

In [13]:
queue = aiqc.Experiment(
    # What kind of problem this is and which framework solves it.
    library="pytorch",
    analysis_type="regression",

    # User-defined model functions; None means the default is picked automatically.
    fn_build=fn_build,
    fn_train=fn_train,
    fn_lose=fn_lose,  # custom loss selector defined in this notebook
    fn_optimize=None,  # auto
    fn_predict=None,  # auto

    # How many times to train and which hyperparameters to try.
    repeat_count=1,
    hyperparameters=hyperparameters,
    search_percent=None,

    # Which prepared data to train on.
    splitset_id=splitset.id,
    hide_test=False,
)

In [14]:
# Run the training jobs held by the experiment queue created above.
queue.run_jobs()

🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 4/4 [00:15<00:00,  3.97s/it]


---

## Visualization & Interpretation

For more information on visualization of performance metrics, reference the [Dashboard](dashboard.html) documentation.