# Tutorial: Creating a Custom Task

In this tutorial we will go over the process of creating a custom task, along with a custom data module.

In [None]:
%%capture
# Install Lightning Flash and PyTorch Lightning straight from their GitHub repositories.
# The `%%capture` cell magic on the first line keeps pip's output out of the notebook.
! pip install git+https://github.com/PyTorchLightning/lightning-flash.git
! pip install git+https://github.com/PyTorchLightning/pytorch-lightning.git

In [None]:
from typing import Any, List, Tuple, Dict

import numpy as np
import torch
from pytorch_lightning import seed_everything
from sklearn import datasets
from sklearn.model_selection import train_test_split
from torch import nn, Tensor

import flash
from flash.core.data.auto_dataset import AutoDataset
from flash.core.data.data_source import DataSource
from flash.core.data.process import Postprocess, Preprocess

# Short alias for NumPy arrays, used in the type annotations of the data classes below.
ND = np.ndarray

In [None]:
# Seed the random number generators through PyTorch Lightning so that the
# notebook gives reproducible results from run to run.
seed_everything(42)

## The Task

Here we create a basic linear regression task by subclassing `flash.Task`. For the majority of tasks, you will likely only need to override the `__init__` and `forward` methods of the task.

In [None]:
class CustomPostprocess(Postprocess):
    """Postprocess that reports every prediction above a fixed threshold.

    Predictions are returned unchanged; the only effect of this class is the
    notification printed for values strictly greater than ``THRESHOLD``.
    """

    # Predictions strictly greater than this value trigger a notification.
    THRESHOLD = 14.72

    def _send_slack_message(self, pred: Any) -> None:
        """Stand-in for a real notification: print the offending prediction."""
        print(f"This prediction: {pred} is above the threshold: {self.THRESHOLD}")

    def predict_per_sample_transform(self, pred: Any) -> Any:
        """Notify when ``pred`` exceeds ``THRESHOLD`` and return ``pred`` as is.

        Args:
            pred: A single prediction; it must support ``>`` comparison with a float.

        Returns:
            The same ``pred`` object, unmodified.
        """
        if pred > self.THRESHOLD:
            self._send_slack_message(pred)
        return pred

In [None]:
class LinearRegression(flash.Task):
    """Linear regression task: one linear layer trained with MSE loss and SGD."""

    def __init__(self, num_inputs, learning_rate=0.001, metrics=None):
        """Configure the task.

        Args:
            num_inputs: Number of input features fed to the linear layer.
            learning_rate: Learning rate forwarded to the task.
            metrics: Optional metrics forwarded to the task.
        """
        super().__init__(
            # the model: a single linear layer producing one output value
            model=nn.Linear(num_inputs, 1),
            # the loss function: mean squared error
            loss_fn=torch.nn.functional.mse_loss,
            # the optimizer: plain stochastic gradient descent
            optimizer=torch.optim.SGD,
            metrics=metrics,
            learning_rate=learning_rate,
            postprocess=CustomPostprocess(),
        )

    def forward(self, x):
        """Apply the wrapped model to ``x``."""
        # overriding this method is not strictly necessary for this example
        return self.model(x)

### Where is the training step?

Most models can be trained simply by passing the output of `forward` to the supplied `loss_fn`, and then passing the resulting loss to the supplied `optimizer`. If you need a more custom configuration, you can override `step` (which is called for training, validation, and testing) or override `training_step`, `validation_step`, and `test_step` individually. These methods behave identically to PyTorch Lightning's [methods](https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html#methods).


## The Data

For a task you will likely need a specific way of loading data. 
First, you will need to implement a `flash.core.data.process.Preprocess` containing your processing logic.
Secondly, you will need to implement a `flash.core.data.data_module.DataModule` containing the instantiation logic.

We will be using the scikit-learn [Diabetes dataset](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset).

In [None]:
class NumpyDataSource(DataSource):
    """Data source that reads samples from in-memory NumPy arrays."""

    def load_data(self, data: Tuple[ND, ND], dataset: AutoDataset) -> List[Tuple[ND, float]]:
        """Pair up the arrays in ``data`` element-wise into a list of samples.

        While ``self.training`` is true, the size of the second axis of
        ``data[0]`` is also stored on ``dataset`` as ``num_inputs``.
        """
        if self.training:
            dataset.num_inputs = data[0].shape[1]
        samples = list(zip(*data))
        return samples

    def predict_load_data(self, data: ND) -> ND:
        """Return the prediction data unchanged."""
        return data

In [None]:
class NumpyPreprocess(Preprocess):
    """Preprocess that turns NumPy samples into float tensors.

    A single data source, ``NumpyDataSource``, is registered under the name
    ``"numpy"`` and made the default.
    """

    def __init__(self):
        super().__init__(data_sources={"numpy": NumpyDataSource()}, default_data_source="numpy")

    def to_tensor_transform(self, sample: Tuple[ND, float]) -> Tuple[Tensor, Tensor]:
        """Convert one ``(x, y)`` sample into a pair of float tensors."""
        x, y = sample
        x = torch.from_numpy(x).float()
        y = torch.tensor(y, dtype=torch.float)
        return x, y

    def predict_to_tensor_transform(self, sample: ND) -> Tensor:
        """Convert one prediction sample (a NumPy array) into a float tensor."""
        return torch.from_numpy(sample).float()

    def get_state_dict(self) -> Dict[str, Any]:
        """Return the state of this preprocess; it has none, so the dict is empty."""
        return {}

    @classmethod
    def load_state_dict(cls, state_dict: Dict[str, Any], strict: bool) -> "NumpyPreprocess":
        """Build a fresh instance; ``state_dict`` and ``strict`` are not used."""
        return cls()

In [None]:
class SklearnDataModule(flash.DataModule):
    """Data module that builds train and test splits from in-memory NumPy arrays."""

    @classmethod
    def from_dataset(cls, x: np.ndarray, y: np.ndarray, preprocess: Preprocess, batch_size: int = 64, num_workers: int = 0):
        """Split ``x`` and ``y`` and build a data module from the ``"numpy"`` data source.

        Args:
            x: Input features.
            y: Targets, aligned with ``x``.
            preprocess: The preprocess handed to ``from_data_source``.
            batch_size: Batch size handed to ``from_data_source``.
            num_workers: Number of workers handed to ``from_data_source``.

        Returns:
            The data module, with ``num_inputs`` copied from its training dataset.
        """
        # Hold out 20% of the samples for testing; the fixed random_state keeps
        # the split the same on every run.
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state=0)

        dm = cls.from_data_source(
            "numpy",
            train_data=(x_train, y_train),
            test_data=(x_test, y_test),
            preprocess=preprocess,
            batch_size=batch_size,
            num_workers=num_workers
        )
        # `num_inputs` is recorded on the training dataset by
        # `NumpyDataSource.load_data`; expose it on the data module as well.
        dm.num_inputs = dm._train_ds.num_inputs
        return dm

In [None]:
# Load the diabetes dataset as an (inputs, targets) pair and wrap it in our data module.
x, y = datasets.load_diabetes(return_X_y=True)
datamodule = SklearnDataModule.from_dataset(x, y, NumpyPreprocess())

In [None]:
# Size the linear layer from the number of input features recorded on the data module.
model = LinearRegression(num_inputs=datamodule.num_inputs)

## Fit

You will notice that, now that the `Preprocess` and `Postprocess` objects are defined, it is possible to run inference directly with `model.predict`.

In [None]:
# Train for at most 10 epochs.
# NOTE(review): `progress_bar_refresh_rate` may not be accepted by every
# PyTorch Lightning release - confirm against the installed version.
trainer = flash.Trainer(max_epochs=10, progress_bar_refresh_rate=20)

In [None]:
# Fit the task using the data supplied by the data module.
trainer.fit(model, datamodule=datamodule)

In [None]:
# Five samples with ten feature values each, used as input for inference below.
predict_data = np.array([[0.0199, 0.0507, 0.1048, 0.0701, -0.0360, -0.0267, -0.0250, -0.0026, 0.0037, 0.0403],
                         [-0.0128, -0.0446, 0.0606, 0.0529, 0.0480, 0.0294, -0.0176, 0.0343, 0.0702, 0.0072],
                         [0.0381, 0.0507, 0.0089, 0.0425, -0.0428, -0.0210, -0.0397, -0.0026, -0.0181, 0.0072],
                         [-0.0128, -0.0446, -0.0235, -0.0401, -0.0167, 0.0046, -0.0176, -0.0026, -0.0385, -0.0384],
                         [-0.0237, -0.0446, 0.0455, 0.0907, -0.0181, -0.0354, 0.0707, -0.0395, -0.0345, -0.0094]])

In [None]:
# Run inference on the raw NumPy array. `CustomPostprocess` prints a message
# for each prediction that is above its threshold.
predictions = model.predict(predict_data)
# out: This prediction: tensor([14.7288]) is above the threshold: 14.72

In [None]:
# Show the predictions returned by `model.predict`.
print(predictions)
# out: [tensor([14.7190]), tensor([14.7100]), tensor([14.7288]), tensor([14.6685]), tensor([14.6687])]

Like any Flash Task, we can fit our model using the `flash.Trainer` by supplying the task itself, and the associated data:

With a trained model we can now perform inference. Here we will use a few examples from the test set of our data:

Because of our custom data pipeline's `after_uncollate` method, we will get a nicely formatted output like the following:
```
[['disease progression: 14.84'],
 ['disease progression: 14.86'],
 ['disease progression: 14.78'],
 ['disease progression: 14.73'],
 ['disease progression: 14.71']]
```