In [1]:
%%capture
%pip install -U 'rockfish[labs]' -f 'https://packages.rockfish.ai'

In [2]:
import io
import rockfish as rf
import rockfish.actions as ra

## Fill missing values

1. Fill with a specified value
2. Fill with the previous value
3. Fill with the next value
4. Fill with the column mean
5. Fill with the column median


In [3]:
# Build a tiny in-memory CSV whose third data row has an empty `a` field,
# i.e. exactly one missing value for the fill examples to work on.
data = b"""\
a,b,c
1,2,3
4,5,6
,7,8
9,0,1
"""

csv_stream = io.BytesIO(data)
dataset = rf.Dataset.from_csv("nulls", csv_stream)

### 1. Fill missing values with a specified value


In [4]:
# Show the input data before any filling: column `a` is empty in row 2.
dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,,7,8
3,9.0,0,1


In [5]:
# Create a connection via `Connection.local()`; this same `conn` is passed to
# every workflow start and every dataset fetch in the rest of the notebook.
conn = rf.Connection.local()

In [6]:
# Transform that replaces nulls in column `fill_col` with the constant `fill_value`.
fill_col = "a"
fill_value = 42
fill_null_config = {"function": {"fill_null": [fill_col, fill_value]}}
fill_null = ra.Transform(fill_null_config)

In [7]:
save = rf.actions.DatasetSave(name="fill_value_dataset")
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(fill_null, parents=[dataset])
builder.add_action(save, parents=[fill_null])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: cOtEvoT4aB5ixLjlMDOCE


In [8]:
new_dataset = None
async for sds in workflow.datasets():
    new_dataset = await sds.to_local(conn)
new_dataset.to_pandas()

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,42,7,8
3,9,0,1


### 2. Fill missing values with the previous value in the same column


In [9]:
# Display the input dataset again for comparison with the filled result that follows.
dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,,7,8
3,9.0,0,1


In [10]:
# Transform that fills nulls in `fill_col` with the preceding value in that column.
fill_col = "a"
fill_null_config = {"function": {"fill_null_forward": [fill_col]}}
fill_null = ra.Transform(fill_null_config)

In [11]:
save = rf.actions.DatasetSave(name="fill_null_forward_dataset")
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(fill_null, parents=[dataset])
builder.add_action(save, parents=[fill_null])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 3USVaEkOy0kNNeUa9MXnal


In [12]:
new_dataset = None
async for sds in workflow.datasets():
    new_dataset = await sds.to_local(conn)
new_dataset.to_pandas()

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,4,7,8
3,9,0,1


### 3. Fill missing values with the next value in the same column


In [13]:
# Display the input dataset again for comparison with the filled result that follows.
dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,,7,8
3,9.0,0,1


In [14]:
# Transform that fills nulls in `fill_col` with the following value in that column.
fill_col = "a"
fill_null_config = {"function": {"fill_null_backward": [fill_col]}}
fill_null = ra.Transform(fill_null_config)

In [15]:
save = rf.actions.DatasetSave(name="fill_null_backward_dataset")
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(fill_null, parents=[dataset])
builder.add_action(save, parents=[fill_null])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 717OORsuUnzlnl0Y1VISpU


In [16]:
new_dataset = None
async for sds in workflow.datasets():
    new_dataset = await sds.to_local(conn)
new_dataset.to_pandas()

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,9,7,8
3,9,0,1


### 4. Fill missing values with the mean of the column


In [17]:
# Display the input dataset again for comparison with the filled result that follows.
dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,,7,8
3,9.0,0,1


In [18]:
# Transform that fills nulls in `fill_col` with an aggregate of that column;
# `fill_method` selects the aggregate (here the mean).
fill_method = "mean"
fill_col = "a"
fill_null_config = {
    "function": {"fill_null_aggregation": [fill_col, fill_method]},
}
fill_null = ra.Transform(fill_null_config)

In [19]:
save = rf.actions.DatasetSave(name="fill_mean_dataset")
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(fill_null, parents=[dataset])
builder.add_action(save, parents=[fill_null])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 4IoO4dcVkYKrtenA6abffn


In [20]:
new_dataset = None
async for sds in workflow.datasets():
    new_dataset = await sds.to_local(conn)
new_dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,4.666667,7,8
3,9.0,0,1


### 5. Fill missing values with the median of the column


In [21]:
# Display the input dataset again for comparison with the filled result that follows.
dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,,7,8
3,9.0,0,1


In [22]:
# Transform that fills nulls in `fill_col` with an aggregate of that column;
# `fill_method` selects the aggregate (here the median).
fill_method = "median"
fill_col = "a"
fill_null_config = {
    "function": {"fill_null_aggregation": [fill_col, fill_method]},
}
fill_null = ra.Transform(fill_null_config)

In [23]:
save = rf.actions.DatasetSave(name="fill_median_dataset")
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(fill_null, parents=[dataset])
builder.add_action(save, parents=[fill_null])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 4dAopcf3e255mUimHIuBtH


In [24]:
new_dataset = None
async for sds in workflow.datasets():
    new_dataset = await sds.to_local(conn)
new_dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,4.0,7,8
3,9.0,0,1


## Append new column for the transformed field

Instead of overwriting the original column, append a new column that holds the result of filling the missing values with the specified value. The original column, including its missing values, is left unchanged.


In [25]:
# Display the input dataset again for comparison with the result that follows.
dataset.to_pandas()

Unnamed: 0,a,b,c
0,1.0,2,3
1,4.0,5,6
2,,7,8
3,9.0,0,1


In [26]:
# Constant fill of column `a`, but through `ra.Apply` with "append_field":
# the filled values are written to a new column named `new_col_name`,
# while column `a` itself keeps its null.
new_col_name = "new_a"
fill_col = "a"
fill_value = 42
apply_config = {
    "function": {"fill_null": [fill_col, fill_value]},
    "append_field": new_col_name,
}
fill_null = ra.Apply(apply_config)

In [27]:
save = ra.DatasetSave(name="new_column_filled_dataset")
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(fill_null, parents=[dataset])
builder.add_action(save, parents=[fill_null])
workflow = await builder.start(conn)

print(f"Workflow: {workflow.id()}")

Workflow: azHALA6euOtwwWhyUuwKm


In [28]:
new_dataset = None
async for sds in workflow.datasets():
    new_dataset = await sds.to_local(conn)
new_dataset.to_pandas()

Unnamed: 0,a,b,c,new_a
0,1.0,2,3,1
1,4.0,5,6,4
2,,7,8,42
3,9.0,0,1,9
