# Chapter 5: Handling Missing Values

## Identifying missing values

In [None]:
import polars as pl

In [None]:
from datetime import date
import numpy as np

# Fifteen consecutive days paired with a temperature series that contains
# both kinds of missing value: None (null) and np.nan (a float NaN).
date_col = pl.date_range(date(2023, 1, 1), date(2023, 1, 15), '1d', eager=True)
# Float literals throughout: NaN only exists in a float column, and a list
# that mixes Python ints with np.nan relies on version-specific dtype
# inference (strict construction can reject it outright).
avg_temp_c_list = [
    -3.0, None, 6.0, -1.0, np.nan,
    6.0, 4.0, None, 1.0, 2.0,
    np.nan, 7.0, 9.0, -2.0, None,
]
df = pl.DataFrame({
    'date': date_col,
    'avg_temp_celsius': avg_temp_c_list,
})

In [None]:
# Preview the first rows of the frame built above.
df.head()

### How to do it...

In [None]:
# Count the nulls in every column of the frame at once.
df.null_count()

In [None]:
# Narrow the frame to the temperature column, then count its nulls.
df.select(pl.col('avg_temp_celsius')).null_count()

In [None]:
# Same frame-level count, limited to an explicit list of columns.
df.select(['date', 'avg_temp_celsius']).null_count()

In [None]:
# Expression-level variant: the count is evaluated inside the select context.
temp_null_count = pl.col('avg_temp_celsius').null_count()
df.select(temp_null_count)

In [None]:
# is_null() produces a boolean mask; summing the mask counts its True entries.
is_missing = pl.col('avg_temp_celsius').is_null()
df.select(is_missing.sum())

In [None]:
# Keep only the rows whose temperature is null, then count what is left.
# NOTE(review): recent Polars releases deprecate the no-argument pl.count()
# in favour of pl.len() — confirm against the installed version.
null_rows = df.filter(pl.col('avg_temp_celsius').is_null())
null_rows.select(pl.count())

In [None]:
# Same count as a plain number: the first entry of the filtered frame's shape.
df.filter(pl.col('avg_temp_celsius').is_null()).shape[0]

In [None]:
# NaN is checked with is_nan() rather than is_null(); summing the boolean
# mask counts the NaN entries.
is_not_a_number = pl.col('avg_temp_celsius').is_nan()
df.select(is_not_a_number.sum())

In [None]:
# Keep only the rows whose temperature is NaN, then count what is left.
# NOTE(review): recent Polars releases deprecate the no-argument pl.count()
# in favour of pl.len() — confirm against the installed version.
nan_rows = df.filter(pl.col('avg_temp_celsius').is_nan())
nan_rows.select(pl.count())

## Deleting rows and columns containing missing values

### How to do it...

In [2]:
import polars as pl

In [3]:
# Load the temperature data from the CSV file used throughout this recipe.
temperatures_path = '../data/temperatures.csv'
df = pl.read_csv(temperatures_path)

In [4]:
# Preview the first rows of the loaded CSV data.
df.head()

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-02""",
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-05""",


In [None]:
# Drop the rows containing nulls, then re-count nulls on the result to
# verify that none remain.
df.drop_nulls().null_count()

In [None]:
# Column-level check: strip the nulls inside the expression and confirm that
# the remaining values contain none.
temp_without_nulls = pl.col('avg_temp_celsius').drop_nulls()
df.select(temp_without_nulls.null_count())

In [None]:
# Filter-based alternative: keep only the rows whose temperature is not null.
df.filter(pl.col('avg_temp_celsius').is_not_null())

In [None]:
# Names of the columns holding at least one null. Each Series reports its own
# null count, so no per-column select query (or positional [0, 0] lookup on
# its result) is needed.
cols_to_drop = [
    series.name for series in df.get_columns() if series.null_count() > 0
]
# Show which columns survive once the null-bearing ones are removed.
df.drop(cols_to_drop).columns

In [None]:
# Strip the NaN values inside the expression, then count NaNs on what is
# left to confirm the removal.
temp_without_nans = pl.col('avg_temp_celsius').drop_nans()
df.select(temp_without_nans.is_nan().sum())

In [None]:
# Filter-based alternative: keep only the rows whose temperature is not NaN.
df.filter(pl.col('avg_temp_celsius').is_not_nan())

In [None]:
import polars.selectors as cs

# One boolean per float column: does it contain any NaN?
nan_flags = df.select(cs.float().is_nan().any())
# Reading `.columns` from that frame would list every float column, with or
# without NaN, so the flag value itself has to be inspected for each column.
# to_dict() yields an empty mapping when the frame has no float columns.
cols_to_drop = [
    name
    for name, flags in nan_flags.to_dict(as_series=False).items()
    if flags[0]
]
# Show which columns survive once the NaN-bearing ones are removed.
df.drop(cols_to_drop).columns

In [None]:
# Select-based alternative to drop(): keep everything except the names in
# cols_to_drop (assigned in an earlier cell).
df.select(pl.exclude(cols_to_drop)).columns

### There is more...

In [None]:
# Turn NaN into null first so that a single drop_nulls() call removes both
# kinds of missing value.
df.fill_nan(None).drop_nulls()

## Filling missing values

### How to do it...

In [35]:
# Reload the CSV so the fills below start from the unmodified data.
temperatures_path = '../data/temperatures.csv'
df = pl.read_csv(temperatures_path)
df.head()

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-02""",
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-05""",


In [8]:
# Replace nulls with a literal value, shown next to the original column.
# NOTE(review): the fill value is the string '1', and the recorded output
# shows the filled column as str rather than f64 — use a numeric literal
# (e.g. pl.lit(1.0)) if the column is meant to stay numeric.
temp = pl.col('avg_temp_celsius')
df.select(
    temp,
    temp.fill_null(pl.lit('1')).alias('avg_temp_nulls_filled'),
)

avg_temp_celsius,avg_temp_nulls_filled
f64,str
-3.0,"""-3.0"""
,"""1"""
6.0,"""6.0"""
-1.0,"""-1.0"""
,"""NaN"""
6.0,"""6.0"""
4.0,"""4.0"""
,"""1"""
1.0,"""1.0"""
2.0,"""2.0"""


In [10]:
# Compare the built-in fill strategies side by side; each output column is
# named '<strategy>_filled'.
# NOTE(review): the recorded output shows no value for mean_filled on the
# null rows — presumably the NaN entries make the mean NaN; confirm whether
# fill_nan(None) should be applied first.
temp = pl.col('avg_temp_celsius')
strategies = ('forward', 'backward', 'mean', 'min', 'max')
df.select(
    temp,
    **{
        f'{strategy}_filled': temp.fill_null(strategy=strategy)
        for strategy in strategies
    },
)

avg_temp_celsius,forward_filled,backward_filled,mean_filled,min_filled,max_filled
f64,f64,f64,f64,f64,f64
-3.0,-3.0,-3.0,-3.0,-3.0,-3.0
,-3.0,6.0,,-3.0,9.0
6.0,6.0,6.0,6.0,6.0,6.0
-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
,,,,,
6.0,6.0,6.0,6.0,6.0,6.0
4.0,4.0,4.0,4.0,4.0,4.0
,4.0,1.0,,-3.0,9.0
1.0,1.0,1.0,1.0,1.0,1.0
2.0,2.0,2.0,2.0,2.0,2.0


In [14]:
# Fill nulls by interpolation: the default method next to method='nearest'.
temp = pl.col('avg_temp_celsius')
df.select(
    temp,
    temp.interpolate().alias('interpolated_linear'),
    temp.interpolate(method='nearest').alias('interpolated_nearest'),
)

avg_temp_celsius,interpolated_linear,interpolated_nearest
f64,f64,f64
-3.0,-3.0,-3.0
,1.5,6.0
6.0,6.0,6.0
-1.0,-1.0,-1.0
,,
6.0,6.0,6.0
4.0,4.0,4.0
,2.5,1.0
1.0,1.0,1.0
2.0,2.0,2.0


In [27]:
# Fill nulls with values computed from the column itself: its median, and
# the spread between its maximum and minimum.
# NOTE(review): NaN entries are still present in the column at this point and
# may take part in the median — confirm whether fill_nan(None) should come
# first.
temp = pl.col('avg_temp_celsius')
temp_range = temp.max() - temp.min()
df.select(
    temp,
    avg_temp_median=temp.fill_null(temp.median()),
    avg_temp_max_minus_min=temp.fill_null(temp_range),
)

avg_temp_celsius,avg_temp_median,avg_temp_max_minus_min
f64,f64,f64
-3.0,-3.0,-3.0
,5.0,12.0
6.0,6.0,6.0
-1.0,-1.0,-1.0
,,
6.0,6.0,6.0
4.0,4.0,4.0
,5.0,12.0
1.0,1.0,1.0
2.0,2.0,2.0


### There is more...

In [39]:
# Small integer column with runs of consecutive nulls, used to show the
# effect of the `limit` argument in the next cell.
sparse_values = [1, 2, None, None, None, 3, 4, None, 5]
df = pl.DataFrame({'values': sparse_values})
df

values
i64
1.0
2.0
""
""
""
3.0
4.0
""
5.0


In [42]:
# Forward/backward fill with and without `limit`; as the recorded output
# shows, `limit` caps how many consecutive nulls get filled in a run.
vals = pl.col('values')
df.select(
    vals,
    vals.forward_fill().alias('forward_fill'),
    vals.forward_fill(limit=1).alias('forward_fill_1'),
    vals.backward_fill().alias('backward_fill'),
    vals.backward_fill(limit=2).alias('backward_fill_2'),
)

values,forward_fill,forward_fill_1,backward_fill,backward_fill_2
i64,i64,i64,i64,i64
1.0,1,1.0,1,1.0
2.0,2,2.0,2,2.0
,2,2.0,3,
,2,,3,3.0
,2,,3,3.0
3.0,3,3.0,3,3.0
4.0,4,4.0,4,4.0
,4,4.0,5,5.0
5.0,5,5.0,5,5.0
