# Chapter 5: Handling Missing Values

## Identifying missing values

In [1]:
import polars as pl

In [3]:
from datetime import date

import numpy as np

# Build a small demo frame with two columns:
#   - 'date': one entry per day from 2023-01-01 through 2023-01-15.
#   - 'avg_temp_celsius': the average temperature for that day.
# Missing readings are deliberately encoded in two different ways, None and
# np.nan, so the cells that follow can contrast how each one is detected.
date_col = pl.date_range(date(2023, 1, 1), date(2023, 1, 15), '1d', eager=True)
avg_temp_c_list = [
    -3, None, 6, -1, np.nan,
    6, 4, None, 1, 2,
    np.nan, 7, 9, -2, None,
]
# strict=False is passed because the list mixes ints, None and float NaN.
df = pl.DataFrame(
    {
        'date': date_col,
        'avg_temp_celsius': avg_temp_c_list,
    },
    strict=False,
)

In [6]:
# Preview the first 10 of the 15 rows.
df.head(10)

date,avg_temp_celsius
date,f64
2023-01-01,-3.0
2023-01-02,
2023-01-03,6.0
2023-01-04,-1.0
2023-01-05,
2023-01-06,6.0
2023-01-07,4.0
2023-01-08,
2023-01-09,1.0
2023-01-10,2.0


### How to do it...

In [5]:
# Count null entries in every column; the result is a one-row frame of counts.
df.null_count()

date,avg_temp_celsius
u32,u32
0,3


In [7]:
# Same count, restricted to a single column by selecting it first.
df.select('avg_temp_celsius').null_count()

avg_temp_celsius
u32
3


In [8]:
# Selecting several columns before counting limits the report to those columns.
df.select('date', 'avg_temp_celsius').null_count()

date,avg_temp_celsius
u32,u32
0,3


In [9]:
# Expression form: compute the null count inside a select context.
df.select(pl.col('avg_temp_celsius').null_count())

avg_temp_celsius
u32
3


In [10]:
# Flag each value as null / not null, then sum the booleans to get the count.
df.select(pl.col('avg_temp_celsius').is_null().sum())

avg_temp_celsius
u32
3


In [11]:
# Keep only the rows whose temperature is null, then count what is left.
df.filter(pl.col('avg_temp_celsius').is_null()).select(pl.len())

len
u32
3


In [12]:
# Row count of the null-only subset, read from the frame's shape as a plain int.
df.filter(pl.col('avg_temp_celsius').is_null()).shape[0]

3

In [13]:
# NaN is tracked separately from null: flag the NaN values and sum the flags.
df.select(pl.col('avg_temp_celsius').is_nan().sum())

avg_temp_celsius
u32
2


In [14]:
# Keep only the rows whose temperature is NaN, then count what is left.
df.filter(pl.col('avg_temp_celsius').is_nan()).select(pl.len())

len
u32
2


## Deleting rows and columns containing missing values

### How to do it...

In [15]:
import polars as pl

In [16]:
# Load the temperature data from CSV for this recipe.
df = pl.read_csv('../data/temperatures.csv') 

In [17]:
# Peek at the first rows; the output shows 'date' was read as str here.
df.head()

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-02""",
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-05""",


In [18]:
# Drop the rows that contain nulls, then confirm that no nulls remain.
df.drop_nulls().null_count()

date,avg_temp_celsius
u32,u32
0,0


In [19]:
# Drop nulls at the expression level, then verify the column has none left.
df.select(pl.col('avg_temp_celsius').drop_nulls().null_count())

avg_temp_celsius
u32
0


In [20]:
# Keep rows whose temperature is not null. NaN is not null, so NaN rows survive
# this filter (they are the rows with a blank temperature in the output).
df.filter(pl.col('avg_temp_celsius').is_not_null())

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-05""",
"""2023-01-06""",6.0
…,…
"""2023-01-10""",2.0
"""2023-01-11""",
"""2023-01-12""",7.0
"""2023-01-13""",9.0


In [21]:
# A column is marked for removal when at least one of its values is null.
# Each check produces a 1x1 frame; [0, 0] pulls the boolean out of it.
cols_to_drop = [
    name
    for name in df.columns
    if df.select(pl.col(name).is_null().any())[0, 0]
]
df.drop(cols_to_drop).columns

['date']

In [22]:
# Same column scan as before, but .item() extracts the single boolean
# from the 1x1 result instead of positional indexing.
cols_to_drop = [
    name
    for name in df.columns
    if df.select(pl.col(name).is_null().any()).item()
]
df.drop(cols_to_drop).columns

['date']

In [23]:
# Drop NaN values at the expression level, then verify none are left.
df.select(pl.col('avg_temp_celsius').drop_nans().is_nan().sum())

avg_temp_celsius
u32
0


In [24]:
# Keep rows whose temperature is not NaN. In the output, the rows with a null
# temperature are absent as well.
df.filter(pl.col('avg_temp_celsius').is_not_nan())

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-06""",6.0
"""2023-01-07""",4.0
"""2023-01-09""",1.0
"""2023-01-10""",2.0
"""2023-01-12""",7.0
"""2023-01-13""",9.0
"""2023-01-14""",-2.0


In [25]:
import polars.selectors as cs

# Ask "does this column contain any NaN?" for every float column at once.
# The result is a one-row frame holding one boolean per float column.
nan_flags = df.select(cs.float().is_nan().any())
# Keep only the columns whose flag is True. Taking `.columns` of the flag
# frame directly would list every float column, with or without NaN, and
# would therefore drop clean float columns too.
cols_to_drop = [
    flag.name
    for flag in nan_flags.get_columns()
    if flag.item()
]
df.drop(cols_to_drop).columns

['date']

In [26]:
# Alternative to drop(): select every column except those in cols_to_drop
# (cols_to_drop is defined in the preceding cell).
df.select(pl.exclude(cols_to_drop)).columns

['date']

### There is more...

In [27]:
# Convert NaN to null first, so a single drop_nulls() removes both kinds of
# missing value.
df.fill_nan(None).drop_nulls()

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-06""",6.0
"""2023-01-07""",4.0
"""2023-01-09""",1.0
"""2023-01-10""",2.0
"""2023-01-12""",7.0
"""2023-01-13""",9.0
"""2023-01-14""",-2.0


## Filling missing values

### How to do it...

In [28]:
# Reload the CSV so this recipe starts from the data as stored on disk.
df = pl.read_csv('../data/temperatures.csv') 
df.head()

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-02""",
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-05""",


In [29]:
# Replace nulls with a fixed literal value.
# NOTE(review): the literal is the string '1', and the recorded output shows the
# filled column coming back as str rather than f64. Confirm that this is the
# intended demonstration; a numeric literal would be needed to keep a float column.
df.select(
    'avg_temp_celsius',
    pl.col('avg_temp_celsius')
    .fill_null(pl.lit('1'))
    .alias('avg_temp_nulls_filled'),
)

avg_temp_celsius,avg_temp_nulls_filled
f64,str
-3.0,"""-3.0"""
,"""1"""
6.0,"""6.0"""
-1.0,"""-1.0"""
,"""NaN"""
…,…
,"""NaN"""
7.0,"""7.0"""
9.0,"""9.0"""
-2.0,"""-2.0"""


In [32]:
# Show the original 'avg_temp_celsius' column next to one filled copy per
# built-in fill_null strategy. Each new column is named '<strategy>_filled':
#   forward  - fill nulls using the forward fill strategy
#   backward - fill nulls using the backward fill strategy
#   mean     - fill nulls with the column mean
#   min      - fill nulls with the column minimum
#   max      - fill nulls with the column maximum
df.select(
    'avg_temp_celsius',
    **{
        f'{strategy}_filled': pl.col('avg_temp_celsius').fill_null(strategy=strategy)
        for strategy in ('forward', 'backward', 'mean', 'min', 'max')
    },
)

avg_temp_celsius,forward_filled,backward_filled,mean_filled,min_filled,max_filled
f64,f64,f64,f64,f64,f64
-3.0,-3.0,-3.0,-3.0,-3.0,-3.0
,-3.0,6.0,,-3.0,9.0
6.0,6.0,6.0,6.0,6.0,6.0
-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
,,,,,
…,…,…,…,…,…
,,,,,
7.0,7.0,7.0,7.0,7.0,7.0
9.0,9.0,9.0,9.0,9.0,9.0
-2.0,-2.0,-2.0,-2.0,-2.0,-2.0


In [35]:
# Show the original 'avg_temp_celsius' column next to two interpolated copies:
#   interpolated_linear  - interpolate() with its default (linear) method
#   interpolated_nearest - interpolate(method='nearest')
# The cell evaluates to a frame holding those three columns.
df.select(
    'avg_temp_celsius',
    pl.col('avg_temp_celsius')
    .interpolate()
    .alias('interpolated_linear'),
    pl.col('avg_temp_celsius')
    .interpolate(method='nearest')
    .alias('interpolated_nearest'),
)

avg_temp_celsius,interpolated_linear,interpolated_nearest
f64,f64,f64
-3.0,-3.0,-3.0
,1.5,6.0
6.0,6.0,6.0
-1.0,-1.0,-1.0
,,
…,…,…
,,
7.0,7.0,7.0
9.0,9.0,9.0
-2.0,-2.0,-2.0


In [37]:
# Fill nulls with values computed from the column itself:
# its median, and its spread (max minus min).
df.select(
    'avg_temp_celsius',
    pl.col('avg_temp_celsius')
    .fill_null(pl.col('avg_temp_celsius').median())
    .alias('avg_temp_median'),
    pl.col('avg_temp_celsius')
    .fill_null(pl.col('avg_temp_celsius').max() - pl.col('avg_temp_celsius').min())
    .alias('avg_temp_max_minus_min'),
)

avg_temp_celsius,avg_temp_median,avg_temp_max_minus_min
f64,f64,f64
-3.0,-3.0,-3.0
,5.0,12.0
6.0,6.0,6.0
-1.0,-1.0,-1.0
,,
…,…,…
,,
7.0,7.0,7.0
9.0,9.0,9.0
-2.0,-2.0,-2.0


### There is more...

In [38]:
# Small integer column with null runs of different lengths (three in a row,
# then a single one), used to show the effect of the fill limits below.
df = pl.DataFrame({'values': [1, 2, None, None, None, 3, 4, None, 5]})
df

values
i64
1.0
2.0
""
""
""
3.0
4.0
""
5.0


In [39]:
# Compare unlimited forward/backward fills with limited ones; `limit` caps how
# many consecutive nulls get filled.
df.select(
    'values',
    pl.col('values').forward_fill().alias('forward_fill'),
    pl.col('values').forward_fill(limit=1).alias('forward_fill_1'),
    pl.col('values').backward_fill().alias('backward_fill'),
    pl.col('values').backward_fill(limit=2).alias('backward_fill_2'),
)

values,forward_fill,forward_fill_1,backward_fill,backward_fill_2
i64,i64,i64,i64,i64
1.0,1,1.0,1,1.0
2.0,2,2.0,2,2.0
,2,2.0,3,
,2,,3,3.0
,2,,3,3.0
3.0,3,3.0,3,3.0
4.0,4,4.0,4,4.0
,4,4.0,5,5.0
5.0,5,5.0,5,5.0
