# Handling Missing Values

## Identifying missing values

In [39]:
import polars as pl

In [40]:
from datetime import date
import numpy as np

# One calendar day per row, 1 to 15 January 2023 (both ends included).
date_col = pl.date_range(
    start=date(2023, 1, 1),
    end=date(2023, 1, 15),
    interval='1d',
    eager=True,
)

# Daily readings that mix the two kinds of "missing": None (null) and NaN.
avg_temp_c_list = [
    -3, None, 6, -1, np.nan,
    6, 4, None, 1, 2,
    np.nan, 7, 9, -2, None,
]

df = pl.DataFrame(
    {
        'date': date_col,
        'avg_temp_celsius': avg_temp_c_list,
    }
)

In [41]:
# Preview the first few rows to confirm the frame was built as expected.
df.head()

date,avg_temp_celsius
date,f64
2023-01-01,-3.0
2023-01-02,
2023-01-03,6.0
2023-01-04,-1.0
2023-01-05,


### How to do it...

In [42]:
# Count the nulls in every column at once; the result is a one-row frame
# holding one count per column. NaN is not null, so the NaN readings in
# avg_temp_celsius are not included in this count.
df.null_count()

date,avg_temp_celsius
u32,u32
0,3


In [75]:
# Restrict the null count to a single column by selecting it first.
df.select('avg_temp_celsius').null_count()

avg_temp_celsius
u32
3


In [76]:
# Selecting several columns first limits the null count to just those columns.
df.select('date', 'avg_temp_celsius').null_count()

date,avg_temp_celsius
u32,u32
0,3


In [77]:
# Expression form: the null count is computed inside select() on the column
# expression, rather than by calling null_count() on a frame.
df.select(pl.col('avg_temp_celsius').null_count())

avg_temp_celsius
u32
3


In [74]:
# Flag each null as a boolean, then add up the True values to get the count.
null_total = pl.col('avg_temp_celsius').is_null().sum()
df.select(null_total)

avg_temp_celsius
u32
3


In [70]:
# Keep only the rows whose temperature is null, then count what is left.
# pl.len() replaces the deprecated pl.count(); the alias keeps the original
# 'count' column name in the displayed result.
(
    df
    .filter(pl.col('avg_temp_celsius').is_null())
    .select(pl.len().alias('count'))
)

count
u32
3


In [71]:
# The number of rows that survive the null filter is the null count.
df.filter(pl.col('avg_temp_celsius').is_null()).height

3

In [81]:
# NaN is tracked separately from null: flag each NaN, then sum the flags.
nan_total = pl.col('avg_temp_celsius').is_nan().sum()
df.select(nan_total)

avg_temp_celsius
u32
2


In [82]:
# Keep only the rows whose temperature is NaN, then count what is left.
# pl.len() replaces the deprecated pl.count(); the alias keeps the original
# 'count' column name in the displayed result.
(
    df
    .filter(pl.col('avg_temp_celsius').is_nan())
    .select(pl.len().alias('count'))
)

count
u32
2


## Deleting rows and columns containing missing values

### How to do it...

In [87]:
# Reload the temperature data from a CSV file (path is relative to the
# working directory). This rebinds `df`, replacing the frame built earlier.
df = pl.read_csv('temperatures.csv')

In [88]:
# Preview the first few rows of the frame loaded from the CSV file.
df.head()

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-02""",
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-05""",


In [93]:
# drop_nulls() removes every row that has a null in any column; the
# follow-up null_count() confirms that no nulls remain.
df.drop_nulls().null_count()

date,avg_temp_celsius
u32,u32
0,0


In [96]:
# Drop the nulls from the column expression itself, then verify none are left.
temps_without_nulls = pl.col('avg_temp_celsius').drop_nulls()
df.select(temps_without_nulls.null_count())

avg_temp_celsius
u32
0


In [151]:
# Keep only the rows whose avg_temp_celsius is not null, written as an
# explicit filter predicate.
df.filter(pl.col('avg_temp_celsius').is_not_null())

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-02""",
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-06""",6.0
"""2023-01-07""",4.0
"""2023-01-08""",
"""2023-01-09""",1.0
"""2023-01-10""",2.0
"""2023-01-12""",7.0


In [137]:
# A column is dropped when it holds at least one null.
cols_to_drop = [
    name
    for name in df.columns
    if df[name].null_count() > 0
]
df.drop(cols_to_drop).columns

['date']

In [162]:
# Drop the NaN values from the column expression, then count any NaN left over.
temps_without_nans = pl.col('avg_temp_celsius').drop_nans()
df.select(temps_without_nans.is_nan().sum())

avg_temp_celsius
u32
0


In [163]:
# Keep only the rows whose avg_temp_celsius is not NaN.
# NOTE(review): is_not_nan() presumably yields null for null inputs, and
# filter() discards rows whose predicate is null, so null rows would be
# removed here as well -- confirm against the displayed output.
df.filter(pl.col('avg_temp_celsius').is_not_nan())

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-02""",
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-06""",6.0
"""2023-01-07""",4.0
"""2023-01-08""",
"""2023-01-09""",1.0
"""2023-01-10""",2.0
"""2023-01-12""",7.0


In [193]:
import polars.selectors as cs

# One boolean per float column: True when that column contains any NaN.
nan_flags = df.select(cs.float().is_nan().any())

# Read the flag values. Taking `.columns` of `nan_flags` directly would list
# every float column, whether or not it actually contains a NaN.
cols_to_drop = [flag.name for flag in nan_flags.get_columns() if flag.item()]
df.drop(cols_to_drop).columns

['date']

In [203]:
# Alternative to drop(): select every column except the ones listed in
# cols_to_drop (built in an earlier cell).
df.select(pl.exclude(cols_to_drop)).columns

['date']

### There is more...

In [204]:
# Convert NaN to null first so that a single drop_nulls() removes the rows
# with either kind of missing value.
df.fill_nan(None).drop_nulls()

date,avg_temp_celsius
str,f64
"""2023-01-01""",-3.0
"""2023-01-03""",6.0
"""2023-01-04""",-1.0
"""2023-01-06""",6.0
"""2023-01-07""",4.0
"""2023-01-09""",1.0
"""2023-01-10""",2.0
"""2023-01-12""",7.0
"""2023-01-13""",9.0
"""2023-01-14""",-2.0
