# NA in dataframes

In [1]:
import numpy as np
import pandas as pd
# Limit the number of displayed rows, just for convenience.
# The fully qualified key is required: the bare 'max_rows' pattern is ambiguous
# in recent pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 8)

## Load data

In [2]:
# Read the movie dataset from a CSV file (path is relative to the notebook) into a dataframe
films = pd.read_csv('data/movie.csv')

## Finding NA in df
Method `isna()` is an alias for `isnull()` - they are equivalent

In [21]:
# Returns a boolean dataframe of the same shape: True where the value is NA, False otherwise
films.isna()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,False,True,False,False,True,False,False,False,True,False,...,False,False,False,False,True,True,False,False,False,False
4913,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,True,False
4914,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4915,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# Returns a boolean series with one entry per column: True if the column contains at least 1 NA
films.isna().any()

color                      True
director_name              True
num_critic_for_reviews     True
duration                   True
                          ...  
actor_2_facebook_likes     True
imdb_score                False
aspect_ratio               True
movie_facebook_likes      False
Length: 28, dtype: bool

In [13]:
# Returns a single boolean: True if there is at least 1 NA anywhere in the dataframe
films.isna().any().any()

True

There is also the opposite method to `isna()` - `notna()`, which returns True if a value is present and False otherwise. It is an alias for `notnull()` - they are equivalent

In [43]:
# Returns a boolean dataframe: True where the value is present, False where it is NA
films.notna()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,True,False,True,True,False,True,True,True,False,True,...,True,True,True,True,False,False,True,True,True,True
4913,True,True,True,True,True,True,True,True,False,True,...,True,True,True,False,True,True,True,True,False,True
4914,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,False,True,True,True,True,True
4915,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


## Counting NA
`True` and `False` are interpreted as 1 and 0, so we can use `sum()` to count NA

In [17]:
# Returns a series with the number of NA in each column
films.isna().sum()

color                      19
director_name             102
num_critic_for_reviews     49
duration                   15
                         ... 
actor_2_facebook_likes     13
imdb_score                  0
aspect_ratio              326
movie_facebook_likes        0
Length: 28, dtype: int64

In [23]:
# Returns the total number of NA in the whole dataframe (sum of the per-column counts)
films.isna().sum().sum()

2654

## Note about style
In the previous examples we chained methods together, which works because each of them returns an object that has the subsequent method. It is possible to arrange the calls vertically, one method call beneath the other.

In [26]:
# Way 1: line continuation with a trailing backslash \
# The backslash must be the very last character on the line, so inline comments
# are not possible here (a comment after the backslash is a SyntaxError).
# Steps: isna() -> df with boolean values,
#        sum()  -> series with number of NA for each column,
#        sum()  -> scalar with number of NA in df
films.isna()\
     .sum()\
     .sum()

# Way 2: surround the expression with parentheses ()
# Inline comments are allowed on every line of a parenthesized expression
(films.isna() # df with boolean values
      .sum()  # series with number of NA for each column
      .sum()) # scalar with number of NA in df

2654

In [37]:
# Per-column minimum of the numeric columns without skipping NA:
# with skipna=False the minimum of any column that contains NA is NaN.
# NOTE(review): the original cell called `pd.DataFrame.equals()` with no
# arguments, which raises TypeError; this call is reconstructed from the recorded
# output (16 float values, NaN for the columns that contain NA) - confirm that
# this is the intended example.
films.select_dtypes(include='number').min(skipna=False)

  return umr_minimum(a, axis, None, out, keepdims)


num_critic_for_reviews     NaN
duration                   NaN
director_facebook_likes    NaN
actor_3_facebook_likes     NaN
                          ... 
actor_2_facebook_likes     NaN
imdb_score                 1.6
aspect_ratio               NaN
movie_facebook_likes       0.0
Length: 16, dtype: float64

## Drop NA
One of the ways to deal with NA is to simply remove the rows (or columns) that contain them. But you will lose some data in this case.  
`dropna()` has the following parameters:
* `axis` to operate on, default 0
* `how` - mode to drop NA:
    * 'any' - drop the row or column if it has at least 1 NA
    * 'all' - drop the row or column if all cells in it are NA
* `thresh` - threshold for the number of non-NA values in a row or column: if it has fewer non-NA values than specified, it will be dropped

In [74]:
# Drop only the rows (default axis 0) in which every cell is NA
films.dropna(how='all')

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660
4915,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,85222.0,Documentary,...,84.0,English,USA,PG,1100.0,2004.0,23.0,6.6,1.85,456


## Fill NA
Filling NA with some values (the closer they are to the real values the better) is usually more powerful than discarding rows with NA, because the information in the other cells of those rows is kept. The most primitive way is to substitute missing values with some constant, usually 0

In [4]:
# Substitute every NA in the dataframe with the constant 0
# (this applies to text columns as well, e.g. a missing director_name becomes 0)
films.fillna(0)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,Color,0,43.0,43.0,0.0,319.0,Valorie Curry,841.0,0.0,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,0.0,0.0,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,0.0,Drama|Horror|Thriller,...,3.0,English,USA,0,1400.0,2013.0,0.0,6.3,0.00,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,0.0,2012.0,719.0,6.3,2.35,660
4915,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,85222.0,Documentary,...,84.0,English,USA,PG,1100.0,2004.0,23.0,6.6,1.85,456


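The `fillna()` method can also repeat neighbouring values of a column to replace NA, with different fill strategies available in the `method` argument (this argument is deprecated since pandas 2.1 in favour of the dedicated `ffill()` and `bfill()` methods, which do the same).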

In [6]:
# Forward fill: put the previously encountered value of the column into the NA cell.
# `ffill()` is the dedicated method for what used to be `fillna(method='ffill')`;
# the `method` argument of `fillna()` is deprecated since pandas 2.1.
films.ffill()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,Color,Scott Smith,43.0,43.0,2.0,319.0,Valorie Curry,841.0,4584.0,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,9000.0,2013.0,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,4584.0,Drama|Horror|Thriller,...,3.0,English,USA,TV-14,1400.0,2013.0,0.0,6.3,16.00,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,1400.0,2012.0,719.0,6.3,2.35,660
4915,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,85222.0,Documentary,...,84.0,English,USA,PG,1100.0,2004.0,23.0,6.6,1.85,456


It is quite common to substitute missing values with the median or some other meaningful value

In [15]:
# Compute the median of the budget column first (NA are skipped by median()) ...
budget_median = films['budget'].median()
# ... then fill NA with it in that column only; the other columns are left untouched
films.fillna({'budget': budget_median})

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,19850000.0,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,19850000.0,2012.0,719.0,6.3,2.35,660
4915,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,85222.0,Documentary,...,84.0,English,USA,PG,1100.0,2004.0,23.0,6.6,1.85,456


A better usage of `fillna()` is to fill NA with some value inferred from a group of similar observations - the unknown height of a girl is probably closer to the mean height of other girls than to the mean height of boys and girls, or of young and old people, taken together.  
So we can group observations according to some feature (sex or age in the previous example), infer a value for each group (e.g. the mean) and substitute NA with it

In [43]:
# Here we take a column by which we will group observations and another column where 
# we will substitute NA
# After that we apply function which fill NA with mean to each group independently with apply method
def mean_sub(group):
    """Return a copy of `group` with NA replaced by the group's own mean.

    For a dataframe, every numeric column is filled with the mean of that
    column; non-numeric columns (e.g. the grouping key) are ignored when the
    means are computed and are left as they are. For a series, NA are filled
    with the mean of the series.
    """
    if isinstance(group, pd.DataFrame):
        # numeric_only=True keeps text columns out of the mean; without it
        # recent pandas raises TypeError on frames with string columns
        fill_values = group.mean(numeric_only=True)
    else:
        fill_values = group.mean()
    return group.fillna(fill_values)
# Keep only the column to fill and the grouping column, then fill NA per director group
likes_and_director = films[['movie_facebook_likes', 'director_name']]
likes_and_director.groupby('director_name').apply(mean_sub)

Unnamed: 0_level_0,Unnamed: 1_level_0,movie_facebook_likes,director_name
director_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A. Raven Cruz,4498,128,A. Raven Cruz
Aaron Hann,4221,0,Aaron Hann
Aaron Schneider,3430,0,Aaron Schneider
Aaron Seltzer,2156,806,Aaron Seltzer
...,...,...,...
Álex de la Iglesia,3185,0,Álex de la Iglesia
Émile Gaudreault,3609,352,Émile Gaudreault
Éric Tessier,4103,39,Éric Tessier
Étienne Faure,4731,114,Étienne Faure


`fillna()` is a special case of the `replace()` method, which gives you the ability to replace some value or values with another. It can take single values, lists or dictionaries

## Interpolation
There is also the `interpolate()` method, which tries to infer missing values using one of several techniques selectable via the `method` parameter

In [48]:
# Fill NA in the budget column by interpolation, using interpolate()'s default settings
films['budget'].interpolate()

## Note about operations with NA
As you know, we can perform arithmetic operations on pandas objects. If they have non-identical indexes, the result will have NA at the labels that are not shared. Sometimes this is the desired behaviour, sometimes not - e.g. treating the missing values as 0 may be preferable. If NA is ok, just use the operators; if not, you can use the methods analogous to the operators (e.g. `add()` for `+`) with the `fill_value` argument

In [64]:
# Two series holding the same values but labelled with non-identical indexes:
# only the labels 0, 2 and 3 are present in both of them
shared_values = [1, 2, 3, 4, 5]
a = pd.Series(data=shared_values, index=[0, 1, 2, 3, 4])
b = pd.Series(data=shared_values, index=[0, 2, 3, 5, 7])
a

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [67]:
# Their sum (or the result of any other arithmetic operation) has NA at every
# label that is missing from at least one of the series
a + b

0    2.0
1    NaN
2    5.0
3    7.0
4    NaN
5    NaN
7    NaN
dtype: float64

In [69]:
# A value that is absent from one of the series is treated as 0 there,
# so here the result has no NA
a.add(b, fill_value=0)

0    2.0
1    2.0
2    5.0
3    7.0
4    5.0
5    4.0
7    5.0
dtype: float64