In [1]:
import numpy as np
import pandas as pd

In [5]:
np.nan # Not a number

nan

In [7]:
np.nan == np.nan # False: NaN never compares equal to anything, not even itself, so == cannot be used to detect it

False

In [8]:
# To check whether a value is NaN, use pd.isna() (np.isnan() also works for floats).
# Do not rely on `is`: it only matches the np.nan object itself, while NaNs created
# elsewhere (e.g. float("nan") or the result of a calculation) are different objects.
pd.isna(np.nan) # True

True

In [9]:
myVar = np.nan

In [11]:
# pd.isna() recognises any NaN value; `is np.nan` would only match the np.nan object itself
pd.isna(myVar) # True

True

In [12]:
df = pd.read_csv("../movie_scores.csv")

In [13]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


One way to check if any value is null is to use isnull() method on df

In [16]:
df.isnull() # returns a dataframe with boolean values
# df.notnull() for opposite

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


To select only the rows that have a non-null pre_movie_score, we filter with notnull():

In [18]:
df[df['pre_movie_score'].notnull()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [20]:
# for row missing pre_movie_score
df[df['pre_movie_score'].isnull()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
1,,,,,,
2,Hugh,Jackman,51.0,m,,


In [24]:
df[(df['pre_movie_score'].isnull()) & (df['first_name'].notnull())] # rows with a name but no pre_movie_score; combine boolean masks with & (element-wise AND), not `and` / &&

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
2,Hugh,Jackman,51.0,m,,


In [25]:
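# Three options for handling missing data:
# KEEP DATA => leave the missing values as they are
# DROP DATA => remove the rows or columns that contain missing values
# FILL DATA => replace the missing values with a chosen value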

In [29]:
# dropna() => drops rows with missing values
# main parameters are 'axis' for where to drop, 'how' for which values to drop(some/all), 'thresh' for min number of non-missing values to keep
# subset allows for certain columns

In [30]:
df.dropna() # drops all rows with any missing values

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [34]:
# To drop the row where every value is NA while keeping rows where only some values are NA,
# we can use the thresh parameter of dropna() (how='all' is the other way to drop only all-NA rows).
# thresh=2 means that a row must have at least 2 non-missing values to be kept.
df.dropna(thresh=2) # Hugh has 4 non-missing values so his row is kept; with thresh=5 it would be removed

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [36]:
# default axis is 0, so we can drop columns with missing values by setting axis=1
df.dropna(axis=1) # since every column has at least 1 missing value, all columns are dropped

0
1
2
3
4


In [41]:
df.dropna(axis=1, thresh=4) # pre_movie_score and post_movie_score each have only 3 non-missing values (fewer than 4), so both columns are dropped

Unnamed: 0,first_name,last_name,age,sex
0,Tom,Hanks,63.0,m
1,,,,
2,Hugh,Jackman,51.0,m
3,Oprah,Winfrey,66.0,f
4,Emma,Stone,31.0,f


In [42]:
# the subset parameter restricts the missing-value check to the listed columns
df.dropna(subset=['last_name']) # drops only the rows whose last_name is missing

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


#### Filling data in NA values using fillna()

In [46]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [51]:
df.fillna('New Value') # fills all missing values with 'New Value'
# better to grab column or row to fill missing values

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,New Value,New Value,New Value,New Value,New Value,New Value
2,Hugh,Jackman,51.0,m,New Value,New Value
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [52]:
df['pre_movie_score'].fillna(0.0)

0    8.0
1    0.0
2    0.0
3    6.0
4    7.0
Name: pre_movie_score, dtype: float64

In [54]:
# df['pre_movie_score'] = df['pre_movie_score'].fillna(0.0) # assigning the result back would change the original dataframe

In [59]:
# if we want to fill with avg movie scores, we can use mean()
avg_score = df['pre_movie_score'].mean() # considers existing values only
df['pre_movie_score'].fillna(avg_score)

0    8.0
1    7.0
2    7.0
3    6.0
4    7.0
Name: pre_movie_score, dtype: float64

In [60]:
# Ticket price per cabin class; the business fare is unknown, so it is stored as NaN
air_tickets = {
    'first': 100,
    'business': np.nan,
    'economy-plus': 50,
    'economy': 30,
}

In [61]:
# convert to pandas series
ser = pd.Series(air_tickets)

In [62]:
ser

first           100.0
business          NaN
economy-plus     50.0
economy          30.0
dtype: float64

In [63]:
# one option is to fill the missing business class fare with the mean of the known fares
# (first, economy-plus and economy): (100 + 50 + 30) / 3 = 60
ser.fillna(ser.mean())

first           100.0
business         60.0
economy-plus     50.0
economy          30.0
dtype: float64

In [65]:
# instead of the overall mean, interpolate() fills a missing value linearly from its neighbouring values in index order
ser.interpolate() # fills business class with 75, halfway between first (100) and economy-plus (50)

first           100.0
business         75.0
economy-plus     50.0
economy          30.0
dtype: float64

#### Conclusions
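1. Avoid comparing null values with `==` (NaN is never equal to itself), and do not rely on `is` either; use pd.isna() / isnull() to test for them
2. The isnull() and notnull() methods check for missing values element-wise
3. The dropna() and fillna() methods remove or fill NA values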