In [3]:
# Notebook-wide setup.
# Setting ast_node_interactivity to "all" makes IPython display the value of
# every top-level expression in a cell, not only the last one; the cells below
# rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [6]:
"""
Handling missing data
For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data.
We call this a sentinel value that can be easily detected:
NA: not available, NA data may either be data that does not exist or that exists but was not observed
"""
# Build the example Series from a named list; the np.nan entry is the missing value.
words = ['aardvark', 'artichoke', np.nan, 'avocado']
string_data = pd.Series(words)
string_data

# Boolean mask that is True exactly where a value is missing
# (isna is the pandas alias of isnull).
string_data.isna()

# Python's built-in None is treated as NA as well.
string_data[0] = None
string_data

'\nHandling missing data\nFor numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data.\nWe call this a sentinel value that can be easily detected:\nNA: not available, NA data may either be data that does not exist or that exists but was not observed\n'

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

0    False
1    False
2     True
3    False
dtype: bool

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [11]:
"""
dropna
fillna
"""
# dropna hands back a new Series with the NA entries removed ...
string_data.dropna()
# ... and leaves the original untouched, as echoing it again shows.
string_data

# Substitute 0 for every missing entry (again returning a new Series).
string_data.fillna(value=0)

# Boolean indexing on the "is present" mask gives the same rows as dropna().
present = string_data.notna()
string_data[present]

'\ndropna\nfillna\n'

1    artichoke
3      avocado
dtype: object

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

0            0
1    artichoke
2            0
3      avocado
dtype: object

1    artichoke
3      avocado
dtype: object

In [24]:
"""
With DataFrame objects, things are a bit more complex. 
You may want to drop rows or columns that are all NA or only those containing any NAs. 
dropna by default drops any row containing a missing value:
"""
from numpy import nan as NA

# Four rows: fully populated, partially missing, entirely missing, partially missing.
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)
data

# Default behaviour: a row is dropped as soon as it contains a single NA.
cleaned = data.dropna(axis=0)
cleaned

# how='all' only removes rows in which every value is NA.
data.dropna(how='all')

# Append a column (label 4) that is NA everywhere, then drop all-NA columns.
data[4] = NA
data
data.dropna(axis='columns', how='all')

# Random 7x3 frame with NA written into the top of columns 1 and 2.
noise = np.random.randn(7, 3)
df = pd.DataFrame(noise)
df.iloc[0:4, 1] = NA
df.iloc[0:2, 2] = NA
df

df.dropna()
# thresh=2 keeps only the rows that have at least two non-NA values.
df.dropna(thresh=2)

'\nWith DataFrame objects, things are a bit more complex. \nYou may want to drop rows or columns that are all NA or only those containing any NAs. \ndropna by default drops any row containing a missing value:\n'

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,-0.321675,,
1,0.339903,,
2,-0.029175,,1.072476
3,-0.742042,,-0.926387
4,1.239185,0.119542,0.847205
5,-0.169738,1.053247,1.415476
6,-0.763716,-0.133331,0.127168


Unnamed: 0,0,1,2
4,1.239185,0.119542,0.847205
5,-0.169738,1.053247,1.415476
6,-0.763716,-0.133331,0.127168


Unnamed: 0,0,1,2
2,-0.029175,,1.072476
3,-0.742042,,-0.926387
4,1.239185,0.119542,0.847205
5,-0.169738,1.053247,1.415476
6,-0.763716,-0.133331,0.127168


In [28]:
# Filling in missing data
df   # the frame before any filling

# A scalar fill value replaces every NA; the result is a new frame.
df.fillna(value=0)

# A dict maps column label -> fill value, so each column can get its own replacement.
per_column_fill = {1: 0.5, 2: 0}
df.fillna(value=per_column_fill)

# inplace=True modifies the existing object instead of returning a new one;
# the result is bound to `_` so nothing is echoed for this statement.
_ = df.fillna(value=0, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.321675,,
1,0.339903,,
2,-0.029175,,1.072476
3,-0.742042,,-0.926387
4,1.239185,0.119542,0.847205
5,-0.169738,1.053247,1.415476
6,-0.763716,-0.133331,0.127168


Unnamed: 0,0,1,2
0,-0.321675,0.0,0.0
1,0.339903,0.0,0.0
2,-0.029175,0.0,1.072476
3,-0.742042,0.0,-0.926387
4,1.239185,0.119542,0.847205
5,-0.169738,1.053247,1.415476
6,-0.763716,-0.133331,0.127168


Unnamed: 0,0,1,2
0,-0.321675,0.5,0.0
1,0.339903,0.5,0.0
2,-0.029175,0.5,1.072476
3,-0.742042,0.5,-0.926387
4,1.239185,0.119542,0.847205
5,-0.169738,1.053247,1.415476
6,-0.763716,-0.133331,0.127168


Unnamed: 0,0,1,2
0,-0.321675,0.0,0.0
1,0.339903,0.0,0.0
2,-0.029175,0.0,1.072476
3,-0.742042,0.0,-0.926387
4,1.239185,0.119542,0.847205
5,-0.169738,1.053247,1.415476
6,-0.763716,-0.133331,0.127168


In [32]:
# Forward-filling missing data.
# Random 6x3 frame with NA written into the tail of columns 1 and 2.
# np.nan is used directly so this cell only needs the top-of-notebook imports.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

# DataFrame.ffill() propagates the last valid observation forward. It replaces
# fillna(method='ffill'), whose `method` argument is deprecated in pandas >= 2.1.
df.ffill()

# limit=2 caps the number of consecutive NA values that get filled per gap.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.33363,1.145278,0.759558
1,1.33956,0.550751,0.937734
2,0.986391,,0.053105
3,0.95394,,1.04739
4,-0.327633,,
5,-0.290286,,


Unnamed: 0,0,1,2
0,-0.33363,1.145278,0.759558
1,1.33956,0.550751,0.937734
2,0.986391,0.550751,0.053105
3,0.95394,0.550751,1.04739
4,-0.327633,0.550751,1.04739
5,-0.290286,0.550751,1.04739


Unnamed: 0,0,1,2
0,-0.33363,1.145278,0.759558
1,1.33956,0.550751,0.937734
2,0.986391,0.550751,0.053105
3,0.95394,0.550751,1.04739
4,-0.327633,,1.04739
5,-0.290286,,1.04739
