## 7.1 Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# a Series of strings with one missing entry (NaN) at position 2
string_data = pd.Series(data=["a", "b", np.nan, "c"])

In [3]:
string_data

0      a
1      b
2    NaN
3      c
dtype: object

In [4]:
# isnull() returns a boolean Series that is True where a value is missing
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Python's built-in None is also treated as a missing value (NA)

string_data[0] = None

In [6]:
string_data

0    None
1       b
2     NaN
3       c
dtype: object

In [7]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [8]:
# peek at the Series instance's attribute dict; these are private pandas
# internals (underscore-prefixed), shown for curiosity only, not a stable API
vars(string_data)

{'_at': <pandas.core.indexing._AtIndexer at 0x7fdbdf6e24e0>,
 '_cache': {'hasnans': True},
 '_data': SingleBlockManager
 Items: RangeIndex(start=0, stop=4, step=1)
 ObjectBlock: 4 dtype: object,
 '_iat': <pandas.core.indexing._iAtIndexer at 0x7fdbdf6e75f8>,
 '_iloc': <pandas.core.indexing._iLocIndexer at 0x7fdbdf6e7048>,
 '_index': RangeIndex(start=0, stop=4, step=1),
 '_item_cache': {},
 '_ix': <pandas.core.indexing._IXIndexer at 0x7fdbdf7e60b8>,
 '_loc': <pandas.core.indexing._LocIndexer at 0x7fdbdf7e6400>,
 '_name': None,
 '_subtyp': 'series',
 'is_copy': None}

In [9]:
type(string_data)

pandas.core.series.Series

### Filtering Out Missing Data

In [10]:
# numeric Series with NaN at positions 1 and 3
data = pd.Series(data=[1, np.nan, 2.5, np.nan, 7])

In [11]:
data

0    1.0
1    NaN
2    2.5
3    NaN
4    7.0
dtype: float64

In [12]:
# on a Series, dropna() returns only the non-NA values, keeping their original index labels
data.dropna()

0    1.0
2    2.5
4    7.0
dtype: float64

In [13]:
# dropna() on a Series is equivalent to boolean indexing with notnull()

data[data.notnull()]

0    1.0
2    2.5
4    7.0
dtype: float64

In [14]:
# For a DataFrame, rows and columns can be partly or entirely NA:
# row 0 is complete, rows 1 and 2 are partly missing, row 3 is all NaN.
df = pd.DataFrame(
    data=[
        [1, 3, 5],
        [np.nan, 4, 6],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
)

In [15]:
df

Unnamed: 0,0,1,2
0,1.0,3.0,5.0
1,,4.0,6.0
2,1.0,,
3,,,


In [16]:
# with default arguments, dropna() drops every row that contains at least one NA
cleaned = df.dropna()

In [17]:
cleaned

Unnamed: 0,0,1,2
0,1.0,3.0,5.0


In [39]:
df.dropna?

In [19]:
# how='all' drops only the rows in which every value is NA
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,3.0,5.0
1,,4.0,6.0
2,1.0,,


In [22]:
# add a column labelled 4 that consists entirely of NaN
df[4] = np.nan

In [23]:
df

Unnamed: 0,0,1,2,4
0,1.0,3.0,5.0,
1,,4.0,6.0,
2,1.0,,,
3,,,,


In [25]:
# axis=1 applies the rule to columns: only the all-NaN column 4 is dropped
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,3.0,5.0
1,,4.0,6.0
2,1.0,,
3,,,


In [27]:
# other parameters
# thresh: keep only rows (or columns) that have at least this many non-NA values

In [28]:
# 7x3 frame of standard-normal draws; no seed is set, so the values differ on every run
df = pd.DataFrame(np.random.randn(7,3))

In [29]:
df

Unnamed: 0,0,1,2
0,-0.075611,0.801524,0.600328
1,0.599316,-0.535362,0.803058
2,0.380024,0.544764,0.653485
3,0.266128,1.456878,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [30]:
# set the first four rows of column 1 to NaN
df.iloc[:4, 1] = np.nan

In [31]:
# set the first two rows of column 2 to NaN
df.iloc[:2, 2] = np.nan

In [32]:
df

Unnamed: 0,0,1,2
0,-0.075611,,
1,0.599316,,
2,0.380024,,0.653485
3,0.266128,,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [33]:
# thresh=2 keeps only rows with at least two non-NA values, so rows 0 and 1 are dropped
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.380024,,0.653485
3,0.266128,,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [38]:
# with axis=1 the threshold is applied per column; every column has at least
# two non-NA values here, so nothing is dropped
df.dropna(axis=1, thresh=2)

Unnamed: 0,0,1,2
0,-0.075611,,
1,0.599316,,
2,0.380024,,0.653485
3,0.266128,,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


### Filling In Missing Data

In [40]:
df

Unnamed: 0,0,1,2
0,-0.075611,,
1,0.599316,,
2,0.380024,,0.653485
3,0.266128,,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [41]:
# replace every NA with the constant 0; returns a new frame and leaves df unchanged
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.075611,0.0,0.0
1,0.599316,0.0,0.0
2,0.380024,0.0,0.653485
3,0.266128,0.0,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [42]:
# pass a dict to use a different fill value for each column (keys are column labels)

df.fillna({1: 0.5, 2: 0.3})

Unnamed: 0,0,1,2
0,-0.075611,0.5,0.3
1,0.599316,0.5,0.3
2,0.380024,0.5,0.653485
3,0.266128,0.5,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [43]:
df.fillna?

In [44]:
# fillna returns a new object by default; inplace=True modifies the existing one
# (dropna accepts the same parameter). df before the in-place fill:

df

Unnamed: 0,0,1,2
0,-0.075611,,
1,0.599316,,
2,0.380024,,0.653485
3,0.266128,,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [45]:
# fills df itself; the cell produces no output because nothing is returned for display
df.fillna(0, inplace=True)

In [46]:
df

Unnamed: 0,0,1,2
0,-0.075611,0.0,0.0
1,0.599316,0.0,0.0
2,0.380024,0.0,0.653485
3,0.266128,0.0,-0.44887
4,-0.182209,0.961493,-0.832439
5,-0.417402,-0.510996,-0.678746
6,-0.572507,-1.184289,-0.328055


In [47]:
# ffill: forward fill, i.e. propagate the last valid value forward
# limit: maximum number of consecutive NaNs to fill

In [48]:
# 6x3 frame of standard-normal draws; no seed is set, so the values differ on every run
df = pd.DataFrame(np.random.randn(6,3))

In [49]:
df

Unnamed: 0,0,1,2
0,1.276711,-0.526854,1.467186
1,0.826375,0.058719,-0.911046
2,-0.350556,0.318702,1.120862
3,-0.72385,-0.726668,1.436381
4,0.203781,0.898422,-1.012126
5,1.070021,-2.320078,1.692916


In [50]:
# set column 1 to NaN from row 2 through the end
df.iloc[2:, 1] = np.nan

In [51]:
# set column 2 to NaN from row 4 through the end
df.iloc[4:, 2] = np.nan

In [52]:
df

Unnamed: 0,0,1,2
0,1.276711,-0.526854,1.467186
1,0.826375,0.058719,-0.911046
2,-0.350556,,1.120862
3,-0.72385,,1.436381
4,0.203781,,
5,1.070021,,


In [53]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is the direct spelling of fillna(method='ffill'); the
# `method` argument of fillna is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,1.276711,-0.526854,1.467186
1,0.826375,0.058719,-0.911046
2,-0.350556,0.058719,1.120862
3,-0.72385,0.058719,1.436381
4,0.203781,0.058719,1.436381
5,1.070021,0.058719,1.436381


In [54]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# The NaNs here sit at the bottom of their columns with nothing below them,
# so they stay NaN. DataFrame.bfill() is the direct spelling of
# fillna(method='bfill'); the `method` argument of fillna is deprecated in
# recent pandas releases.
df.bfill()

Unnamed: 0,0,1,2
0,1.276711,-0.526854,1.467186
1,0.826375,0.058719,-0.911046
2,-0.350556,,1.120862
3,-0.72385,,1.436381
4,0.203781,,
5,1.070021,,
