In [1]:
import numpy as np
import pandas as pd

# Working with missing data

http://pandas.pydata.org/pandas-docs/stable/missing_data.html

## Missing data basics 

"Missing" = Not available for some reason. In Python/pandas it's `NaN`. Commonly introduced because of reindexing. 


In [2]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], columns=['one', 'two', 'three'])

In [3]:
df

Unnamed: 0,one,two,three
a,0.580965,-1.349902,-0.368215
c,0.413943,1.171888,0.859315
e,0.995922,0.000268,0.810805
f,0.793222,0.977586,1.770551
h,0.525656,-1.056565,1.835253


In [6]:
# Add a column by broadcasting

df['four'] = 'bar'
df['five'] = df.one > 0

In [7]:
df

Unnamed: 0,one,two,three,four,five
a,0.580965,-1.349902,-0.368215,bar,True
c,0.413943,1.171888,0.859315,bar,True
e,0.995922,0.000268,0.810805,bar,True
f,0.793222,0.977586,1.770551,bar,True
h,0.525656,-1.056565,1.835253,bar,True


In [8]:
# Reindexing, adding three new indices

df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

In [9]:
df2

Unnamed: 0,one,two,three,four,five
a,0.580965,-1.349902,-0.368215,bar,True
b,,,,,
c,0.413943,1.171888,0.859315,bar,True
d,,,,,
e,0.995922,0.000268,0.810805,bar,True
f,0.793222,0.977586,1.770551,bar,True
g,,,,,
h,0.525656,-1.056565,1.835253,bar,True


For finding the missing values: `isna` and `notna` functions.

In [13]:
df2.one

a    0.580965
b         NaN
c    0.413943
d         NaN
e    0.995922
f    0.793222
g         NaN
h    0.525656
Name: one, dtype: float64

In [14]:
pd.isna(df2.one)

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [15]:
pd.notna(df2.one)

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

In [17]:
# Or as a method

df2['one'].isna()

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [18]:
# Method chaining

df2.one.notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

## DateTimes

In [19]:
df2 = df.copy()

In [20]:
df2['date'] = pd.Timestamp('20171222')
df2

Unnamed: 0,one,two,three,four,five,date
a,0.580965,-1.349902,-0.368215,bar,True,2017-12-22
c,0.413943,1.171888,0.859315,bar,True,2017-12-22
e,0.995922,0.000268,0.810805,bar,True,2017-12-22
f,0.793222,0.977586,1.770551,bar,True,2017-12-22
h,0.525656,-1.056565,1.835253,bar,True,2017-12-22


In [24]:
# Missing time/date values are denoted with NaT 
# Interoperates with NaN

df2.loc[['a','c','h'],['one','date']] = np.nan
df2

Unnamed: 0,one,two,three,four,five,date
a,,-1.349902,-0.368215,bar,True,NaT
c,,1.171888,0.859315,bar,True,NaT
e,0.995922,0.000268,0.810805,bar,True,2017-12-22
f,0.793222,0.977586,1.770551,bar,True,2017-12-22
h,,-1.056565,1.835253,bar,True,NaT


In [25]:
df2.get_dtype_counts()

bool              1
datetime64[ns]    1
float64           3
object            1
dtype: int64

## Inserting missing data


In [27]:
df2

Unnamed: 0,one,two,three,four,five,date
a,,-1.349902,-0.368215,bar,True,NaT
c,,1.171888,0.859315,bar,True,NaT
e,0.995922,0.000268,0.810805,bar,True,2017-12-22
f,0.793222,0.977586,1.770551,bar,True,2017-12-22
h,,-1.056565,1.835253,bar,True,NaT


In [30]:
df2.loc['a', 'five'] = None

In [32]:
df2

Unnamed: 0,one,two,three,four,five,date
a,,-1.349902,-0.368215,bar,,NaT
c,,1.171888,0.859315,bar,1.0,NaT
e,0.995922,0.000268,0.810805,bar,1.0,2017-12-22
f,0.793222,0.977586,1.770551,bar,1.0,2017-12-22
h,,-1.056565,1.835253,bar,1.0,NaT


## Calculations with missing data

NaN's propagate across calculations.

In [35]:
frame1 = df2['two']
frame2 = df2['three']
frame1

a   -1.349902
c    1.171888
e    0.000268
f    0.977586
h   -1.056565
Name: two, dtype: float64

In [36]:
frame2

a   -0.368215
c    0.859315
e    0.810805
f    1.770551
h    1.835253
Name: three, dtype: float64

In [37]:
# Insert some missing values 

frame1.loc['a'] = None
frame2.loc['f'] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [38]:
frame1

a         NaN
c    1.171888
e    0.000268
f    0.977586
h   -1.056565
Name: two, dtype: float64

In [39]:
frame2

a   -0.368215
c    0.859315
e    0.810805
f         NaN
h    1.835253
Name: three, dtype: float64

In [41]:
frame1 + frame2

a         NaN
c    2.031204
e    0.811073
f         NaN
h    0.778688
dtype: float64

In [42]:
frame1 * frame2

a         NaN
c    1.007022
e    0.000217
f         NaN
h   -1.939064
dtype: float64

In [44]:
# Sums ignore NaNs

frame1.sum()

1.0931766407659047

## Cleaning and filling missing data

In [45]:
df2

Unnamed: 0,one,two,three,four,five,date
a,,,-0.368215,bar,,NaT
c,,1.171888,0.859315,bar,1.0,NaT
e,0.995922,0.000268,0.810805,bar,1.0,2017-12-22
f,0.793222,0.977586,,bar,1.0,2017-12-22
h,,-1.056565,1.835253,bar,1.0,NaT


In [46]:
# Fill NaNs with a scalar

df2.fillna(0)

Unnamed: 0,one,two,three,four,five,date
a,0.0,0.0,-0.368215,bar,0.0,0
c,0.0,1.171888,0.859315,bar,1.0,0
e,0.995922,0.000268,0.810805,bar,1.0,2017-12-22 00:00:00
f,0.793222,0.977586,0.0,bar,1.0,2017-12-22 00:00:00
h,0.0,-1.056565,1.835253,bar,1.0,0


In [47]:
# This can be a string if the column is string

df2.loc['four', 'e'] = None

In [48]:
df2

Unnamed: 0,one,two,three,four,five,date,e
a,,,-0.368215,bar,,NaT,
c,,1.171888,0.859315,bar,1.0,NaT,
e,0.995922,0.000268,0.810805,bar,1.0,2017-12-22,
f,0.793222,0.977586,,bar,1.0,2017-12-22,
h,,-1.056565,1.835253,bar,1.0,NaT,
four,,,,,,NaT,


In [49]:
df2.loc['e','four'] = None

In [50]:
df2

Unnamed: 0,one,two,three,four,five,date,e
a,,,-0.368215,bar,,NaT,
c,,1.171888,0.859315,bar,1.0,NaT,
e,0.995922,0.000268,0.810805,,1.0,2017-12-22,
f,0.793222,0.977586,,bar,1.0,2017-12-22,
h,,-1.056565,1.835253,bar,1.0,NaT,
four,,,,,,NaT,


In [51]:
df2['four'].fillna('Missing')

a           bar
c           bar
e       Missing
f           bar
h           bar
four    Missing
Name: four, dtype: object

In [52]:
# Reset original DataFrame

df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], columns=['one', 'two', 'three'])

In [53]:
df

Unnamed: 0,one,two,three
a,-1.330568,-0.712067,1.168101
c,0.155838,1.501229,-1.034452
e,0.5859,0.062052,-1.018777
f,-0.537612,-1.000003,0.933689
h,-0.995782,-1.26154,-1.683496


In [55]:
# Knock out some of the data

df.loc['f':] = None

In [56]:
df

Unnamed: 0,one,two,three
a,-1.330568,-0.712067,1.168101
c,0.155838,1.501229,-1.034452
e,0.5859,0.062052,-1.018777
f,,,
h,,,


In [58]:
# Fill in the missing data by padding

df3 = df.copy()
df3.fillna(method='pad')

Unnamed: 0,one,two,three
a,-1.330568,-0.712067,1.168101
c,0.155838,1.501229,-1.034452
e,0.5859,0.062052,-1.018777
f,0.5859,0.062052,-1.018777
h,0.5859,0.062052,-1.018777


In [59]:
# Or fill them in with a pandas object that aligns

dff = pd.DataFrame(np.random.randn(10,3), columns=list('ABC'))
dff.iloc[3:5,0] = np.nan
dff.iloc[4:6,1] = np.nan
dff.iloc[5:8,2] = np.nan

In [60]:
dff

Unnamed: 0,A,B,C
0,1.672204,-0.161571,-0.499355
1,0.5471,0.260069,-0.023778
2,-0.881139,0.090301,0.599049
3,,1.856215,0.301525
4,,,0.928933
5,1.291969,,
6,-1.235145,1.564642,
7,1.090675,-0.033697,
8,-1.175103,0.138954,0.892285
9,-0.49665,-1.483051,-1.299149


In [61]:
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,1.672204,-0.161571,-0.499355
1,0.5471,0.260069,-0.023778
2,-0.881139,0.090301,0.599049
3,0.101739,1.856215,0.301525
4,0.101739,0.278983,0.928933
5,1.291969,0.278983,0.128501
6,-1.235145,1.564642,0.128501
7,1.090675,-0.033697,0.128501
8,-1.175103,0.138954,0.892285
9,-0.49665,-1.483051,-1.299149


In [73]:
# Or just drop any column with missing values 

dff2 = dff.fillna(dff.mean())
dff2.iloc[3,3] = None
dff2

Unnamed: 0,A,B,C,D
0,1.672204,-0.161571,-0.499355,0.527792
1,0.5471,0.260069,-0.023778,-1.300362
2,-0.881139,0.090301,0.599049,-1.329465
3,0.101739,1.856215,0.301525,
4,0.101739,0.278983,0.928933,-0.498353
5,1.291969,0.278983,0.128501,0.307779
6,-1.235145,1.564642,0.128501,-2.126383
7,1.090675,-0.033697,0.128501,0.865309
8,-1.175103,0.138954,0.892285,0.515821
9,-0.49665,-1.483051,-1.299149,0.324465


In [74]:
dff2.dropna(axis=1)

Unnamed: 0,A,B,C
0,1.672204,-0.161571,-0.499355
1,0.5471,0.260069,-0.023778
2,-0.881139,0.090301,0.599049
3,0.101739,1.856215,0.301525
4,0.101739,0.278983,0.928933
5,1.291969,0.278983,0.128501
6,-1.235145,1.564642,0.128501
7,1.090675,-0.033697,0.128501
8,-1.175103,0.138954,0.892285
9,-0.49665,-1.483051,-1.299149


In [76]:
# Filling in via interpolation

df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8],'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]})
df

Unnamed: 0,A,B
0,1.0,0.25
1,2.1,
2,,
3,4.7,4.0
4,5.6,12.2
5,6.8,14.4


In [77]:
df.interpolate()

Unnamed: 0,A,B
0,1.0,0.25
1,2.1,1.5
2,3.4,2.75
3,4.7,4.0
4,5.6,12.2
5,6.8,14.4


## Replacing generic values 

In [79]:
s = pd.Series([0,1,2,3,4,5])
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [80]:
s.replace(0,9)

0    9
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [82]:
# Or give a list 

s.replace([0,1,2,3,4,5], [11,12,13,14,15,16])

0    11
1    12
2    13
3    14
4    15
5    16
dtype: int64

In [84]:
# Or a dictionary that specifies the mapping
s.replace({0: 99, 2:999})

0     99
1      1
2    999
3      3
4      4
5      5
dtype: int64

### Replacing with strings and regex's

I can see this being important in FlipCalc.

In [90]:
d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
df = pd.DataFrame(d)
df

Unnamed: 0,a,b,c
0,0,a,a
1,1,b,b
2,2,.,
3,3,.,d


In [92]:
# This looks for a raw string with the form ".", and removes the whitespace before and after

df.replace(r'\s*\.\s*', np.nan, regex=True)

Unnamed: 0,a,b,c
0,0,a,a
1,1,b,b
2,2,,
3,3,,d
