# Pandas

# Missing Data

A few convenient methods to deal with Missing Data in pandas:

In [108]:
# Imports live in the first code cell so the notebook runs on a fresh kernel.
import numpy as np
import pandas as pd

# Demo frame with deliberate gaps: A has one NaN, B has two, C is complete.
df = pd.DataFrame({'A': [1, 2, np.nan],
                   'B': [5, np.nan, np.nan],
                   'C': [1, 2, 3]})

In [60]:
# Column-wise totals; missing entries are skipped rather than turning the sum into NaN
df.sum()

A    3.0
B    5.0
C    6.0
dtype: float64

In [63]:
# Same reduction on a single column (a Series): 1 + 2, with the NaN ignored
df['A'].sum()

3.0

In [64]:
# count() tallies only the non-missing entries, so the NaN in A is not counted
df['A'].count()

2

In [52]:
# Display the whole frame to see where the missing entries sit
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [69]:
# Boolean mask with the same shape as df: True wherever an entry is missing
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [109]:
df.isnull().sum()

# Summing the boolean mask gives the number of missing values in each column

A    1
B    2
C    0
dtype: int64

In [111]:
df.notnull() # Inverse of df.isnull(): True wherever a value is present

# Can also be used on a Series

Unnamed: 0,A,B,C
0,True,True,True
1,True,False,True
2,False,False,True


In [71]:
df.dropna()
# Default axis is 0 (rows).

# Drops every row that contains a missing value; a single NaN anywhere in the
# row is enough. The result is only displayed here; df is not reassigned.

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [72]:
df.dropna(axis=1)

# axis=1 drops every column that contains a missing value; only C is complete

Unnamed: 0,C
0,1
1,2
2,3


In [73]:
df.dropna(thresh=2)

# thresh is the minimum number of NON-missing values a row must have to be
# kept. Rows 0 and 1 have at least two real values and survive; row 2 has
# only one, so it is dropped.

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [74]:
# Replace every missing entry with a constant. The string makes the filled
# cells easy to spot, but A and B now mix floats and strings.
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [95]:
# Impute the gap in A with the column mean (1.5, computed over the non-missing values)
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [96]:
df.ffill()

# Forward fill: each missing entry takes the last valid value above it in the
# same column (the default axis=0 propagates down the rows, not across columns).
# df.ffill() is used instead of fillna(method='ffill') because the `method`
# argument of fillna is deprecated in recent pandas releases.

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,2.0,5.0,3
