Missing Data
1. Identifying missing Data
2. Dropping missing Data
3. Filling in missing Data
4. Pandas data functions automatically ignore missing values

In [1]:
import pandas as pd
import numpy as np

# Ticker labels: used as the Series index and as the DataFrame columns.
columns = ['AAPL', 'MSFT', 'TSLA', 'LULU']
# Date-like string labels for the DataFrame rows.
index = ['20201201', '20201202', '20201203', '20201204']

# One value per ticker; TSLA is deliberately missing (NaN).
ser = pd.Series([0.01, 0.02, np.nan, 0.05], index=columns)

# One row per date, one column per ticker.
# AAPL is NaN on every row, and row 20201203 is NaN in every column,
# which makes the dropna / fillna examples below behave differently.
data = [
    [np.nan, 0.03,   0.05,   0.005],
    [np.nan, np.nan, -0.05,  -0.0025],
    [np.nan, np.nan, np.nan, np.nan],
    [np.nan, 0.015,  0.03,   0.01],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201202,,,-0.05,-0.0025
20201203,,,,
20201204,,0.015,0.03,0.01


Identifying missing data

In [2]:
# Missing data is represented by NumPy's floating-point NaN (np.nan).
# Bind it to a short name so the isnull checks below can use it directly.
nan=np.nan
nan

nan

In [3]:
pd.isnull(nan)

True

In [4]:
pd.isnull(0)

False

In [5]:
ser

AAPL    0.01
MSFT    0.02
TSLA     NaN
LULU    0.05
dtype: float64

In [6]:
ser.isnull()

AAPL    False
MSFT    False
TSLA     True
LULU    False
dtype: bool

In [7]:
ser.notnull()

AAPL     True
MSFT     True
TSLA    False
LULU     True
dtype: bool

In [8]:
ser.count()

np.int64(3)

In [9]:
df.isnull()

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,True,False,False,False
20201202,True,True,False,False
20201203,True,True,True,True
20201204,True,False,False,False


Dropping missing Data

In [10]:
ser[ser.notnull()]

AAPL    0.01
MSFT    0.02
LULU    0.05
dtype: float64

In [11]:
ser.dropna()

AAPL    0.01
MSFT    0.02
LULU    0.05
dtype: float64

In [12]:
# By default dropna drops every row that contains at least one NaN.
# AAPL is NaN in every row of df, so no row survives and the result is empty.
# dropna returns a new DataFrame; df itself is left unchanged.
df.dropna()

Unnamed: 0,AAPL,MSFT,TSLA,LULU


In [13]:
df

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201202,,,-0.05,-0.0025
20201203,,,,
20201204,,0.015,0.03,0.01


In [14]:
# how='all' drops a row only when every value in it is NaN,
# so only 20201203 (NaN in all four columns) is removed.
df.dropna(how='all')

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201202,,,-0.05,-0.0025
20201204,,0.015,0.03,0.01


In [15]:
# thresh=3 keeps only rows that have at least 3 non-NaN values.
# 20201202 (2 non-NaN) and 20201203 (0 non-NaN) are dropped.
df.dropna(thresh=3)

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201204,,0.015,0.03,0.01


In [16]:
# axis=1 applies the same rule to columns: keep columns with at least
# 3 non-NaN values. AAPL (0 non-NaN) and MSFT (2 non-NaN) are dropped.
df.dropna(axis=1, thresh=3)

Unnamed: 0,TSLA,LULU
20201201,0.05,0.005
20201202,-0.05,-0.0025
20201203,,
20201204,0.03,0.01


Filling Missing Data

In [17]:
ser

AAPL    0.01
MSFT    0.02
TSLA     NaN
LULU    0.05
dtype: float64

In [19]:
ser.fillna(0)

AAPL    0.01
MSFT    0.02
TSLA    0.00
LULU    0.05
dtype: float64

In [20]:
df

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201202,,,-0.05,-0.0025
20201203,,,,
20201204,,0.015,0.03,0.01


In [21]:
df.fillna(0)

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,0.0,0.03,0.05,0.005
20201202,0.0,0.0,-0.05,-0.0025
20201203,0.0,0.0,0.0,0.0
20201204,0.0,0.015,0.03,0.01


In [22]:
# A dict maps each column label to its own fill value; here every column
# happens to use the same value (0.08), but they could differ per column.
df.fillna({'AAPL':0.08,'MSFT':0.08,'TSLA':0.08,'LULU':0.08})

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,0.08,0.03,0.05,0.005
20201202,0.08,0.08,-0.05,-0.0025
20201203,0.08,0.08,0.08,0.08
20201204,0.08,0.015,0.03,0.01


In [24]:
# Column-wise mean, computed over the non-NaN values of each column.
# AAPL has no non-NaN values at all, so its mean is itself NaN.
avg = df.mean()
avg

AAPL         NaN
MSFT    0.022500
TSLA    0.010000
LULU    0.004167
dtype: float64

In [25]:
# Passing a Series fills each column with the value at the matching label,
# i.e. each column's NaNs are replaced by that column's mean.
# AAPL stays NaN because its mean (the fill value) is NaN.
df.fillna(avg)

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201202,,0.0225,-0.05,-0.0025
20201203,,0.0225,0.01,0.004167
20201204,,0.015,0.03,0.01


In [26]:
# Forward-fill: propagate the last valid value down each column, filling at
# most one consecutive NaN (limit=1). That is why MSFT is filled on 20201202
# but remains NaN on 20201203, and AAPL (no valid value to carry) stays NaN.
# DataFrame.ffill replaces fillna(method='ffill'), whose `method` keyword is
# deprecated and emits a FutureWarning.
df.ffill(limit=1)

  df.fillna(method='ffill', limit = 1)


Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201202,,0.03,-0.05,-0.0025
20201203,,,-0.05,-0.0025
20201204,,0.015,0.03,0.01


Pandas Functions Automatically Exclude NaN

In [27]:
df.mean()

AAPL         NaN
MSFT    0.022500
TSLA    0.010000
LULU    0.004167
dtype: float64

In [28]:
# Reproduce df.mean() by hand to show that it only uses non-NaN values:
# fillna(0).sum() adds up the non-NaN entries (NaNs contribute 0) and
# count() is the number of non-NaN entries per column.
# AAPL is 0/0, which gives NaN, matching df.mean().
df.fillna(0).sum()/df.count()

AAPL         NaN
MSFT    0.022500
TSLA    0.010000
LULU    0.004167
dtype: float64

In [29]:
# Contrast with plain NumPy: np.mean does not skip NaN, so any row that
# contains a NaN yields NaN. Every row has a NaN in AAPL, hence all four
# row means are NaN.
np.mean(df.to_numpy(), axis=1)

array([nan, nan, nan, nan])

In [30]:
# Rank values within each column (1.0 = smallest). NaN entries are not
# ranked and stay NaN in the result.
df.rank()

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,2.0,3.0,2.0
20201202,,,1.0,1.0
20201203,,,,
20201204,,1.0,2.0,3.0
