In [1]:
import pandas as pd
import io
import math
import numpy as np

## Strategies: Missing Data Indicators

To indicate the presence of missing data in a table or DataFrame:

- Using a mask that globally indicates missing values as True/False Boolean arrays:
    - Requires additional memory to store a mask Boolean array
- Choosing a sentinel value that indicates a missing entry, such as:
    - 9999 as a missing integer value
    - NaN (Not a Number) as IEEE floating-point special value
        - NaN are not available for all data types

## Taxonomy: Missing Value Representations

Missing values can be represented in a number of ways:

- NA: Not Available
- NaN: Not a Number:
- None: Pythonic missing data, a Python object that can be used in arrays with data type object
- inf: positive infinity
- -inf: negative infinity

## None as a Missing Value
- None is a Python object
- It cannot be used in any arbitrary NumPy or Pandas array but only in arrays with data type dtype = object
- By default, Pandas will convert None values to NaN
- Calculations (e.g., sum, min, max) performed on arrays that contain None elements result in TypeError message

In [4]:
1 + None

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [5]:
array_none = np.array([0, None, 1, 2])

In [6]:
array_none

array([0, None, 1, 2], dtype=object)

In [7]:
array_none.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

# NaN as a Missing Value
- NaN is a special floating-point value in IEEE floating-point representation
- It acts like a data virus: the result of arithmetic with NaN is NaN

In [8]:
1 + np.nan

nan

In [9]:
0 * np.nan

nan

In [10]:
vec = np.array([0, np.nan, 1])

In [11]:
vec.sum()

nan

In [12]:
vec.min()

  return umr_minimum(a, axis, None, out, keepdims, initial)


nan

In [13]:
vec.max()

  return umr_maximum(a, axis, None, out, keepdims, initial)


nan

#### Special functions that ignore NaN


In [16]:
np.nanmax(vec)

1.0

In [17]:
np.nanmin(vec)

0.0

In [18]:
np.nansum(vec)

1.0

## NA (Not Available) as a Missing Value
- If Pandas encounters a Not Available value, then it treats it as NaN
- However, NumPy Arrays can NOT handle NA values; they can handle NaN values


In [19]:
data_pd_na = '''Gender|Age|Weight
    M | 22 | 72.0
    F | 29 | 55.0
    M | 24 |
    F || 57.0
'''
df_na = pd.read_table (io.StringIO(data_pd_na), sep = '|')
df_na

Unnamed: 0,Gender,Age,Weight
0,M,22.0,72.0
1,F,29.0,55.0
2,M,24.0,
3,F,,57.0


In [20]:
df_na.min()

Gender        F 
Age           22
Weight        55
dtype: object

#### NumPy cannot handle NA values

In [21]:
d1 = '''M  22  72.0
        F  29  55.0
        M  24  78.0
        F  25  57.0
'''
data_np = io.StringIO(d1)

np.loadtxt(data_np, dtype={'names': ('Gender', 'Age', 'Color'), 
                     'formats': ('S1', 'i4', 'f4')})

array([(b'M', 22, 72.), (b'F', 29, 55.), (b'M', 24, 78.), (b'F', 25, 57.)],
      dtype=[('Gender', 'S1'), ('Age', '<i4'), ('Color', '<f4')])

In [22]:
d2 = '''M  22  72.0
        F  29  
        M  24  78.0
        F  25  57.0
'''
data_np_na = io.StringIO(d2)

# Should produce a ValueError
np.loadtxt(data_np_na, 
           dtype={'names': ('Gender', 'Age', 'Color'), 
                  'formats': ('S1', 'i4', 'f4')})

ValueError: Wrong number of columns at line 2

#### Missing value as NaN

In [24]:
d3 = '''M  22  72.0
        F  29  NaN
        M  24  78.0
        F  25  57.0
'''

data_np_nan = io.StringIO(d3)

In [25]:
np_array_nan = np.loadtxt(data_np_nan,
                         dtype = {'names':('Gender','Age','temp'),
                                 'formats':('S1','i4','f4')},)

In [26]:
np_array_nan

array([(b'M', 22, 72.), (b'F', 29, nan), (b'M', 24, 78.), (b'F', 25, 57.)],
      dtype=[('Gender', 'S1'), ('Age', '<i4'), ('temp', '<f4')])

In [29]:
np.reshape(np_array_nan, (4,1))

array([[(b'M', 22, 72.)],
       [(b'F', 29, nan)],
       [(b'M', 24, 78.)],
       [(b'F', 25, 57.)]],
      dtype=[('Gender', 'S1'), ('Age', '<i4'), ('temp', '<f4')])

In [33]:
d3 = '''1  22  1
        1  29  NaN
        1  24  1
        1  25  1
'''
data_np_nan = io.StringIO(d3)

# state explicitly to treat nan as floating point
np_array_nan = np.loadtxt(data_np_nan, dtype=np.float)

print (np_array_nan)
print (np_array_nan.shape)

[[ 1. 22.  1.]
 [ 1. 29. nan]
 [ 1. 24.  1.]
 [ 1. 25.  1.]]
(4, 3)


In [34]:
# should fail if try to treat nan as int:
d4 = '''1  22  1
        1  29  NaN
        1  24  1
        1  25  1
'''
data_np_nan = io.StringIO(d4)

np_array_nan_int = np.loadtxt(data_np_nan, dtype=np.int)

ValueError: invalid literal for int() with base 10: 'NaN'

## Coercion / Upcasting in Pandas' Handling of NaNs
In Pandas, when NaN value is introduced then the following upcasting conventions are followed dependning on the Type:

- float converts to np.nan (no change)
- object converts to None or np.nan (no change)
- int converts to np.nan (Cast to float64)
- bool converts to None or np.nan (Cast to object)

# Positive and Negative Infinity
To represent positive and/or negative infinity, a couple ways exist:

- float("inf") or float("-inf")
- np.inf, np.PINF, or -np.inf, np.NINF

In [35]:
float('inf')

inf

In [36]:
float('-inf')

-inf

In [37]:
np.inf

inf

In [39]:
-np.inf

-inf

In [40]:
np.PINF

inf

In [41]:
np.NINF

-inf

### To check if the number is infinite, one can use:

- the .isinf() function from the math module or
- np.isinf(), np.isposinf(), np.isneginf(), or np.isfinite() functions from the NumPy module
- pd.DataFrame.isin(): Pandas .isin() method: e.g., df.isin ([np.nan, np.inf, -np.inf])
- Use pd.DataFrame.isin() and check for rows that have any with  pd.DataFrame.any.
- Finally, use the boolean array to slice the DataFrame.

In [42]:
p_inf = np.PINF

In [43]:
n_inf = np.NINF

In [44]:
math.isinf(p_inf)

True

In [49]:
math.isinf(n_inf)

True

In [50]:
np.isposinf(p_inf)

True

In [51]:
np.isneginf(n_inf)

True

In [52]:
np.isinf(p_inf)

True

In [53]:
np.isinf(n_inf)

True

In [54]:
data_pd_na = '''Gender|Age|Weight
    M | 22 | 72.0
    F || 55.0
    M | 24 |-inf
    F |inf| 57.0
'''

In [55]:
df = pd.read_table(io.StringIO(data_pd_na), sep = '|')
df

Unnamed: 0,Gender,Age,Weight
0,M,22.0,72.0
1,F,,55.0
2,M,24.0,-inf
3,F,inf,57.0


In [57]:
df.isin ([np.inf, -np.inf])

Unnamed: 0,Gender,Age,Weight
0,False,False,False
1,False,False,False
2,False,False,True
3,False,True,False


#### Remove rows with inf

In [59]:
df [ ~df.isin([np.inf, -np.inf]).any(1) ]

Unnamed: 0,Gender,Age,Weight
0,M,22.0,72.0
1,F,,55.0


### Handling Infinity in pandas
- If you want to consider inf and -inf to be NaN in computations, you can also set:

In [60]:
pd.set_option('mode.use_inf_as_null', False)


: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [62]:
pd.get_option('mode.use_inf_as_null')


: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



False

In [63]:
data_pd_inf = '''Gender|Age|Weight
    M | 22 | 72.0
    F || 55.0
    M | 24 |-inf
    F |inf| 57.0
'''
df_inf = pd.read_table (io.StringIO(data_pd_inf), 
                        sep = '|')
df_inf

Unnamed: 0,Gender,Age,Weight
0,M,22.0,72.0
1,F,,55.0
2,M,24.0,-inf
3,F,inf,57.0


In [64]:
pd.set_option ('mode.use_inf_as_null', True)
pd.get_option ('mode.use_inf_as_null')


: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.


: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



True

In [65]:
# treat inf as NaN
data_pd_inf = '''Gender|Age|Weight
    M | 22 | 72.0
    F || 55.0
    M | 24 |-inf
    F |inf| 57.0
'''
df_nan = pd.read_table (io.StringIO(data_pd_inf), 
                        sep = '|')
df_nan

Unnamed: 0,Gender,Age,Weight
0,M,22.0,72.0
1,F,,55.0
2,M,24.0,
3,F,,57.0


# Steps: Dealing with Missing Data
1. Identify the missing data
2. Examine the causes of the missing data (often times, it is a statistical question, and hence, it is skipped in this lecture)
3. Handle missing data:
    - Delete the cases containing missing data or
    - Replace (impute) the missing values with reasonable alternative values (it is often times a statistical question, and hence, it is omitted in this lecture)

## Step 1  : Identify missing data

To identify missing data, Python and its packages provide a number of functions/methods (detailed below) to test whether any of those incomplete values are present in the data :

- math package:
    - Function: math.isinf()
- NumPy package:
    - Function: np.isneginf(), np.isposinf(), np.isnan(), np.isfinite()
- Pandas package:
    - Methods: df.isnull(), df.notnull(), df.isin([ np.nan, np.inf, -np.inf ])

In [66]:
x = '''Gender|Age|Weight
    M | 22 | 72.0
    F | 29 | 55.0
    M | 24 |
    F || 57.0
'''
df_na = pd.read_table (io.StringIO(x), sep = '|')
df_na.isnull()

Unnamed: 0,Gender,Age,Weight
0,False,False,False
1,False,False,False
2,False,False,True
3,False,True,False


## Step-3: Handle Missing Data
To handle missing data, Pandas provides a number of basic (non-statistical) methods:

- df.dropna(axis = 'columns', how = 'any'): Return a filtered version of the data, namely drop full rows (axis=0) or full columns (axis=1) or with how='any' or how='all' have a more fine grained control over the values to drop (read documentation)
- df.fillna(): Return a copy of the data with missing values filled or imputed to propagate the previous value forward (method='ffill'), a backfill to propagate the next values backward (method='bfill') or impute with specific values, as illustrated:
- df.fillna('missing')
- df.fillna({'Age': df.Age.median(), 'Weight': 0.0})
- df.replace(np.nan, value = -1)

In [76]:
data_pd_na = '''Gender|Age|Weight
    M | 22 | 72.0
    F || 55.0
    M | 24 |-inf
    F |inf| 57.0
'''
df = pd.read_table (io.StringIO(data_pd_na), sep = '|')
df


Unnamed: 0,Gender,Age,Weight
0,M,22.0,72.0
1,F,,55.0
2,M,24.0,
3,F,,57.0


In [73]:
df.dropna(axis = 1)

Unnamed: 0,Gender
0,M
1,F
2,M
3,F


In [75]:
df.fillna('missing_value')

Unnamed: 0,Gender,Age,Weight
0,M,22,72
1,F,missing_value,55
2,M,24,missing_value
3,F,missing_value,57


In [78]:
df.replace(np.nan, value = -1)

Unnamed: 0,Gender,Age,Weight
0,M,22.0,72.0
1,F,-1.0,55.0
2,M,24.0,-1.0
3,F,-1.0,57.0


# Use case: Sleep dataset with missing values

In [108]:
df = pd.read_csv('data_raw/sleep.csv', sep = ';')

In [82]:
df.shape

(62, 11)

In [83]:
df.columns

Index(['Unnamed: 0', 'BodyWgt', 'BrainWgt', 'NonD', 'Dream', 'Sleep', 'Span',
       'Gest', 'Pred', 'Exp', 'Danger'],
      dtype='object')

In [84]:
df.index

RangeIndex(start=0, stop=62, step=1)

In [85]:
df.head()

Unnamed: 0.1,Unnamed: 0,BodyWgt,BrainWgt,NonD,Dream,Sleep,Span,Gest,Pred,Exp,Danger
0,1,6654,5712,,,33,386.0,645,3,5,3
1,2,1,66,63.0,2.0,83,45.0,42,3,1,3
2,3,3385,445,,,125,14.0,60,1,1,1
3,4,92,57,,,165,,25,5,2,3
4,5,2547,4603,21.0,18.0,39,69.0,624,3,5,4


In [86]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Pred,Exp,Danger
count,62.0,62.0,62.0,62.0
mean,31.5,2.870968,2.419355,2.612903
std,18.041619,1.476414,1.604792,1.441252
min,1.0,1.0,1.0,1.0
25%,16.25,2.0,1.0,1.0
50%,31.5,3.0,2.0,2.0
75%,46.75,4.0,4.0,4.0
max,62.0,5.0,5.0,5.0


#### NOTE: Summary does not report the number of missing values

In [87]:
df.isin([np.nan, np.inf, -np.inf]).head()

Unnamed: 0.1,Unnamed: 0,BodyWgt,BrainWgt,NonD,Dream,Sleep,Span,Gest,Pred,Exp,Danger
0,False,False,False,True,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,True,False,False,False,False,False,False
3,False,False,False,True,True,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False


### Report the Count of Missing Values
To report the count of non-missing values, use the .info() method in Pandas:

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 11 columns):
Unnamed: 0    62 non-null int64
BodyWgt       62 non-null object
BrainWgt      62 non-null object
NonD          48 non-null object
Dream         50 non-null object
Sleep         58 non-null object
Span          58 non-null object
Gest          58 non-null object
Pred          62 non-null int64
Exp           62 non-null int64
Danger        62 non-null int64
dtypes: int64(4), object(7)
memory usage: 5.4+ KB


#### Get the count of missing values

In [91]:
df.isnull().sum()

Unnamed: 0     0
BodyWgt        0
BrainWgt       0
NonD          14
Dream         12
Sleep          4
Span           4
Gest           4
Pred           0
Exp            0
Danger         0
dtype: int64

In [92]:
df.count()

Unnamed: 0    62
BodyWgt       62
BrainWgt      62
NonD          48
Dream         50
Sleep         58
Span          58
Gest          58
Pred          62
Exp           62
Danger        62
dtype: int64

In [95]:
len(df.index)- df.count()

Unnamed: 0     0
BodyWgt        0
BrainWgt       0
NonD          14
Dream         12
Sleep          4
Span           4
Gest           4
Pred           0
Exp            0
Danger         0
dtype: int64

#### To get percentage of the missing values by DataFrame column:

In [99]:
df.isnull().sum()/len(df) *100

Unnamed: 0     0.000000
BodyWgt        0.000000
BrainWgt       0.000000
NonD          22.580645
Dream         19.354839
Sleep          6.451613
Span           6.451613
Gest           6.451613
Pred           0.000000
Exp            0.000000
Danger         0.000000
dtype: float64

#### To check the overall count of the missing values:

In [102]:
df.isnull().sum().sum()

38

## Remove Rows with Missing Values
There are multiple ways to remove rows in the data frame that have any type of missingness:

- positive or negative infinity or
- NaN


#### Approach 1: To check for rows that have any type of the missing values in the Pandas DataFrame and use the mask (Boolean) array to filter them out:

- Use pd.DataFrame.isin([np.nan, np.inf, -np.inf]) method
- Check for rows that have any with pd.DataFrame.any of the missing value types
- Finally, use the boolean array to slice the DataFrame.

In [103]:
df [ ~df.isin([np.nan, np.inf, -np.inf]).any(1) ].head(5)

Unnamed: 0.1,Unnamed: 0,BodyWgt,BrainWgt,NonD,Dream,Sleep,Span,Gest,Pred,Exp,Danger
1,2,1,66,63,2,83,45,42,3,1,3
4,5,2547,4603,21,18,39,69,624,3,5,4
5,6,1055,1795,91,7,98,27,180,4,4,4
6,7,23,3,158,39,197,19,35,1,1,1
7,8,160,169,52,1,62,304,392,4,5,4


#### Approach 2: Replace inf and -inf with NaN, and then select non-null rows:

In [106]:
df [ df.replace([np.inf, -np.inf], np.nan).notnull().all(axis=1) ].head(5)

Unnamed: 0.1,Unnamed: 0,BodyWgt,BrainWgt,NonD,Dream,Sleep,Span,Gest,Pred,Exp,Danger
1,2,1,66,63,2,83,45,42,3,1,3
4,5,2547,4603,21,18,39,69,624,3,5,4
5,6,1055,1795,91,7,98,27,180,4,4,4
6,7,23,3,158,39,197,19,35,1,1,1
7,8,160,169,52,1,62,304,392,4,5,4


#### Approach 3: Replace inf and -inf with NaN, and then drop NaN's with .dropna() method:

In [109]:
df.replace([np.inf, -np.inf], np.nan).dropna(axis=1).head(5)


Unnamed: 0.1,Unnamed: 0,BodyWgt,BrainWgt,Pred,Exp,Danger
0,1,6654,5712,3,5,3
1,2,1,66,3,1,3
2,3,3385,445,1,1,1
3,4,92,57,5,2,3
4,5,2547,4603,3,5,4
