# Missing Data & Index Hierarchy

In [1]:
import numpy as np
import pandas as pd

In [5]:
# build a small Series of strings in which one entry is missing (NaN)
ser1 = pd.Series(
    data=['one', 'two', np.nan, 'four'],
)
ser1

0     one
1     two
2     NaN
3    four
dtype: object

In [6]:
# boolean mask: True wherever the entry is missing
# (isna is the method that isnull aliases)
ser1.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [9]:
# return a new Series holding only the non-null values;
# the surviving entries keep their original index labels
ser1.dropna()

0     one
1     two
3    four
dtype: object

In [12]:
# build a 3x3 DataFrame: one complete row, one partially missing row,
# and one row that is missing entirely
df1 = pd.DataFrame(
    data=[
        [1, 2, 3],
        [np.nan, 5, 6],
        [np.nan, np.nan, np.nan],
    ]
)
df1

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,,,


In [15]:
# keep only the rows that contain no null values at all
# (axis=0 and how='any' are the defaults, spelled out here for clarity)
df1_clean = df1.dropna(axis=0, how='any')
df1_clean

Unnamed: 0,0,1,2
0,1,2,3


In pandas, when you call .dropna() on a DataFrame, any row with one or more null values is dropped. We can, however, pass how = 'all' to drop only the rows that are missing all data.

In [18]:
# discard only those rows in which every value is null
df1_clean2 = df1.dropna(axis=0, how='all')
df1_clean2

Unnamed: 0,0,1,2
0,1.0,2,3
1,,5,6


In [19]:
# work column-wise: discard every column that contains a null value
df1.dropna(axis='columns')

0
1
2


This makes sense, because every column has at least one null value, so every column got dropped. 

In [20]:
# define a short alias for NaN so that tables containing many
# missing values are quicker to type and easier to read
nil = np.nan

In [23]:
# build a 4x4 DataFrame whose rows hold 3, 3, 2 and 1 non-null values
df2 = pd.DataFrame(
    data=[
        [1, 2, 3, nil],
        [2, nil, 5, 6],
        [nil, 7, nil, 9],
        [1, nil, nil, nil],
    ]
)
df2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [24]:
# retain a row only if it holds two or more non-null values
df2.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [25]:
# retain a row only if it holds three or more non-null values
df2.dropna(axis=0, thresh=3)

Unnamed: 0,0,1,2,3
0,1,2.0,3,
1,2,,5,6.0


In [26]:
# alternatively, replace every null value with the constant 1
df2.fillna(value=1)

Unnamed: 0,0,1,2,3
0,1,2,3,1
1,2,1,5,6
2,1,7,1,9
3,1,1,1,1


In [29]:
# a dictionary maps each column label to its own replacement value
df2.fillna(
    value={
        0: '?',
        1: ' ',
        2: 'Null',
        3: 'Nil',
    }
)

Unnamed: 0,0,1,2,3
0,1,2.0,3,Nil
1,2,,5,6
2,?,7.0,Null,9
3,1,,Null,Nil


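Both .dropna() and .fillna() return a new object and leave the original DataFrame unchanged, so the results displayed above are temporary. If you want to change the actual DataFrame permanently, assign the result back to a variable, or use the inplace = True keyword argument.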

### Index Hierarchy

In [32]:
# build a Series with a two-level index: the outer level is 1/2 and the
# inner level is a/b/c
# NOTE: the values are random and no seed is set, so the numbers differ
# on every run
ser2 = pd.Series(
    np.random.rand(6),
    index=pd.MultiIndex.from_arrays(
        [
            [1, 1, 1, 2, 2, 2],
            list('abcabc'),
        ]
    ),
)
ser2

1  a    0.175758
   b    0.360435
   c    0.029764
2  a    0.955344
   b    0.823548
   c    0.090821
dtype: float64

This creates a hierarchical (multi-level) index. The two lists are matched up element by element: the first list supplies the outer index level and the second list supplies the inner level. In this case, 1 is paired with the first three values a, b, c, and 2 is paired with the next three.

We can see the multiple index using .index.

In [34]:
# inspect the MultiIndex object that backs ser2
ser2.index

MultiIndex(levels=[[1, 2], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [36]:
# select everything under the outer-level label 1
# (label-based lookup; the result is indexed by the inner level only)
ser2.loc[1]

a    0.175758
b    0.360435
c    0.029764
dtype: float64

In [40]:
# select by the inner-level label 'a':
# the colon keeps every label of the outer level
ser2.loc[:, 'a']

1    0.175758
2    0.955344
dtype: float64

In [41]:
# pivot the innermost index level (level=-1, the default) into columns,
# turning the two-level Series into a DataFrame
ser2.unstack(level=-1)

Unnamed: 0,a,b,c
1,0.175758,0.360435,0.029764
2,0.955344,0.823548,0.090821


In [51]:
# build a 4x4 DataFrame of the integers 0..15 that has two index levels
# on the rows and two on the columns
df3 = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=[
        ['a', 'a', 'b', 'b'],
        [1, 2, 1, 2],
    ],
    columns=[
        ['NY', 'NY', 'LA', 'SF'],
        ['cold', 'hot', 'hot', 'cold'],
    ],
)
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [52]:
# attach names to the row levels and to the column levels of df3
df3.index = df3.index.set_names(['Index_1', 'Index_2'])
df3.columns = df3.columns.set_names(['Cities', 'Temp'])
df3

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
Index_1,Index_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [53]:
# exchange the two column levels;
# axis='columns' is required because swaplevel acts on the row index by default
df3.swaplevel(i='Cities', j='Temp', axis='columns')

Unnamed: 0_level_0,Temp,cold,hot,hot,cold
Unnamed: 0_level_1,Cities,NY,NY,LA,SF
Index_1,Index_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [55]:
# sort the rows by the 'Index_2' level of the row MultiIndex
# (DataFrame.sortlevel was deprecated and later removed from pandas;
#  sort_index(level = ...) is the supported way to sort on one index level)
df3.sort_index(level = 'Index_2')
# df3.sort_index(level = 1) would do the same thing

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
Index_1,Index_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
b,1,8,9,10,11
a,2,4,5,6,7
b,2,12,13,14,15


In [59]:
# aggregate over one level of the column MultiIndex: add up all columns
# that share the same 'Temp' label.
# The `level` argument of DataFrame.sum was deprecated and then removed in
# pandas 2.0, so we group on the level instead. Transposing first lets us
# group the rows by level and avoids the deprecated groupby(axis = 1).
df3.T.groupby(level = 'Temp').sum().T

Unnamed: 0_level_0,Temp,cold,hot
Index_1,Index_2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3,3
a,2,11,11
b,1,19,19
b,2,27,27
