In [1]:
# handling missing data such as null NaN or NA
import numpy as np
import pandas as pd

In [2]:
# pythonic missing data 
# the first sentinel value for missing data is None, a Python object; it can only be used in arrays with data type 'object'
vals1 = np.array([1, None , 3 , 4])
vals1

array([1, None, 3, 4], dtype=object)

In [3]:
# Time the same 100-element sum with the values stored as Python objects vs. native ints.
for dtype in ['object' , 'int']:
    print("dtype =", dtype)
    # %timeit is an IPython line magic, so this cell only runs inside IPython/Jupyter
    %timeit np.arange(100, dtype=dtype).sum()
    print()  # blank line between the two timing reports

dtype = object
5.39 µs ± 23.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

dtype = int
2.91 µs ± 69.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)



In [4]:
# with None in an object array, aggregations like sum() or min() raise a TypeError (e.g. vals1.sum()), because int + None is undefined

In [6]:
# NaN as missing value representation: NaN is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [8]:
# any operation with nan will be another nan , it is like a virus
1 + np.nan

nan

In [9]:
0 *  np.nan

nan

In [10]:
vals2.sum() , vals2.min() , vals2.max()

(nan, nan, nan)

In [11]:
# to ignore nan, using this 
np.nansum(vals2) , np.nanmax(vals2) , np.nanmin(vals2)

(8.0, 4.0, 1.0)

In [12]:
# NaN and None in pandas: pandas treats them interchangeably, converting None to NaN where appropriate
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [13]:
x = pd.Series(range(2), dtype=int)
x

0    0
1    1
dtype: int32

In [14]:
# if we set a value in an integer Series to a missing value (None or np.nan), the Series is automatically upcast to a floating-point type and the value is stored as NaN
x[0] = None
x

0    NaN
1    1.0
dtype: float64

In [15]:
# assigning None above cast the integer Series to floating point and converted None to NaN
# pandas upcasting conventions when NA values are introduced:
#   typeclass | conversion when storing NAs | NA sentinel value
#   floating  | no change                   | np.nan
#   object    | no change                   | None or np.nan
#   integer   | cast to float64             | np.nan
#   boolean   | cast to object              | None or np.nan

In [16]:
# operating on null values
# detecting , removing and replacing null value 
# isnull() , notnull() , dropna() , fillna()

In [17]:
# detecting null values
data = pd.Series([1, np.nan, 'hello', None])
data.isnull() # result is a series in this case

0    False
1     True
2    False
3     True
dtype: bool

In [18]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [19]:
data[data.isnull()]

1     NaN
3    None
dtype: object

In [20]:
# dropping null values
data.dropna()

0        1
2    hello
dtype: object

In [21]:
# A DataFrame offers more ways to drop nulls than a Series does; build a
# small 3x3 frame with one missing value in each of the first two columns.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [22]:
df.dropna() # drop all the rows containing null

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [24]:
df.dropna(axis = 1) # can use axis = 'columns' as well

Unnamed: 0,2
0,2
1,5
2,6


In [25]:
# the default (how = "any") drops every row or column that contains at least one null, which also discards good values;
# with how = "all" only rows or columns that are entirely null are dropped
df[3] = np.nan # df[key] addresses a column, so this adds a new all-NaN column labelled 3 (not a row)
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [28]:
df.dropna(axis = 'columns' , how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [32]:
# thresh lets you specify a minimum number of non-null values for the row/column to be kept
df.dropna(axis = 0 , thresh = 3) # the first and last rows are dropped because each has only 2 non-null values

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [33]:
# Filling null values: a float Series labelled a..e with gaps at 'b' and 'd'.
data = pd.Series(data=[1, np.nan, 2, None, 3], index=['a', 'b', 'c', 'd', 'e'])
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [34]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [35]:
# Forward-fill: propagate the last valid value forward into each gap.
# Series.ffill() replaces fillna(method="ffill"); the `method` argument of
# fillna is deprecated in pandas 2.1+.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [36]:
# Back-fill: propagate the next valid value backward into each gap.
# Series.bfill() replaces fillna(method="bfill"); the `method` argument of
# fillna is deprecated in pandas 2.1+.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [37]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [42]:
# axis = 1 fills across the columns of each row (values move left-to-right);
# DataFrame.ffill replaces the deprecated fillna(method="ffill")
df.ffill(axis = 1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [43]:
# axis = 0 fills down each column (values move top-to-bottom);
# DataFrame.ffill replaces the deprecated fillna(method="ffill")
df.ffill(axis = 0)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [15]:
# hierarchical indexing 
# multiple index levels within a single index 
import pandas as pd
import numpy as np

In [16]:
# Flat (state, year) tuple keys and the matching population counts, in the same order.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,   # California 2000, 2010
    18976457, 19378102,   # New York   2000, 2010
    20851820, 25145561,   # Texas      2000, 2010
]

In [17]:
pop = pd.Series(populations , index = index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [18]:
index = pd.MultiIndex.from_tuples(index)
index # older pandas printed: MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
      #                                   labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])
      # newer pandas calls `labels` `codes` and prints the index as a list of tuples instead

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [19]:
pop = pop.reindex(index)
pop # first 2 cols show the multiple index, third col shows the data

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [20]:
pop [: , 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [21]:
# unstack converts a multiply-indexed Series into a conventionally indexed DataFrame
pop_df = pop.unstack()
pop_df # stack provides the opposite operation

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [22]:
# with multi index it is easy to add another col
pop_df = pd.DataFrame({'total' : pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [23]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [24]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [26]:
# Creating a MultiIndex implicitly: passing a list of two or more index arrays
# as `index` makes pandas build one index level per array.
# A seeded Generator keeps the random values identical on every Restart & Run All.
df = pd.DataFrame(
    np.random.default_rng(0).random((4, 2)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.978956,0.106597
a,2,0.346964,0.362333
b,1,0.869082,0.497684
b,2,0.736745,0.179


In [27]:
# pass a dict with appropriate tuples as keys; pandas will use a MultiIndex by default
data = {('California', 2000): 33871648,
                ('California', 2010): 37253956,
                ('Texas', 2000): 20851820,
                ('Texas', 2010): 25145561,
                ('New York', 2000): 18976457,
                ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [28]:
# explicit multiindex constructors
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [29]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [30]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [34]:
# levels: a list of lists containing the available index values for each level; codes (called labels in older pandas): a list of lists of integer positions that reference these levels

In [35]:
# multi-index level names
pop.index.names = ['state' ,'year']

In [36]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [39]:
# multi index for cols
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

In [40]:
# Simulated readings: standard-normal noise rounded to one decimal place.
# A seeded Generator makes the values identical on every Restart & Run All.
data = np.round(np.random.default_rng(0).standard_normal((4, 6)), 1)
data

array([[-1.4,  1.1,  1.5, -0.4, -1.2,  0.7],
       [ 0.1, -0.7,  1.1, -0.6, -0.3, -1. ],
       [-1. , -1.2, -1.1, -0.6,  1.3, -0.6],
       [-0.8,  0.6, -1.1, -1.4, -0.1, -0.7]])

In [42]:
data[:,::2] *= 10

In [43]:
data

array([[-14. ,   1.1,  15. ,  -0.4, -12. ,   0.7],
       [  1. ,  -0.7,  11. ,  -0.6,  -3. ,  -1. ],
       [-10. ,  -1.2, -11. ,  -0.6,  13. ,  -0.6],
       [ -8. ,   0.6, -11. ,  -1.4,  -1. ,  -0.7]])

In [44]:
data += 37

In [45]:
data

array([[23. , 38.1, 52. , 36.6, 25. , 37.7],
       [38. , 36.3, 48. , 36.4, 34. , 36. ],
       [27. , 35.8, 26. , 36.4, 50. , 36.4],
       [29. , 37.6, 26. , 35.6, 36. , 36.3]])

In [46]:
health_data = pd.DataFrame(data, index = index, columns = columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,23.0,38.1,52.0,36.6,25.0,37.7
2013,2,38.0,36.3,48.0,36.4,34.0,36.0
2014,1,27.0,35.8,26.0,36.4,50.0,36.4
2014,2,29.0,37.6,26.0,35.6,36.0,36.3


In [47]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,52.0,36.6
2013,2,48.0,36.4
2014,1,26.0,36.4
2014,2,26.0,35.6


In [48]:
# Indexing and Slicing a MultiIndex
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [50]:
pop['California',2000]

33871648

In [51]:
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [52]:
pop.loc['California' : 'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [53]:
pop[:, 2000] # partial indexing on lower level by passing an empty slice in the first index

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [54]:
pop[pop > 22000000] # boolean masks

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [55]:
pop[['California', 'Texas']] # fancy indexing

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [59]:
# MULTIPLY INDEXED DATAFRAMES
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,23.0,38.1,52.0,36.6,25.0,37.7
2013,2,38.0,36.3,48.0,36.4,34.0,36.0
2014,1,27.0,35.8,26.0,36.4,50.0,36.4
2014,2,29.0,37.6,26.0,35.6,36.0,36.3


In [60]:
health_data['Guido', 'HR']

year  visit
2013  1        52.0
      2        48.0
2014  1        26.0
      2        26.0
Name: (Guido, HR), dtype: float64

In [69]:
health_data.iloc[2:3,2:3]

Unnamed: 0_level_0,subject,Guido
Unnamed: 0_level_1,type,HR
year,visit,Unnamed: 2_level_2
2014,1,26.0


In [70]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        23.0
      2        38.0
2014  1        27.0
      2        29.0
Name: (Bob, HR), dtype: float64

In [71]:
idx = pd.IndexSlice
idx

<pandas.core.indexing._IndexSlice at 0x1ef73821e88>

In [72]:
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,23.0,52.0,25.0
2014,1,27.0,26.0,50.0


In [75]:
# Rearranging multi-indices: slicing operations need a sorted index, so start
# from one whose first level is deliberately out of order (a, c, b).
# Level names are passed to the constructor rather than mutating the index afterwards,
# and a seeded Generator keeps the values identical on every Restart & Run All.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]], names=['char', 'int'])
data = pd.Series(np.random.default_rng(0).random(6), index=index)
data

char  int
a     1      0.136797
      2      0.663657
c     1      0.248256
      2      0.005100
b     1      0.456784
      2      0.255854
dtype: float64

In [76]:
data = data.sort_index() # sort the index labels so 'b' comes before 'c' (sortlevel is the older name for this operation)
data

char  int
a     1      0.136797
      2      0.663657
b     1      0.456784
      2      0.255854
c     1      0.248256
      2      0.005100
dtype: float64

In [77]:
data['a':'b']

char  int
a     1      0.136797
      2      0.663657
b     1      0.456784
      2      0.255854
dtype: float64

In [78]:
# STACKING AND UNSTACKING INDICES
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [79]:
pop.unstack(level = 0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [80]:
pop.unstack(level = 1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [82]:
pop.unstack().stack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [85]:
# INDEX SETTING AND RESETTING
pop_flat = pop.reset_index(name = 'population')
pop_flat # turn the index labels into columns

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [86]:
pop_flat.set_index(['state', 'year']) # build a multi-index from the column values 

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [97]:
# Data Aggregations on Multi-Indices
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,23.0,38.1,52.0,36.6,25.0,37.7
2013,2,38.0,36.3,48.0,36.4,34.0,36.0
2014,1,27.0,35.8,26.0,36.4,50.0,36.4
2014,2,29.0,37.6,26.0,35.6,36.0,36.3


In [100]:
# Aggregate over one index level: average the visits within each year.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent.
data_mean = health_data.groupby(level = 'year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,30.5,37.2,50.0,36.5,29.5,36.85
2014,28.0,36.7,26.0,36.0,43.0,36.35


In [101]:
# Average across subjects for each measurement type (a column level).
# mean(axis=1, level=...) was removed in pandas 2.0 and groupby(axis=1) is deprecated,
# so group the transposed frame on the 'type' level and transpose back.
data_mean = health_data.T.groupby(level = 'type').mean().T
data_mean

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,33.333333,37.466667
2013,2,40.0,36.233333
2014,1,34.333333,36.2
2014,2,30.333333,36.5
