Because storage is cheap, data is often collected greedily, so your first step with pandas is usually to cut the data down to a dataset that is appropriate for your use.

In [10]:
import pandas as pd
import numpy as np

# Dropping useless data with indices


In [5]:
# Creating a MultiIndex (hierarchical index) object

In [8]:
# Enumerate every (first, second) label pair, with the outer label varying
# slowest, then turn the pairs into a two-level index with named levels.
levels = [
    (outer, inner)
    for outer in ('bar', 'baz', 'foo', 'qux')
    for inner in ('one', 'two')
]

index = pd.MultiIndex.from_tuples(levels, names=['first', 'second'])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [11]:
# Eight random normal draws labelled by the two-level `index`; the values are
# unseeded, so they differ on every run.
pd.Series(np.random.randn(8), index=index)

first  second
bar    one      -0.057071
       two      -0.331952
baz    one       1.036632
       two       0.646001
foo    one      -0.975347
       two       0.500213
qux    one       2.454401
       two       1.087361
dtype: float64

In [18]:
# Basic indexing on an axis that carries a MultiIndex.
# The column labels are given as two parallel arrays: the outer level repeats
# each group name twice, the inner level cycles through 'one'/'two'.
df = pd.DataFrame(
    np.random.randn(3, 8),
    index=list('ABC'),
    columns=[
        np.repeat(['bar', 'baz', 'foo', 'qux'], 2),
        np.tile(['one', 'two'], 4),
    ],
)
df

Unnamed: 0_level_0,bar,bar,baz,baz,foo,foo,qux,qux
Unnamed: 0_level_1,one,two,one,two,one,two,one,two
A,0.389582,-0.602542,2.503596,0.299168,-0.177092,-0.668312,0.958869,-1.101072
B,-0.617153,-0.075642,1.259489,-1.003335,-0.477362,-1.872181,-1.135609,1.734045
C,-0.708835,0.909013,-1.266279,1.23477,-1.127176,-0.679341,1.334789,-0.238502


In [19]:
# Selecting an outer-level column label returns the sub-frame of its inner
# columns ('one' and 'two').
df["bar"]

Unnamed: 0,one,two
A,0.389582,-0.602542
B,-0.617153,-0.075642
C,-0.708835,0.909013


In [21]:
# After transposing, the (outer, inner) pairs are row labels; passing the full
# tuple to .loc picks that single row and returns it as a Series over A/B/C.
df.T.loc[('bar', 'two')]

A   -0.602542
B   -0.075642
C    0.909013
Name: (bar, two), dtype: float64

In [22]:
# Scalar lookup in a single .loc call (row tuple, column label) instead of
# chaining a second [] onto the intermediate Series.
df.T.loc[('bar', 'two'), "A"]

-0.6025415878242355

In [24]:
# Label slice on the outer level; both endpoints are included, so every 'baz'
# and every 'foo' row comes back.
df.T.loc['baz':'foo']

Unnamed: 0,Unnamed: 1,A,B,C
baz,one,2.503596,1.259489,-1.266279
baz,two,0.299168,-1.003335,1.23477
foo,one,-0.177092,-0.477362,-1.127176
foo,two,-0.668312,-1.872181,-0.679341


In [40]:
# Transposed view of the frame: the two-level column labels become the row index.
df.T

Unnamed: 0,Unnamed: 1,A,B,C
bar,one,0.389582,-0.617153,-0.708835
bar,two,-0.602542,-0.075642,0.909013
baz,one,2.503596,1.259489,-1.266279
baz,two,0.299168,-1.003335,1.23477
foo,one,-0.177092,-0.477362,-1.127176
foo,two,-0.668312,-1.872181,-0.679341
qux,one,0.958869,-1.135609,1.334789
qux,two,-1.101072,1.734045,-0.238502


In [42]:
# Average the rows of the transposed frame within each outer-level label
# (bar, baz, foo, qux). Grouping on the index level replaces the `level=`
# argument of DataFrame.mean, which was deprecated and later removed from pandas.
df.T.groupby(level=0).mean()

Unnamed: 0,A,B,C
bar,-0.10648,-0.346397,0.100089
baz,1.401382,0.128077,-0.015754
foo,-0.422702,-1.174772,-0.903258
qux,-0.071101,0.299218,0.548143


In [43]:
# Average the rows of the transposed frame within each inner-level label
# (one, two). Grouping on the index level replaces the `level=` argument of
# DataFrame.mean, which was deprecated and later removed from pandas.
df.T.groupby(level=1).mean()

Unnamed: 0,A,B,C
one,0.918739,-0.242659,-0.441875
two,-0.518189,-0.304278,0.306485


# Advanced data cleaning with query and where

In [50]:
# query
# Ten rows of unseeded uniform random values under the columns x, y and z.
df = pd.DataFrame(data=np.random.rand(10, 3), columns=['x', 'y', 'z'])
df

Unnamed: 0,x,y,z
0,0.049428,0.920578,0.788714
1,0.285711,0.888506,0.324758
2,0.151997,0.880747,0.027628
3,0.106532,0.466499,0.604355
4,0.251896,0.560326,0.073755
5,0.981131,0.272537,0.860998
6,0.301034,0.329013,0.976976
7,0.395585,0.949703,0.710435
8,0.876363,0.247781,0.627287
9,0.984707,0.162863,0.177463


In [51]:
# Filter with an explicit boolean mask: keep rows where x < y and y < z.
df[(df["x"] < df["y"]) & (df["y"] < df["z"])]

Unnamed: 0,x,y,z
3,0.106532,0.466499,0.604355
6,0.301034,0.329013,0.976976


In [52]:
# or, equivalently: the same filter written as a query string, where column
# names are referenced directly inside the expression
df.query('(x < y) & (y < z)')

Unnamed: 0,x,y,z
3,0.106532,0.466499,0.604355
6,0.301034,0.329013,0.976976


In [56]:
# query based on the VALUE of the index: `index` inside the expression refers
# to the row labels, and the chained comparison keeps rows where
# index > y and y > z
df.query('index > y > z')

Unnamed: 0,x,y,z
1,0.285711,0.888506,0.324758
2,0.151997,0.880747,0.027628
4,0.251896,0.560326,0.073755
7,0.395585,0.949703,0.710435


In [57]:
# using in and not in
# Column 'a' holds each of a..f twice; column 'b' holds each of a..c four times.
df = pd.DataFrame({
    'a': [letter for letter in 'abcdef' for _ in range(2)],
    'b': [letter for letter in 'abc' for _ in range(4)],
})
df

Unnamed: 0,a,b
0,a,a
1,a,a
2,b,a
3,b,a
4,c,b
5,c,b
6,d,b
7,d,b
8,e,c
9,e,c


In [58]:
# Keep rows whose value in column a occurs anywhere in column b; this is a
# membership test against the whole column, not a row-by-row equality check.
df.query('a in b')

Unnamed: 0,a,b
0,a,a
1,a,a
2,b,a
3,b,a
4,c,b
5,c,b


In [59]:
# Complement of the previous filter: rows whose value in column a never
# appears in column b.
df.query('a not in b')

Unnamed: 0,a,b
6,d,b
7,d,b
8,e,c
9,e,c
10,f,c
11,f,c


In [60]:
# Inside query, comparing a column to a list with == acts as a membership
# test: rows where b is either 'a' or 'b' are kept.
df.query('b == ["a", "b"]')

Unnamed: 0,a,b
0,a,a
1,a,a
2,b,a
3,b,a
4,c,b
5,c,b
6,d,b
7,d,b


In [47]:
# where
# Signature as written when this notebook was authored:
# DataFrame.where(cond, other=nan, inplace=False, axis=None,
#     level=None, errors='raise', try_cast=False, raise_on_error=None)
# NOTE(review): errors/try_cast/raise_on_error appear to have been dropped in
# newer pandas releases -- confirm against the installed version.

# Small integer Series used by the subset/where/mask examples.
s = pd.Series(range(5))
s

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [48]:
# select a subset: boolean indexing drops the entries that fail the
# condition, so the result is shorter than the original Series
s[s > 0]

1    1
2    2
3    3
4    4
dtype: int64

In [45]:
# select a subset, but retain the shape of the Series: entries that fail the
# condition are replaced with NaN (which is why the result is float64)
s.where(s > 0)

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [46]:
# similar, but different: mask is the inverse of where -- entries for which
# the condition is True are the ones replaced with NaN
s.mask(s > 0)

0    0.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

# Untangling chained indices with views and copies

In [61]:
# chained indices?
# Fresh 3x8 frame of random normals whose columns carry a two-level index:
# four outer groups, each with the inner labels 'one' and 'two'.
outer_and_inner_labels = [
    np.repeat(['bar', 'baz', 'foo', 'qux'], 2),
    np.tile(['one', 'two'], 4),
]
df = pd.DataFrame(
    np.random.randn(3, 8),
    index=['A', 'B', 'C'],
    columns=outer_and_inner_labels,
)
df

Unnamed: 0_level_0,bar,bar,baz,baz,foo,foo,qux,qux
Unnamed: 0_level_1,one,two,one,two,one,two,one,two
A,-1.479241,-1.347288,-0.416548,-0.733315,0.324568,-2.248343,-0.177548,1.189222
B,1.957982,-1.311407,0.759745,-0.267377,-1.461903,-0.753395,-0.73541,-0.03286
C,-1.452182,0.445434,-0.961542,2.800392,1.628802,1.06365,-1.761501,0.596913


In [62]:
# chaining indices: two separate [] lookups -- the first returns the 'bar'
# sub-frame, the second picks its 'one' column
df["bar"]["one"]

A   -1.479241
B    1.957982
C   -1.452182
Name: one, dtype: float64

In [63]:
# one call: a single .loc lookup with the full column tuple; the values match
# the chained version, but the resulting Series is named by the tuple
df.loc[:, ("bar", "one")]

A   -1.479241
B    1.957982
C   -1.452182
Name: (bar, one), dtype: float64

## Views vs. Copies

**View** = a way of looking at the data of an existing array; the view and the original object share the same underlying data

**Copy** = an actual, independent copy of the data; it shares nothing with the original object

In [64]:
# correct, safe way to handle the ambiguity: address the row and the column
# together in one .loc assignment
df = pd.DataFrame(dict(A=['aaa', 'bbb', 'ccc'], B=[1, 2, 3]))
df.loc[0, 'A'] = 11
df

Unnamed: 0,A,B
0,11,1
1,bbb,2
2,ccc,3


In [65]:
# don't do this: df["A"] is evaluated first and the item assignment is then
# applied to that intermediate object (chained assignment), which triggers
# the "set on a copy of a slice" warning shown in the output
df = df.copy()
df["A"][2] = "ddd"
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,A,B
0,11,1
1,bbb,2
2,ddd,3


In [67]:
# don't do this either: df.loc[1] produces an intermediate row object and the
# assignment targets that object, again triggering the "set on a copy of a
# slice" warning shown in the output
df = pd.DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
df.loc[1]['A'] = "ff"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
