In [2]:
# standard imports
import numpy as np
import pandas as pd
# import scipy.stats as stats
# from sklearn import ...

# graphing imports
import matplotlib as mpl
import matplotlib.pyplot as plt
# import seaborn as sns

%matplotlib inline

In [3]:
# create a data frame from a table copied from wikipedia
# NOTE(review): read_clipboard() parses whatever is on the system clipboard when the
# cell runs, so this cell is not reproducible under Restart & Run All -- consider
# loading the table from a saved file or URL instead
calgary_df = pd.read_clipboard()

In [6]:
# summarize the data frame: index range, column names, non-null counts, dtypes and memory usage
calgary_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 10
Data columns (total 6 columns):
Rank      11 non-null int64
Nation    11 non-null object
Gold      11 non-null int64
Silver    11 non-null int64
Bronze    11 non-null int64
Total     11 non-null int64
dtypes: int64(5), object(1)
memory usage: 616.0+ bytes


In [7]:
# create a data frame by reading the clipboard using data copied out of Teradata
# NOTE(review): like any read_clipboard() call this depends on the current clipboard
# contents and cannot be re-run unattended; area1_df is also not referenced again in
# the visible part of this notebook
area1_df = pd.read_clipboard()

In [4]:
# build a series of standard-normal draws, one per letter label 'A' through 'J'
letter_labels = list('ABCDEFGHIJ')
normal_draws = np.random.randn(10)
s1 = pd.Series(data=normal_draws, index=letter_labels)
s1

A   -0.477103
B   -0.633084
C    2.551989
D   -0.918584
E   -1.008493
F    0.561798
G    0.539526
H   -0.788453
I    0.067096
J    1.151809
dtype: float64

In [5]:
# build a series of the first ten perfect squares on a custom, non-contiguous letter index
squares = np.arange(10) ** 2
custom_labels = list('ABCEGHIJKL')
s2 = pd.Series(data=squares, index=custom_labels)
s2

A     0
B     1
C     4
E     9
G    16
H    25
I    36
J    49
K    64
L    81
dtype: int32

In [6]:
# sum the two series label by label; a label missing from either series yields NaN,
# so those entries are dropped from the displayed result
s3 = s1.add(s2)
s3.dropna()

A    -0.477103
B     0.366916
C     6.551989
E     7.991507
G    16.539526
H    24.211547
I    36.067096
J    50.151809
dtype: float64

In [7]:
# build a DatetimeIndex of six consecutive days starting on 1 January 2015
date_list = pd.date_range(start='20150101', periods=6, freq='D')
date_list

DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# build a 6x4 frame of standard-normal values, indexed by the date range created earlier
random_values = np.random.randn(6, 4)
df1 = pd.DataFrame(data=random_values, index=date_list, columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
2015-01-01,-0.852818,-1.648361,0.342131,0.899479
2015-01-02,0.725912,0.0878,0.139006,-0.31891
2015-01-03,1.572601,1.592835,-1.772723,0.713786
2015-01-04,1.797927,0.600935,-0.18338,-1.144088
2015-01-05,-0.843192,0.744456,-1.847296,1.004035
2015-01-06,-1.430559,-0.758815,0.3545,0.038475


In [9]:
# build a frame from a mapping of column name -> values; the values mix scalars,
# a Series, a NumPy array and a Categorical
mixed_columns = dict(
    A=1.,
    B=pd.Timestamp('20160229'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
)
df2 = pd.DataFrame(mixed_columns)

df2

Unnamed: 0,A,B,C,D,E,F
0,1,2016-02-29,1,3,test,foo
1,1,2016-02-29,1,3,train,foo
2,1,2016-02-29,1,3,test,foo
3,1,2016-02-29,1,3,train,foo


In [10]:
# show the dtype of each column in the data frame (returned as a Series keyed by column name)
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [11]:
# preview the first five rows of the data frame (five is also head()'s default)
df1.head(n=5)

Unnamed: 0,A,B,C,D
2015-01-01,-0.852818,-1.648361,0.342131,0.899479
2015-01-02,0.725912,0.0878,0.139006,-0.31891
2015-01-03,1.572601,1.592835,-1.772723,0.713786
2015-01-04,1.797927,0.600935,-0.18338,-1.144088
2015-01-05,-0.843192,0.744456,-1.847296,1.004035


In [12]:
# preview the final three rows of the data frame
df1.tail(n=3)

Unnamed: 0,A,B,C,D
2015-01-04,1.797927,0.600935,-0.18338,-1.144088
2015-01-05,-0.843192,0.744456,-1.847296,1.004035
2015-01-06,-1.430559,-0.758815,0.3545,0.038475


In [13]:
# show the row index of the data frame (here, the DatetimeIndex it was built with)
df1.index

DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# show the column labels of the data frame
df1.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
# show the underlying data (without the row index or column names) as a NumPy array
# NOTE(review): newer pandas releases recommend DataFrame.to_numpy() for this -- confirm
# the target pandas version before switching
df1.values

array([[-0.85281807, -1.64836054,  0.3421311 ,  0.89947889],
       [ 0.72591201,  0.08780012,  0.13900568, -0.31891048],
       [ 1.57260113,  1.59283524, -1.77272305,  0.71378594],
       [ 1.79792673,  0.60093474, -0.18338042, -1.14408794],
       [-0.84319175,  0.74445576, -1.84729558,  1.00403457],
       [-1.43055912, -0.75881546,  0.3544998 ,  0.03847514]])

In [16]:
# describe each column with quick summary statistics: count, mean, std, min, quartiles and max
df1.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.161645,0.103142,-0.494627,0.198796
std,1.382791,1.156653,1.037573,0.836856
min,-1.430559,-1.648361,-1.847296,-1.144088
25%,-0.850411,-0.547162,-1.375387,-0.229564
50%,-0.05864,0.344367,-0.022187,0.376131
75%,1.360929,0.708576,0.29135,0.853056
max,1.797927,1.592835,0.3545,1.004035


In [17]:
# swap rows and columns: the dates become column headers and A-D become the row labels
df1.transpose()

Unnamed: 0,2015-01-01 00:00:00,2015-01-02 00:00:00,2015-01-03 00:00:00,2015-01-04 00:00:00,2015-01-05 00:00:00,2015-01-06 00:00:00
A,-0.852818,0.725912,1.572601,1.797927,-0.843192,-1.430559
B,-1.648361,0.0878,1.592835,0.600935,0.744456,-0.758815
C,0.342131,0.139006,-1.772723,-0.18338,-1.847296,0.3545
D,0.899479,-0.31891,0.713786,-1.144088,1.004035,0.038475


In [18]:
# sort the columns by label in descending order
# (axis=1 targets the column labels, not the row index, so the result is ordered D, C, B, A)
df1.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2015-01-01,0.899479,0.342131,-1.648361,-0.852818
2015-01-02,-0.31891,0.139006,0.0878,0.725912
2015-01-03,0.713786,-1.772723,1.592835,1.572601
2015-01-04,-1.144088,-0.18338,0.600935,1.797927
2015-01-05,1.004035,-1.847296,0.744456,-0.843192
2015-01-06,0.038475,0.3545,-0.758815,-1.430559


In [19]:
# sort the rows by the values in column B, smallest first
df1.sort_values(by='B')

Unnamed: 0,A,B,C,D
2015-01-01,-0.852818,-1.648361,0.342131,0.899479
2015-01-06,-1.430559,-0.758815,0.3545,0.038475
2015-01-02,0.725912,0.0878,0.139006,-0.31891
2015-01-04,1.797927,0.600935,-0.18338,-1.144088
2015-01-05,-0.843192,0.744456,-1.847296,1.004035
2015-01-03,1.572601,1.592835,-1.772723,0.713786


In [20]:
# access a column of the data frame as a Series using attribute (dot) notation
df1.A

2015-01-01   -0.852818
2015-01-02    0.725912
2015-01-03    1.572601
2015-01-04    1.797927
2015-01-05   -0.843192
2015-01-06   -1.430559
Freq: D, Name: A, dtype: float64

In [21]:
# access a column of the data frame as a Series using bracket notation
df1['A']

2015-01-01   -0.852818
2015-01-02    0.725912
2015-01-03    1.572601
2015-01-04    1.797927
2015-01-05   -0.843192
2015-01-06   -1.430559
Freq: D, Name: A, dtype: float64

In [22]:
# take the first three rows by integer position (the end of the slice is excluded)
df1.iloc[0:3]

Unnamed: 0,A,B,C,D
2015-01-01,-0.852818,-1.648361,0.342131,0.899479
2015-01-02,0.725912,0.0878,0.139006,-0.31891
2015-01-03,1.572601,1.592835,-1.772723,0.713786


In [23]:
# take a range of rows by index label; unlike a positional slice, both endpoints are included
df1.loc['2015-01-02':'2015-01-05']

Unnamed: 0,A,B,C,D
2015-01-02,0.725912,0.0878,0.139006,-0.31891
2015-01-03,1.572601,1.592835,-1.772723,0.713786
2015-01-04,1.797927,0.600935,-0.18338,-1.144088
2015-01-05,-0.843192,0.744456,-1.847296,1.004035


In [24]:
# recall what the date index we created earlier looks like
date_list

DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06'],
              dtype='datetime64[ns]', freq='D')

In [25]:
# you can index into the date list like any other sequence in Python
date_list[0]

Timestamp('2015-01-01 00:00:00', offset='D')

In [26]:
# use .loc with an index label to return the matching row as a Series
df1.loc['2015-01-03']

A    1.572601
B    1.592835
C   -1.772723
D    0.713786
Name: 2015-01-03 00:00:00, dtype: float64

In [27]:
# the same lookup works with a label taken from the index itself (here, its first element)
df1.loc[date_list[0]]

A   -0.852818
B   -1.648361
C    0.342131
D    0.899479
Name: 2015-01-01 00:00:00, dtype: float64

In [28]:
# label-based selection: all rows, columns A and B only
df1.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2015-01-01,-0.852818,-1.648361
2015-01-02,0.725912,0.0878
2015-01-03,1.572601,1.592835
2015-01-04,1.797927,0.600935
2015-01-05,-0.843192,0.744456
2015-01-06,-1.430559,-0.758815


In [29]:
# label-based selection: rows 2015-01-02 through 2015-01-05 (inclusive), columns A and B
df1.loc['2015-01-02':'2015-01-05', ['A', 'B']]

Unnamed: 0,A,B
2015-01-02,0.725912,0.0878
2015-01-03,1.572601,1.592835
2015-01-04,1.797927,0.600935
2015-01-05,-0.843192,0.744456


In [30]:
# label-based selection: one row and two columns, returned as a Series
df1.loc['2015-01-04', ['B', 'C']]

B    0.600935
C   -0.183380
Name: 2015-01-04 00:00:00, dtype: float64

In [31]:
# label-based selection: one row and one column, returned as a single scalar value
df1.loc['2015-01-01', 'B']

-1.6483605387377085

In [32]:
# position-based selection: the row at integer position 3 (the fourth row), returned as a Series
df1.iloc[3]

A    1.797927
B    0.600935
C   -0.183380
D   -1.144088
Name: 2015-01-04 00:00:00, dtype: float64

In [33]:
# position-based selection: rows at positions 3-4 and columns at positions 0-1
# (the end of each positional slice is excluded)
df1.iloc[3:5, 0:2]

Unnamed: 0,A,B
2015-01-04,1.797927,0.600935
2015-01-05,-0.843192,0.744456


In [34]:
# boolean filtering: build a True/False mask from column A,
# then keep only the rows where the mask is True (A > 0)
a_is_positive = df1.A > 0
df1[a_is_positive]

Unnamed: 0,A,B,C,D
2015-01-02,0.725912,0.0878,0.139006,-0.31891
2015-01-03,1.572601,1.592835,-1.772723,0.713786
2015-01-04,1.797927,0.600935,-0.18338,-1.144088


In [35]:
# data frames with conditions
# pull any rows where the value in column A is > 0, selecting the column with bracket notation
df1[df1['A'] > 0]

Unnamed: 0,A,B,C,D
2015-01-02,0.725912,0.0878,0.139006,-0.31891
2015-01-03,1.572601,1.592835,-1.772723,0.713786
2015-01-04,1.797927,0.600935,-0.18338,-1.144088


In [36]:
# cell-wise condition: keep every value that is > 0
# and show NaN in place of every value that is not
df1.where(df1 > 0)

Unnamed: 0,A,B,C,D
2015-01-01,,,0.342131,0.899479
2015-01-02,0.725912,0.0878,0.139006,
2015-01-03,1.572601,1.592835,,0.713786
2015-01-04,1.797927,0.600935,,
2015-01-05,,0.744456,,1.004035
2015-01-06,,,0.3545,0.038475


In [37]:
# create a new data frame as an independent copy of an existing data frame
# NOTE: this rebinds df2, replacing the dictionary-built frame created earlier
df2 = df1.copy()

In [38]:
# print each frame's id on its own line; different ids mean the copy is a separate object
for frame in (df1, df2):
    print(id(frame))

162029920
162119016


In [39]:
# create a new column E by assigning a list with one value per row, then display the frame
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2015-01-01,-0.852818,-1.648361,0.342131,0.899479,one
2015-01-02,0.725912,0.0878,0.139006,-0.31891,one
2015-01-03,1.572601,1.592835,-1.772723,0.713786,two
2015-01-04,1.797927,0.600935,-0.18338,-1.144088,three
2015-01-05,-0.843192,0.744456,-1.847296,1.004035,four
2015-01-06,-1.430559,-0.758815,0.3545,0.038475,three


In [40]:
# use the .isin method to test for membership
# returns a Boolean Series: True where column E is 'two' or 'four'
df2['E'].isin(['two', 'four'])

2015-01-01    False
2015-01-02    False
2015-01-03     True
2015-01-04    False
2015-01-05     True
2015-01-06    False
Freq: D, Name: E, dtype: bool

In [41]:
# use the .isin mask as a row filter
# display all rows where column E has the value 'two' or 'four'
wanted_rows = df2['E'].isin(['two', 'four'])
df2[wanted_rows]

Unnamed: 0,A,B,C,D,E
2015-01-03,1.572601,1.592835,-1.772723,0.713786,two
2015-01-05,-0.843192,0.744456,-1.847296,1.004035,four


In [42]:
# create a series holding 1 through 6, indexed by six consecutive days from 2015-01-01
# (this rebinds s1, replacing the random series created earlier)
six_days = pd.date_range('20150101', periods=6)
one_to_six = np.arange(1, 7)
s1 = pd.Series(data=one_to_six, index=six_days)
s1

2015-01-01    1
2015-01-02    2
2015-01-03    3
2015-01-04    4
2015-01-05    5
2015-01-06    6
Freq: D, dtype: int32

In [43]:
# add that new series to our existing data frame as column F
# (the series values are matched to the frame's rows by index label)
df2['F'] = s1

In [44]:
# display the data frame, now including the new column F
df2

Unnamed: 0,A,B,C,D,E,F
2015-01-01,-0.852818,-1.648361,0.342131,0.899479,one,1
2015-01-02,0.725912,0.0878,0.139006,-0.31891,one,2
2015-01-03,1.572601,1.592835,-1.772723,0.713786,two,3
2015-01-04,1.797927,0.600935,-0.18338,-1.144088,three,4
2015-01-05,-0.843192,0.744456,-1.847296,1.004035,four,5
2015-01-06,-1.430559,-0.758815,0.3545,0.038475,three,6


In [45]:
# use the .at accessor to read a single value by row label and column label
df2.at[date_list[0], 'F']

1

In [46]:
# use the .at accessor to overwrite a single value (first date, column F) in place
df2.at[date_list[0], 'F'] = 0
df2

Unnamed: 0,A,B,C,D,E,F
2015-01-01,-0.852818,-1.648361,0.342131,0.899479,one,0
2015-01-02,0.725912,0.0878,0.139006,-0.31891,one,2
2015-01-03,1.572601,1.592835,-1.772723,0.713786,two,3
2015-01-04,1.797927,0.600935,-0.18338,-1.144088,three,4
2015-01-05,-0.843192,0.744456,-1.847296,1.004035,four,5
2015-01-06,-1.430559,-0.758815,0.3545,0.038475,three,6


In [47]:
# use .iat to do the same thing with integer positions
# (row position 1, column position 1 -> column B of the second row)
df2.iat[1, 1] = -10
df2

Unnamed: 0,A,B,C,D,E,F
2015-01-01,-0.852818,-1.648361,0.342131,0.899479,one,0
2015-01-02,0.725912,-10.0,0.139006,-0.31891,one,2
2015-01-03,1.572601,1.592835,-1.772723,0.713786,two,3
2015-01-04,1.797927,0.600935,-0.18338,-1.144088,three,4
2015-01-05,-0.843192,0.744456,-1.847296,1.004035,four,5
2015-01-06,-1.430559,-0.758815,0.3545,0.038475,three,6


In [48]:
# overwrite column D in our data frame with the integer 5 in every row
# (plain column assignment replaces the column outright, so D becomes an integer
#  column; from pandas 2.0 on, df2.loc[:, 'D'] = ... writes into the existing float
#  column in place and the values would be stored as 5.0 instead)
df2['D'] = np.array([5] * len(df2))
df2

Unnamed: 0,A,B,C,D,E,F
2015-01-01,-0.852818,-1.648361,0.342131,5,one,0
2015-01-02,0.725912,-10.0,0.139006,5,one,2
2015-01-03,1.572601,1.592835,-1.772723,5,two,3
2015-01-04,1.797927,0.600935,-0.18338,5,three,4
2015-01-05,-0.843192,0.744456,-1.847296,5,four,5
2015-01-06,-1.430559,-0.758815,0.3545,5,three,6


In [49]:
# build a new frame with .reindex: keep only the first four dates,
# and only df1's columns plus the extra column E
first_four_dates = date_list[0:4]
wanted_columns = list(df1.columns) + ['E']
df3 = df2.reindex(index=first_four_dates, columns=wanted_columns)

In [50]:
# show the new, reindexed data frame
df3

Unnamed: 0,A,B,C,D,E
2015-01-01,-0.852818,-1.648361,0.342131,5,one
2015-01-02,0.725912,-10.0,0.139006,5,one
2015-01-03,1.572601,1.592835,-1.772723,5,two
2015-01-04,1.797927,0.600935,-0.18338,5,three


In [54]:
# keep only the positive cells of df1; every other cell becomes NaN
df4 = df1.where(df1 > 0)
df4

Unnamed: 0,A,B,C,D
2015-01-01,,,0.342131,0.899479
2015-01-02,0.725912,0.0878,0.139006,
2015-01-03,1.572601,1.592835,,0.713786
2015-01-04,1.797927,0.600935,,
2015-01-05,,0.744456,,1.004035
2015-01-06,,,0.3545,0.038475


In [57]:
# flag the missing cells: True wherever the value is NaN
df4.isnull()

Unnamed: 0,A,B,C,D
2015-01-01,True,True,False,False
2015-01-02,False,False,False,True
2015-01-03,False,False,True,False
2015-01-04,False,False,True,True
2015-01-05,True,False,True,False
2015-01-06,True,True,False,False


In [58]:
# flag the populated cells: True wherever the value is not NaN
df4.notnull()

Unnamed: 0,A,B,C,D
2015-01-01,False,False,True,True
2015-01-02,True,True,True,False
2015-01-03,True,True,False,True
2015-01-04,True,True,False,False
2015-01-05,False,True,False,True
2015-01-06,False,False,True,True


In [59]:
# drop every row that contains at least one NaN
# (every row of df4 has one, so the result is empty)
df4.dropna()

Unnamed: 0,A,B,C,D


In [61]:
# display df4 with every NaN replaced by the constant 5
df4.fillna(value=5)

Unnamed: 0,A,B,C,D
2015-01-01,5.0,5.0,0.342131,0.899479
2015-01-02,0.725912,0.0878,0.139006,5.0
2015-01-03,1.572601,1.592835,5.0,0.713786
2015-01-04,1.797927,0.600935,5.0,5.0
2015-01-05,5.0,0.744456,5.0,1.004035
2015-01-06,5.0,5.0,0.3545,0.038475


In [63]:
# apply a function to each column: here, NumPy's mean gives one average per column
df1.apply(np.mean)

A    0.161645
B    0.103142
C   -0.494627
D    0.198796
dtype: float64

In [65]:
# row-wise mean: axis=1 averages across the columns, giving one value per date
df1.mean(axis=1)

2015-01-01   -0.314892
2015-01-02    0.158452
2015-01-03    0.526625
2015-01-04    0.267848
2015-01-05   -0.235499
2015-01-06   -0.449100
Freq: D, dtype: float64

In [68]:
def _value_range(column):
    """Return the spread of one column: its maximum minus its minimum."""
    return column.max() - column.min()


# apply the helper to each column to get one range per column
df1.apply(_value_range)

A    3.228486
B    3.241196
C    2.201795
D    2.148123
dtype: float64

In [76]:
# sample standard deviation of each column (ddof=1 divides by n - 1);
# the values match the std row reported by describe()
df1.std(ddof=1)

A    1.382791
B    1.156653
C    1.037573
D    0.836856
dtype: float64

In [81]:
# build a series of ten random integers from 0 to 6 (the upper bound 7 is exclusive),
# labelled 'A' through 'J'
random_ints = np.random.randint(0, 7, size=10)
s = pd.Series(data=random_ints, index=list('ABCDEFGHIJ'))
s

A    6
B    3
C    2
D    4
E    2
F    5
G    0
H    1
I    4
J    0
dtype: int32

In [82]:
# count how many times each distinct value occurs, most frequent first
s.value_counts()

4    2
2    2
0    2
6    1
5    1
3    1
1    1
dtype: int64

In [86]:
# build a series of strings in mixed case, with one missing value (NaN) in the middle
# (this rebinds s2, replacing the series of squares created earlier)
mixed_case_words = ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
s2 = pd.Series(data=mixed_case_words)
s2

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [87]:
# lower-case every string through the .str accessor; the NaN entry stays NaN
s2.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [88]:
# build a 10x4 frame of standard-normal values with default integer row and column labels
normal_block = np.random.randn(10, 4)
df5 = pd.DataFrame(data=normal_block)
df5

Unnamed: 0,0,1,2,3
0,1.442579,0.430383,0.327151,1.454519
1,1.935176,0.28332,-1.039744,-0.584535
2,-0.920897,-0.874215,0.927261,-0.12876
3,1.027315,0.65574,-0.93766,-1.437416
4,-1.109134,0.470868,-1.455269,0.361015
5,-0.262673,0.865941,-0.965899,-1.965701
6,-1.300713,-0.854294,2.644012,0.238711
7,0.364314,-0.194016,-0.448197,1.725454
8,-0.229565,0.549458,-2.287685,1.154289
9,-0.522452,1.420734,1.081838,0.557964


In [91]:
# split the frame into three consecutive row chunks: rows 0-2, rows 3-6 and rows 7-9
chunk_bounds = [(None, 3), (3, 7), (7, None)]
pieces = [df5.iloc[start:stop] for start, stop in chunk_bounds]
pieces

[          0         1         2         3
 0  1.442579  0.430383  0.327151  1.454519
 1  1.935176  0.283320 -1.039744 -0.584535
 2 -0.920897 -0.874215  0.927261 -0.128760,
           0         1         2         3
 3  1.027315  0.655740 -0.937660 -1.437416
 4 -1.109134  0.470868 -1.455269  0.361015
 5 -0.262673  0.865941 -0.965899 -1.965701
 6 -1.300713 -0.854294  2.644012  0.238711,
           0         1         2         3
 7  0.364314 -0.194016 -0.448197  1.725454
 8 -0.229565  0.549458 -2.287685  1.154289
 9 -0.522452  1.420734  1.081838  0.557964]

In [92]:
# stack the pieces back together row-wise, reproducing the original frame
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.442579,0.430383,0.327151,1.454519
1,1.935176,0.28332,-1.039744,-0.584535
2,-0.920897,-0.874215,0.927261,-0.12876
3,1.027315,0.65574,-0.93766,-1.437416
4,-1.109134,0.470868,-1.455269,0.361015
5,-0.262673,0.865941,-0.965899,-1.965701
6,-1.300713,-0.854294,2.644012,0.238711
7,0.364314,-0.194016,-0.448197,1.725454
8,-0.229565,0.549458,-2.287685,1.154289
9,-0.522452,1.420734,1.081838,0.557964


In [102]:
# two small frames sharing a 'key1' column, for the merge examples;
# the left frame has a key ('x') that the right frame lacks
left_columns = dict(key1=['foo', 'bar', 'x'], left_value=[1, 2, 3])
right_columns = dict(key1=['foo', 'bar'], right_value=[4, 5])
left_df = pd.DataFrame(left_columns)
right_df = pd.DataFrame(right_columns)

In [103]:
# display the left frame
left_df

Unnamed: 0,key1,left_value
0,foo,1
1,bar,2
2,x,3


In [104]:
# display the right frame
right_df

Unnamed: 0,key1,right_value
0,foo,4
1,bar,5


In [107]:
# inner join (the default) on key1: only keys present in both frames are kept
left_df.merge(right_df, on='key1')

Unnamed: 0,key1,left_value,right_value
0,foo,1,4
1,bar,2,5


In [108]:
# left join on key1: every row of the left frame is kept,
# and rows with no match in the right frame get NaN for right_value
left_df.merge(right_df, how='left', on='key1')

Unnamed: 0,key1,left_value,right_value
0,foo,1,4.0
1,bar,2,5.0
2,x,3,


In [113]:
# build an 8x4 frame of standard-normal values with columns A-D and a default integer index
normal_block = np.random.randn(8, 4)
df6 = pd.DataFrame(data=normal_block, columns=['A', 'B', 'C', 'D'])
df6

Unnamed: 0,A,B,C,D
0,-0.68572,-0.089066,0.054327,-0.27527
1,0.311189,-0.381734,0.609451,1.009294
2,0.764242,0.952373,0.192865,-0.80531
3,-1.012907,0.812474,-0.797912,0.277512
4,-0.994547,0.100564,1.287342,-0.262688
5,1.582326,-0.272017,1.535785,0.604324
6,-0.852874,0.875267,-0.351018,0.485054
7,2.115547,-0.099241,0.660667,0.345415


In [119]:
# take the row at integer position 3 as a Series (its name is that row's label, 3)
# NOTE: this rebinds s, replacing the random-integer series created earlier
s = df6.iloc[3]
s

A   -1.012907
B    0.812474
C   -0.797912
D    0.277512
Name: 3, dtype: float64

In [120]:
# add the row held in s as a new last row, renumbering the index from 0
# (DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
#  s.to_frame().T turns the Series into a one-row frame whose columns line up with df6)
pd.concat([df6, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.68572,-0.089066,0.054327,-0.27527
1,0.311189,-0.381734,0.609451,1.009294
2,0.764242,0.952373,0.192865,-0.80531
3,-1.012907,0.812474,-0.797912,0.277512
4,-0.994547,0.100564,1.287342,-0.262688
5,1.582326,-0.272017,1.535785,0.604324
6,-0.852874,0.875267,-0.351018,0.485054
7,2.115547,-0.099241,0.660667,0.345415
8,-1.012907,0.812474,-0.797912,0.277512


In [122]:
# build a frame with two label columns (A, B) and two columns of
# standard-normal values (C, D), for the groupby examples
group_a = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar']
group_b = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']
df7 = pd.DataFrame(dict(A=group_a,
                        B=group_b,
                        C=np.random.randn(8),
                        D=np.random.randn(8)))
df7

Unnamed: 0,A,B,C,D
0,foo,one,-0.094603,-0.43742
1,bar,one,-2.697415,-1.199993
2,foo,two,0.100904,-0.707075
3,bar,three,-0.366228,-1.447231
4,foo,two,-0.406455,-0.40907
5,bar,two,-1.048436,0.363856
6,foo,one,0.295565,-0.127371
7,bar,three,-2.004404,0.2861


In [124]:
# total the numeric columns C and D for each value of A
# (the columns are selected explicitly because, from pandas 2.0 on, sum() no longer
#  silently drops the non-numeric column B and would concatenate its strings instead)
df7.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-6.116483,-1.997268
foo,-0.104589,-1.680937


In [125]:
# total C and D within each (A, B) combination; the result has a two-level row index
df7.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-2.697415,-1.199993
bar,three,-2.370632,-1.161131
bar,two,-1.048436,0.363856
foo,one,0.200962,-0.564792
foo,two,-0.305551,-1.116145
