# 10 Minutes to pandas 

## Workalong notebook 

In [1]:
# Import all the things 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Create a series 

s = pd.Series([1,3,5,np.nan,6,8])

In [3]:
# Create a DataFrame 

# index = dates sets the ROWS to be the dates
# columns = whatever would name the columns 

dates = pd.date_range('20130101', periods = 6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, 
                  columns=list('ABCD'))

df

Unnamed: 0,A,B,C,D
2013-01-01,1.082242,1.89802,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-06,0.046404,-0.814101,-1.721574,0.180968


In [4]:
# Create a DataFrame by passing a dictionary where the keys are the columns and the values are indexed 0..whatever

# Notes: 
# 'A':1 broadcasts 1 to every row of column A
# 'B' consists of pandas time-date objects
# 'C' puts a series in as a column

df2 = pd.DataFrame({ 'A' : 1.,
   'B' : pd.Timestamp('20130102'),
   'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
   'D' : np.array([3] * 4,dtype='int32'),
   'E' : pd.Categorical(["test","train","test","train"]),
   'F' : 'foo' })

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [5]:
# Take a look at the types of the stuff in df2

df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [6]:
# THis is a little different than info

df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
A    4 non-null float64
B    4 non-null datetime64[ns]
C    4 non-null float32
D    4 non-null int32
E    4 non-null category
F    4 non-null object
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 260.0+ bytes


In [7]:
# Get the index 

df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [9]:
# Get columns 

df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [11]:
# Get values 

print(df.values)
print(df2.values)

[[ 1.08224223  1.8980201  -0.98546648 -1.65213786]
 [ 0.60772269 -0.06635267  1.2811407   0.85067572]
 [-1.06671389 -0.88528587 -0.79488663  0.39643353]
 [ 0.56291503 -0.82684477 -0.1583402   1.43045711]
 [ 1.30167744  0.21536078 -1.49720836 -1.24926802]
 [ 0.0464043  -0.81410149 -1.72157434  0.18096773]]
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]


In [12]:
# Get summary stats for numerical data

df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.422375,-0.079867,-0.646056,-0.007145
std,0.851051,1.071929,1.092749,1.204134
min,-1.066714,-0.885286,-1.721574,-1.652138
25%,0.175532,-0.823659,-1.369273,-0.891709
50%,0.585319,-0.440227,-0.890177,0.288701
75%,0.963612,0.144932,-0.317477,0.737115
max,1.301677,1.89802,1.281141,1.430457


In [13]:
# This method ignores all non-numerical data

df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [14]:
# Transpose data

df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.082242,0.607723,-1.066714,0.562915,1.301677,0.046404
B,1.89802,-0.066353,-0.885286,-0.826845,0.215361,-0.814101
C,-0.985466,1.281141,-0.794887,-0.15834,-1.497208,-1.721574
D,-1.652138,0.850676,0.396434,1.430457,-1.249268,0.180968


In [15]:
# Sort by columns 

df.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2013-01-01,1.082242,1.89802,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-06,0.046404,-0.814101,-1.721574,0.180968


In [16]:
# Sort by rows 

df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.046404,-0.814101,-1.721574,0.180968
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-01,1.082242,1.89802,-0.985466,-1.652138


In [17]:
# Sort by values in a particular column

# Before: 

df

Unnamed: 0,A,B,C,D
2013-01-01,1.082242,1.89802,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-06,0.046404,-0.814101,-1.721574,0.180968


In [18]:
# After: 

df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-06,0.046404,-0.814101,-1.721574,0.180968
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-01,1.082242,1.89802,-0.985466,-1.652138


In [19]:
# Getting a single column

df['A']

2013-01-01    1.082242
2013-01-02    0.607723
2013-01-03   -1.066714
2013-01-04    0.562915
2013-01-05    1.301677
2013-01-06    0.046404
Freq: D, Name: A, dtype: float64

In [20]:
# Getting a slice of rows for all columns 

df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.082242,1.89802,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434


In [21]:

df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457


In [22]:
# Getting cross section using a label

dates[0]

Timestamp('2013-01-01 00:00:00', freq='D')

In [23]:
df.loc[dates[0]]

# Note this is a ROW even though it's formatted vertically

A    1.082242
B    1.898020
C   -0.985466
D   -1.652138
Name: 2013-01-01 00:00:00, dtype: float64

In [24]:
# Selecting all rows for a list of columns

df.loc[:, ['A','C']]

Unnamed: 0,A,C
2013-01-01,1.082242,-0.985466
2013-01-02,0.607723,1.281141
2013-01-03,-1.066714,-0.794887
2013-01-04,0.562915,-0.15834
2013-01-05,1.301677,-1.497208
2013-01-06,0.046404,-1.721574


In [25]:
# Slicing in both directions

df.loc['20130102':'20130104', 'A':'C']

# Note that endpoints are included 

Unnamed: 0,A,B,C
2013-01-02,0.607723,-0.066353,1.281141
2013-01-03,-1.066714,-0.885286,-0.794887
2013-01-04,0.562915,-0.826845,-0.15834


In [26]:
# Selection by position (integer location)

df.iloc[3]

# Again this is a ROW

A    0.562915
B   -0.826845
C   -0.158340
D    1.430457
Name: 2013-01-04 00:00:00, dtype: float64

In [27]:
# With slices 

df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.562915,-0.826845
2013-01-05,1.301677,0.215361


In [28]:
# With a list of values 

df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,0.607723,1.281141
2013-01-03,-1.066714,-0.794887
2013-01-05,1.301677,-1.497208


In [29]:
# Getting a selection of rows + all columns

df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434


In [30]:
# Getting a selection of columns + all rows

df.iloc[:, 2:4]

Unnamed: 0,C,D
2013-01-01,-0.985466,-1.652138
2013-01-02,1.281141,0.850676
2013-01-03,-0.794887,0.396434
2013-01-04,-0.15834,1.430457
2013-01-05,-1.497208,-1.249268
2013-01-06,-1.721574,0.180968


In [31]:
# Getting a single specific value 

df.iloc[1,2]

1.2811406969903332

In [32]:
# Boolean indexing

df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.082242,1.89802,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-06,0.046404,-0.814101,-1.721574,0.180968


In [33]:
# Selecting all values from a DataFrame when a condition is met 
# Puts NaN where appropriate 

df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.082242,1.89802,,
2013-01-02,0.607723,,1.281141,0.850676
2013-01-03,,,,0.396434
2013-01-04,0.562915,,,1.430457
2013-01-05,1.301677,0.215361,,
2013-01-06,0.046404,,,0.180968


In [34]:
# Setting 

s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range('20130102', periods = 6))

In [35]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [36]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.082242,1.89802,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-06,0.046404,-0.814101,-1.721574,0.180968


In [37]:
# We can set values by label:
df.at[dates[0], 'A'] = 0 
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,1.89802,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-06,0.046404,-0.814101,-1.721574,0.180968


In [38]:
# or by position

df.iat[0,1] = 99
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,99.0,-0.985466,-1.652138
2013-01-02,0.607723,-0.066353,1.281141,0.850676
2013-01-03,-1.066714,-0.885286,-0.794887,0.396434
2013-01-04,0.562915,-0.826845,-0.15834,1.430457
2013-01-05,1.301677,0.215361,-1.497208,-1.249268
2013-01-06,0.046404,-0.814101,-1.721574,0.180968


In [39]:
# or by assigning with a numpy array

df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,99.0,-0.985466,5
2013-01-02,0.607723,-0.066353,1.281141,5
2013-01-03,-1.066714,-0.885286,-0.794887,5
2013-01-04,0.562915,-0.826845,-0.15834,5
2013-01-05,1.301677,0.215361,-1.497208,5
2013-01-06,0.046404,-0.814101,-1.721574,5


In [40]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,99.0,-0.985466,5,
2013-01-02,0.607723,-0.066353,1.281141,5,1.0
2013-01-03,-1.066714,-0.885286,-0.794887,5,2.0
2013-01-04,0.562915,-0.826845,-0.15834,5,3.0
2013-01-05,1.301677,0.215361,-1.497208,5,4.0
2013-01-06,0.046404,-0.814101,-1.721574,5,5.0


### Missing data 

Missing data are coded `NaN` by default and are not included in calculations. 

In [41]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,99.0,-0.985466,5,,
2013-01-02,0.607723,-0.066353,1.281141,5,1.0,
2013-01-03,-1.066714,-0.885286,-0.794887,5,2.0,
2013-01-04,0.562915,-0.826845,-0.15834,5,3.0,


In [42]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,99.0,-0.985466,5,,1.0
2013-01-02,0.607723,-0.066353,1.281141,5,1.0,1.0
2013-01-03,-1.066714,-0.885286,-0.794887,5,2.0,
2013-01-04,0.562915,-0.826845,-0.15834,5,3.0,


In [43]:
# Drop rows with missing data

df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,0.607723,-0.066353,1.281141,5,1.0,1.0


In [44]:
# Fill in missing data 

df1.fillna(value = 5)
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,99.0,-0.985466,5,,1.0
2013-01-02,0.607723,-0.066353,1.281141,5,1.0,1.0
2013-01-03,-1.066714,-0.885286,-0.794887,5,2.0,
2013-01-04,0.562915,-0.826845,-0.15834,5,3.0,


In [45]:
# Find all the Nan's

pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### Statistics

In [46]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,99.0,-0.985466,5,
2013-01-02,0.607723,-0.066353,1.281141,5,1.0
2013-01-03,-1.066714,-0.885286,-0.794887,5,2.0
2013-01-04,0.562915,-0.826845,-0.15834,5,3.0
2013-01-05,1.301677,0.215361,-1.497208,5,4.0
2013-01-06,0.046404,-0.814101,-1.721574,5,5.0


In [47]:
df.mean()

A     0.242001
B    16.103796
C    -0.646056
D     5.000000
F     3.000000
dtype: float64

In [48]:
# Same operation but switch axes

df.mean(1)

2013-01-01    25.753633
2013-01-02     1.564502
2013-01-03     0.850623
2013-01-04     1.515546
2013-01-05     1.803966
2013-01-06     1.502146
Freq: D, dtype: float64

In [49]:
# Applying functions to a DataFrame

df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,99.0,-0.985466,5,
2013-01-02,0.607723,98.933647,0.295674,10,1.0
2013-01-03,-0.458991,98.048361,-0.499212,15,3.0
2013-01-04,0.103924,97.221517,-0.657553,20,6.0
2013-01-05,1.405601,97.436877,-2.154761,25,10.0
2013-01-06,1.452006,96.622776,-3.876335,30,15.0


In [50]:
df.apply(lambda x: x.max()-x.min())

A     2.368391
B    99.885286
C     3.002715
D     0.000000
F     4.000000
dtype: float64

In [51]:
# Histograms

s = pd.Series(np.random.randint(0,7,size=10))
s

0    1
1    3
2    0
3    4
4    6
5    0
6    1
7    2
8    3
9    6
dtype: int64

In [52]:
s.value_counts()

6    2
3    2
1    2
0    2
4    1
2    1
dtype: int64

### Merging

In [53]:
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,0.433921,-0.447035,-0.780396,-0.576083
1,-0.520594,-0.172561,0.111025,1.316893
2,0.298094,-0.017749,-0.522586,1.114356
3,0.774287,1.53544,-0.652664,-1.492395
4,-0.063666,0.26374,-0.052628,-0.844872
5,0.562713,-0.049432,0.248993,-2.166203
6,-0.963455,-1.941703,-0.148794,-0.79528
7,-2.275238,-0.886029,0.187534,0.061996
8,-1.706374,0.51726,-1.488457,0.22113
9,0.727758,0.04716,0.754146,-2.14907


In [54]:
piece1 = df[:3]

In [55]:
piece2 = df[3:7]
piece3 = df[7:]

piece1

Unnamed: 0,0,1,2,3
0,0.433921,-0.447035,-0.780396,-0.576083
1,-0.520594,-0.172561,0.111025,1.316893
2,0.298094,-0.017749,-0.522586,1.114356


In [56]:
piece2

Unnamed: 0,0,1,2,3
3,0.774287,1.53544,-0.652664,-1.492395
4,-0.063666,0.26374,-0.052628,-0.844872
5,0.562713,-0.049432,0.248993,-2.166203
6,-0.963455,-1.941703,-0.148794,-0.79528


In [57]:
piece3

Unnamed: 0,0,1,2,3
7,-2.275238,-0.886029,0.187534,0.061996
8,-1.706374,0.51726,-1.488457,0.22113
9,0.727758,0.04716,0.754146,-2.14907


In [58]:
# Glue them back together by rows 

pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,0.433921,-0.447035,-0.780396,-0.576083
1,-0.520594,-0.172561,0.111025,1.316893
2,0.298094,-0.017749,-0.522586,1.114356
3,0.774287,1.53544,-0.652664,-1.492395
4,-0.063666,0.26374,-0.052628,-0.844872
5,0.562713,-0.049432,0.248993,-2.166203
6,-0.963455,-1.941703,-0.148794,-0.79528
7,-2.275238,-0.886029,0.187534,0.061996
8,-1.706374,0.51726,-1.488457,0.22113
9,0.727758,0.04716,0.754146,-2.14907


In [59]:
# Appending rows 

s = df.iloc[2]
s

0    0.298094
1   -0.017749
2   -0.522586
3    1.114356
Name: 2, dtype: float64

In [60]:
df.append(s)

Unnamed: 0,0,1,2,3
0,0.433921,-0.447035,-0.780396,-0.576083
1,-0.520594,-0.172561,0.111025,1.316893
2,0.298094,-0.017749,-0.522586,1.114356
3,0.774287,1.53544,-0.652664,-1.492395
4,-0.063666,0.26374,-0.052628,-0.844872
5,0.562713,-0.049432,0.248993,-2.166203
6,-0.963455,-1.941703,-0.148794,-0.79528
7,-2.275238,-0.886029,0.187534,0.061996
8,-1.706374,0.51726,-1.488457,0.22113
9,0.727758,0.04716,0.754146,-2.14907


In [61]:
df.append(s, ignore_index=True)

Unnamed: 0,0,1,2,3
0,0.433921,-0.447035,-0.780396,-0.576083
1,-0.520594,-0.172561,0.111025,1.316893
2,0.298094,-0.017749,-0.522586,1.114356
3,0.774287,1.53544,-0.652664,-1.492395
4,-0.063666,0.26374,-0.052628,-0.844872
5,0.562713,-0.049432,0.248993,-2.166203
6,-0.963455,-1.941703,-0.148794,-0.79528
7,-2.275238,-0.886029,0.187534,0.061996
8,-1.706374,0.51726,-1.488457,0.22113
9,0.727758,0.04716,0.754146,-2.14907


In [62]:
# Grouping 

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
           'foo', 'bar', 'foo', 'foo'],
    'B' : ['one', 'one', 'two', 'three',
           'two', 'two', 'one', 'three'],
    'C' : np.random.randn(8),
    'D' : np.random.randn(8)})

df

Unnamed: 0,A,B,C,D
0,foo,one,-2.808619,-1.176746
1,bar,one,0.26537,0.008737
2,foo,two,-0.951416,1.568407
3,bar,three,0.393087,0.873977
4,foo,two,0.333033,-2.188414
5,bar,two,0.086266,-0.26719
6,foo,one,-0.173435,1.365818
7,foo,three,-0.038127,-2.52513


In [63]:
df.groupby('A')

<pandas.core.groupby.DataFrameGroupBy object at 0x7f95166fbe80>

In [64]:
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.744722,0.615523
foo,-3.638563,-2.956064


In [65]:
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.26537,0.008737
bar,three,0.393087,0.873977
bar,two,0.086266,-0.26719
foo,one,-2.982053,0.189072
foo,three,-0.038127,-2.52513
foo,two,-0.618383,-0.620007


### Reshaping

Stacking: `.stack()` method compresses a data frame in one of its columns

In [66]:
dates = pd.date_range('20130101', periods = 6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, 
                  columns=list('ABCD'))

df

Unnamed: 0,A,B,C,D
2013-01-01,0.787913,-0.320264,-0.332076,1.407452
2013-01-02,0.551695,-1.572335,-0.136959,-1.669422
2013-01-03,1.041609,0.825005,-0.732751,0.316686
2013-01-04,-1.185508,1.511815,-0.155797,-0.073565
2013-01-05,1.219288,-1.848178,-1.194112,-0.786933
2013-01-06,1.311687,-1.338593,-0.390794,0.115883


In [67]:
df.stack()

2013-01-01  A    0.787913
            B   -0.320264
            C   -0.332076
            D    1.407452
2013-01-02  A    0.551695
            B   -1.572335
            C   -0.136959
            D   -1.669422
2013-01-03  A    1.041609
            B    0.825005
            C   -0.732751
            D    0.316686
2013-01-04  A   -1.185508
            B    1.511815
            C   -0.155797
            D   -0.073565
2013-01-05  A    1.219288
            B   -1.848178
            C   -1.194112
            D   -0.786933
2013-01-06  A    1.311687
            B   -1.338593
            C   -0.390794
            D    0.115883
dtype: float64

In [69]:
# This is unexpected -- the result of .stack() is a Series 

type(df.stack())

pandas.core.series.Series

In [70]:
df_stack = df.stack()

In [71]:
df_stack[1]

-0.32026426085328669

In [73]:
df_stack[0:10]

2013-01-01  A    0.787913
            B   -0.320264
            C   -0.332076
            D    1.407452
2013-01-02  A    0.551695
            B   -1.572335
            C   -0.136959
            D   -1.669422
2013-01-03  A    1.041609
            B    0.825005
dtype: float64

Pivot tables: A _pivot table_ is a report obtained from a spreadsheet/DataFrame that summarizes the data found in the frame. The term "pivot" just refers to "picking up" a table and turning it around to look at it in a different way. 

In [77]:
# Example 

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                    'B' : ['x', 'y', 'z'] * 4,
                    'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                    'D' : np.random.randn(12),
                    'E' : np.random.randn(12)})

In [78]:
df

Unnamed: 0,A,B,C,D,E
0,one,x,foo,2.335674,-0.882201
1,one,y,foo,-0.992179,-1.254602
2,two,z,foo,0.216516,1.734632
3,three,x,bar,-0.609743,-1.343884
4,one,y,bar,0.214076,1.356136
5,one,z,bar,-0.62987,-0.867661
6,two,x,foo,1.091551,0.040199
7,three,y,foo,0.492549,0.478098
8,one,z,foo,0.396658,1.125599
9,one,x,bar,-0.608588,-0.099841


In [86]:
# So: This produces something like a 3D table -- "A" values along the left edge, "B" values along the top, and 
# "C" values in depth. Look up the three coordinates and you will find the values (or "NaN" if there were no 
# data for that slot). 

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,x,-0.608588,2.335674
one,y,0.214076,-0.992179
one,z,-0.62987,0.396658
three,x,-0.609743,
three,y,,0.492549
three,z,0.740679,
two,x,,1.091551
two,y,-0.14322,
two,z,,0.216516


In [87]:
# How is that different from the following, where I include the column variable "C" as an index variable?
# Answer: It only reports back the data that are present. 

pd.pivot_table(df, values='D', index=['A', 'B', 'C'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,D
A,B,C,Unnamed: 3_level_1
one,x,bar,-0.608588
one,x,foo,2.335674
one,y,bar,0.214076
one,y,foo,-0.992179
one,z,bar,-0.62987
one,z,foo,0.396658
three,x,bar,-0.609743
three,y,foo,0.492549
three,z,bar,0.740679
two,x,foo,1.091551


In [80]:
# Example using MTH 325 gradebook

grades = pd.read_csv('225-gradebook.csv', index_col=0)

grades.head()

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20


In [83]:
# This gives a report of the number of Guided Practices passed according to the number of 
# Homework Sets passed -- the default aggregator is the mean

piv_grades = pd.pivot_table(grades, values = 'gp', index = 'hw_pass')

piv_grades

Unnamed: 0_level_0,gp
hw_pass,Unnamed: 1_level_1
0,17.25
1,22.0
2,18.0
3,20.0
4,23.888889
5,23.75


In [84]:
pd.pivot_table(grades, values = 'gp', index = 'hw_pass', aggfunc='median')

Unnamed: 0_level_0,gp
hw_pass,Unnamed: 1_level_1
0,19.0
1,22.0
2,18.0
3,20.0
4,24.0
5,24.5


In [85]:
# This one shows the median number of Learning Target Assessments passed using 
# a double index of Homework and Programming Projects passed. This is like a table with 
# hw_pass along the vertical edge and pp_pass along the top. 

pd.pivot_table(grades, values = 'lt_pass', index = ['hw_pass', 'pp_pass'], aggfunc='median')

Unnamed: 0_level_0,Unnamed: 1_level_0,lt_pass
hw_pass,pp_pass,Unnamed: 2_level_1
0,0,2
0,1,11
0,2,9
1,3,23
2,3,21
3,0,13
4,0,21
4,2,23
4,3,23
4,4,24


In [88]:
# Compare with this, where it literally gives a 2D table. 

pd.pivot_table(grades, values = 'lt_pass', index = 'hw_pass', columns = 'pp_pass', aggfunc='median')


pp_pass,0,1,2,3,4,5
hw_pass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.0,11.0,9.0,,,
1,,,,23.0,,
2,,,,21.0,,
3,13.0,,,,,
4,21.0,,23.0,23.0,24.0,
5,,,,,24.0,26.0
