# More Data Wrangling 9 

In this notebook I demonstrate how to reshape a multi-indexed series and obtain aggregate scores at different levels of granularity. I also obtain pivot tables and a crosstabulation using count data. 

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the multilevel data file into a data frame and preview its first five rows.

data_file = 'New Long Multilevel Datafile.csv'
pre_post = pd.read_csv(data_file)

pre_post.head(5)

Unnamed: 0,PersonID,Condition,PrePostDepletion,Mean_Consumption,Rounds,Starting value,Knows Opp,Friends
0,1,0,0,30.0,5,1,0.0,0.0
1,2,0,0,22.2,5,1,0.0,0.0
2,3,0,0,8.83,6,1,0.0,0.0
3,4,0,0,9.83,6,1,0.0,0.0
4,5,0,0,23.6,3,1,1.0,1.0


In [3]:
# Inspect the dtype of every column before aggregating. Mean_Consumption shows up as
# object rather than a numeric dtype, so it has to be converted before it can be averaged.

pre_post.dtypes

PersonID              int64
Condition             int64
PrePostDepletion      int64
Mean_Consumption     object
Rounds                int64
Starting value        int64
Knows Opp           float64
Friends             float64
dtype: object

In [5]:
# Convert the Mean_Consumption column from object dtype to a numeric dtype.
# pd.to_numeric is called once on the whole column (vectorised) instead of once per
# element through .apply. errors = 'coerce' turns any value that cannot be parsed
# into NaN, and NaN-skipping aggregations such as mean() and count() will then
# leave those rows out without raising.

pre_post['Mean_Consumption'] = pd.to_numeric(pre_post['Mean_Consumption'], errors = 'coerce')

In [6]:
# Confirm the conversion: Mean_Consumption should now be reported as float64
# instead of object.

pre_post.dtypes

PersonID              int64
Condition             int64
PrePostDepletion      int64
Mean_Consumption    float64
Rounds                int64
Starting value        int64
Knows Opp           float64
Friends             float64
dtype: object

In [7]:
# Overall mean of Mean_Consumption across every row of the data frame:

pre_post['Mean_Consumption'].mean()

23.19852409638556

In [8]:
# Use groupby to split the rows by Condition and take the mean of
# Mean_Consumption within each level of that variable.

pre_post.groupby('Condition')['Mean_Consumption'].mean()

Condition
0    20.539633
1    28.283070
Name: Mean_Consumption, dtype: float64

In [9]:
# groupby also accepts a list of keys, which gives a finer-grained breakdown.
# The result is a multi-indexed series: every value is labelled by more than one
# index level (here Condition and PrePostDepletion) to the left of the data.

group_keys = ['Condition', 'PrePostDepletion']
pre_post.groupby(group_keys)['Mean_Consumption'].mean()

Condition  PrePostDepletion
0          0                   30.260877
           1                    9.883654
1          0                   40.122794
           1                   10.780870
Name: Mean_Consumption, dtype: float64

In [10]:
# unstack moves the innermost index level (PrePostDepletion) of the multi-indexed
# series into the columns, turning the series into a data frame:

(
    pre_post
    .groupby(['Condition', 'PrePostDepletion'])['Mean_Consumption']
    .mean()
    .unstack()
)

PrePostDepletion,0,1
Condition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,30.260877,9.883654
1,40.122794,10.78087


In [11]:
# Build a Condition x PrePostDepletion table of means with a pivot table.

# pivot_table does the grouping, aggregation and reshaping in a single call, which is
# often more convenient than chaining groupby and unstack:

pre_post.pivot_table(
    values = 'Mean_Consumption',
    index = 'Condition',
    columns = 'PrePostDepletion',
    aggfunc = 'mean',
)

PrePostDepletion,0,1
Condition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,30.260877,9.883654
1,40.122794,10.78087


In [12]:
# Passing margins = True appends an 'All' row and an 'All' column. With aggfunc = 'mean'
# these hold the marginal means (the mean over all rows in that group), not sums:

pre_post.pivot_table(
    values = 'Mean_Consumption',
    index = 'Condition',
    columns = 'PrePostDepletion',
    aggfunc = 'mean',
    margins = True,
)

PrePostDepletion,0,1,All
Condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,30.260877,9.883654,20.539633
1,40.122794,10.78087,28.28307
All,33.945549,10.1588,23.198524


In [13]:
# Switching aggfunc from 'mean' to 'count' turns the pivot table into a cross tabulation
# of cell sizes. Note that 'count' tallies non-missing Mean_Consumption values, so any
# row where that column is NaN is left out of the counts:

pre_post.pivot_table(
    values = 'Mean_Consumption',
    index = 'Condition',
    columns = 'PrePostDepletion',
    aggfunc = 'count',
    margins = True,
)

PrePostDepletion,0,1,All
Condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,114,104,218
1,68,46,114
All,182,150,332
