# More Data Wrangling 8

In this notebook I demonstrate how to aggregate data using multiple functions. I use a multilevel dataframe that contains data from a repeated measures design where the same participants contribute multiple datapoints. I also show how to append aggregated variables to the existing dataframe. 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Aggregating with multiple functions.

# Load the multilevel (repeated measures) data file into a dataframe.

pre_post = pd.read_csv(filepath_or_buffer='New Long Multilevel Datafile.csv')

In [3]:
# Preview the first five rows to check how the columns were read in.
pre_post.head(n=5)

Unnamed: 0,PersonID,Condition,PrePostDepletion,Mean_Consumption,Rounds,Starting value,Knows Opp,Friends
0,1,0,0,30.0,5,1,0.0,0.0
1,2,0,0,22.2,5,1,0.0,0.0
2,3,0,0,8.83,6,1,0.0,0.0
3,4,0,0,9.83,6,1,0.0,0.0
4,5,0,0,23.6,3,1,1.0,1.0


In [4]:
# Summarise each column's dtype and non-null count.
# NOTE(review): in the saved output Mean_Consumption is reported as object rather than
# a numeric dtype, and Knows Opp / Friends have 340 of 364 non-null values.
pre_post.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PersonID          364 non-null    int64  
 1   Condition         364 non-null    int64  
 2   PrePostDepletion  364 non-null    int64  
 3   Mean_Consumption  364 non-null    object 
 4   Rounds            364 non-null    int64  
 5   Starting value    364 non-null    int64  
 6   Knows Opp         340 non-null    float64
 7   Friends           340 non-null    float64
dtypes: float64(2), int64(5), object(1)
memory usage: 22.9+ KB


In [5]:
# Total number of rounds for a single participant (PersonID == 1),
# summed over all of that participant's rows:

pre_post.loc[pre_post['PersonID'] == 1, 'Rounds'].sum()

7

In [6]:
# Total rounds for each participant (this sums their pre and post rounds),
# showing the first 10 participants:

pre_post.groupby('PersonID')['Rounds'].sum().head(10)

PersonID
1      7
2      7
3     12
4     12
5      4
6      4
7      5
8      5
9      9
10     9
Name: Rounds, dtype: int64

In [7]:
# Aggregation is not limited to a single function such as sum: the agg method
# accepts a list of functions, for example sum and count.

# Here it returns, for each participant, the total number of rounds and the number of
# data points. The counts are all 2 in this case because each participant contributes
# a pre and a post score.

(
    pre_post
    .groupby('PersonID')['Rounds']
    .agg(['sum', 'count'])
    .head()
)

Unnamed: 0_level_0,sum,count
PersonID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7,2
2,7,2
3,12,2
4,12,2
5,4,2


In [8]:
# Combining the output of an aggregation with the original dataframe.

# Suppose we wanted to create a new column listing the total number of rounds for each
# player. That total is calculated using the sum method, but because sum is an
# aggregation function it returns a reduced result with one value per PersonID.

# As a result, the aggregated rounds variable is shorter than the original
# Rounds column:

len(pre_post.groupby('PersonID')['Rounds'].sum())

182

In [9]:
# Length of the original (un-aggregated) Rounds column, one entry per row.
len(pre_post['Rounds'])

364

In [10]:
# The solution is the transform method: it computes the same per-group aggregate but
# returns it with one value per original row, so the result has the same length as
# the original dataframe.

total_rounds = pre_post.groupby('PersonID')['Rounds'].transform('sum')
len(total_rounds)

364

In [11]:
# Attach the per-participant totals as a new column. total_rounds comes from transform,
# so it has one value per row of pre_post and can be assigned directly.
pre_post['total_rounds'] = total_rounds
# Show the first 10 rows to confirm the new column was added.
pre_post.head(10)

Unnamed: 0,PersonID,Condition,PrePostDepletion,Mean_Consumption,Rounds,Starting value,Knows Opp,Friends,total_rounds
0,1,0,0,30.0,5,1,0.0,0.0,7
1,2,0,0,22.2,5,1,0.0,0.0,7
2,3,0,0,8.83,6,1,0.0,0.0,12
3,4,0,0,9.83,6,1,0.0,0.0,12
4,5,0,0,23.6,3,1,1.0,1.0,4
5,6,0,0,25.0,3,1,1.0,1.0,4
6,7,0,0,40.0,1,1,0.0,0.0,5
7,8,0,0,10.0,1,1,0.0,0.0,5
8,9,0,0,22.6,3,1,1.0,1.0,9
9,10,0,0,26.0,3,1,1.0,1.0,9


In [12]:
# We can then work out each row's share of that participant's total rounds.
# This is computed for every row (pre and post alike). Note that, despite the column
# name, the stored values are proportions (e.g. 0.714286), not percentages scaled to 100.

pre_post['percent_of_total'] = pre_post['Rounds'].div(pre_post['total_rounds'])

pre_post.head(10)

Unnamed: 0,PersonID,Condition,PrePostDepletion,Mean_Consumption,Rounds,Starting value,Knows Opp,Friends,total_rounds,percent_of_total
0,1,0,0,30.0,5,1,0.0,0.0,7,0.714286
1,2,0,0,22.2,5,1,0.0,0.0,7,0.714286
2,3,0,0,8.83,6,1,0.0,0.0,12,0.5
3,4,0,0,9.83,6,1,0.0,0.0,12,0.5
4,5,0,0,23.6,3,1,1.0,1.0,4,0.75
5,6,0,0,25.0,3,1,1.0,1.0,4,0.75
6,7,0,0,40.0,1,1,0.0,0.0,5,0.2
7,8,0,0,10.0,1,1,0.0,0.0,5,0.2
8,9,0,0,22.6,3,1,1.0,1.0,9,0.333333
9,10,0,0,26.0,3,1,1.0,1.0,9,0.333333
