# GROUPING (GROUP BY)
To improve the speed significantly pandas has a function named group by, unlike the applyrow and apply functions that iterate through every rows and columns groupby groups a particular set of the data which is required and carry out the computations on these chunks which are separated by key values.
The results are then combined back to another DataFrame.

*In pandas this is known as " split - apply - combine pattern "*

### SPLITTING

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/census.csv')
df = df[df['SUMLEV'] == 50]  # only taking the census of SUMLEV 50
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [2]:
%%timeit  # this will tell jupyter to run this cell three times (%%timit -n 3)

for state in df['STNAME'].unique():  # this will take only the unique set of all the state names
    avg = np.average(df.where(df['STNAME'] == state).dropna()['CENSUS2010POP'])   # census2010pop will be projected
    
#     print(f'Counties in state {state} have an average population of {str(avg)}')


    
# this method takes up significant amount of time.

1.31 s ± 447 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
%%timeit -n 3
# the follwing mehthod performs the same application as above using groupby() function.

# in the following we are telling pandas that we want to group by the state name. This is called 'splitting'
for group, frame in df.groupby(df['STNAME']):  
    # "group by fucntion returns a tuple"
    # in this returned tuple the first value is the value of the key (stname) || second value is projected dataframe
    # that was found for that group (census2010pop)
    avg = np.average(frame['CENSUS2010POP'])
    
#     print(f'Counties in states {group} have an average population of {str(avg)}')

37.9 ms ± 6.29 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [4]:
# you can also include a fucntion into groupby()
# in order to use a function in groupby you have to set column you want to perform actions as the index, hence

df = df.set_index(df['STNAME'])

def set_batch_number(item):
    if item[0] < 'M': return 0
    elif item[0] < 'Q': return 1
    else: return 2
    
    
for group, frame in df.groupby(set_batch_number): # passing function to groupby (note how arguments are not included)
    print(group)
    print(f'There are {str(len(frame))} records in group {str(group)} for processing')
    
# in the above example while using a function, 'group' is the returned value from the function and 'frame' is the
# projected dataframe (whole dataframe). This automatically selects 'STNAME' since it is now the index.
# THIS HAPPENS IF NO COLUMN NAME (IDENTIFIER) IS PASSED IN groupby(). IT BY DEFAULT THEN TAKES THE INDEX

0
There are 1177 records in group 0 for processing
1
There are 1134 records in group 1 for processing
2
There are 831 records in group 2 for processing


## Grouping using multi-indexing

In [7]:
# there are multile ways where you can group values together using the groupby function
# during multi index grouping you have to sepcifically mention the levels inside the paranthesis.

# here the indexes are 'STNAME' & 'CTYNAME'  df = df.set_index(['STNAME', 'CTYNAME'])

for group, frame in df.groupby(level = (0,1)):
    pass
#        print(group)

ValueError: multiple levels only valid with MultiIndex

### The above examples are just the basics of groupby and only show the printing functionality of grouby function
## The pandas developers have three broad categories of data proccessing to happen during the apply state
* Aggregation of group data
* Transformation of group data
* Filteration of group data

# AGGREGATION OF GROUP DATA

In [None]:
# aggregation means clustering of the data or bringing the data together

# the agg() method takes dictionaries in its parameters for the computation

df.groupby('SUMLEV').agg({'CENSUS2010POP': np.nanmean}) # np.average 

# if there are 'NaN' values in your data, np.average will also return the NaN values
# to return only the numbers, numpy has a similar function called 'nanmean' (it takes the average of all the numbered
# data and excludes all the NaN values) 


In [None]:
# agg() dictionary can have multiple fucntions as well as multiple columns grouped by a group name, in this case
# 'STNAME'
%precision 2
df.groupby('STNAME').agg({'POPESTIMATE2010': (np.nanmean, np.nanstd),
                         'POPESTIMATE2011': np.nanmean})

# in the above example agg() method takes the column name as the key and fucntions as values, there can be mutiple 
# functions in the form of a tuple and multiple keys or columns in a dictionary

# note that, the column which is grouped by is shown as the index.


# IMPORTANT :  Note that the functions passed in the agg mehtod is not the calling of fucntion such as np.nanmean()
# but instead are the references to the fucntions, hence does not contain paranthesis

# TRANSFORMATION
### transform() is different from agg() as agg returns a single value per column, that is, " one row per group ", as shown in the above output. tranform() on the other hand returns an object the same size as that of the group, that is a new dataframe with the indexes that of the original dataframe.

### This can be useful later to merge the datasets

Difference between transform and merging is that merging will merge the specified columns in the two tables having the same label. (This will fail if there are no same labeled columns in both the tables) and must have an on attribute in the paranthesis.
The unique ability of transform is that it returns an object the same size as that of the dataframe for the specified column on which the computations are done, hence, it is easier to include the new table in the existing data frame directly instead of creating a new dataframe and then merging the two tables together.

In [6]:
cols = ['CENSUS2010POP', 'POPESTIMATE2010']

transform_df = df[cols].groupby('POPESTIMATE2010').transform(np.nanmean)
transform_df.head()
transform_df

Unnamed: 0_level_0,CENSUS2010POP
STNAME,Unnamed: 1_level_1
Alabama,54571.0
Alabama,182265.0
Alabama,27457.0
Alabama,22915.0
Alabama,57322.0
...,...
Wyoming,43806.0
Wyoming,21294.0
Wyoming,21118.0
Wyoming,8533.0


In [8]:
# in the above example, you can now insert the output into the dataframe as it not the same 'CENSUS2010POP', so
# we can also change its name

transform_df.rename({'CENSUS2010POP':'NEWPOPCEN'}, axis = 1, inplace = True)

df = df.merge(transform_df, left_index = True, right_index = True) # this is possible because both have the same index
df['NEWPOPCEN']

STNAME
Alabama     54571.0
Alabama    182265.0
Alabama     27457.0
Alabama     22915.0
Alabama     57322.0
             ...   
Wyoming     43806.0
Wyoming     21294.0
Wyoming     21118.0
Wyoming      8533.0
Wyoming      7208.0
Name: NEWPOPCEN, Length: 302908, dtype: float64

In [11]:
# we can now see the mean diff

df['MEANDIFF'] = np.absolute(df['CENSUS2010POP']-df['NEWPOPCEN'])
df['MEANDIFF']


STNAME
Alabama         0.0
Alabama    127694.0
Alabama     27114.0
Alabama     31656.0
Alabama      2751.0
             ...   
Wyoming     36598.0
Wyoming     14086.0
Wyoming     13910.0
Wyoming      1325.0
Wyoming         0.0
Name: MEANDIFF, Length: 302908, dtype: float64

# FILTERING 
### The groupby function also includes the filtering of the different groups, with different features.
### The filter() function takes in a function as the parameter which it applies to each group database and then returns True or False, depending upon whether that group should be included in the results

In [25]:
df.groupby('POPESTIMATE2010').filter(lambda x: np.nanmean(x['CENSUS2010POP']) > 900000)

# in the above example only the values that are greater than 900000 in 'CENSUS2010POP' are shown and all that turned
# false are not copied over

Unnamed: 0_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,NEWPOPCEN,MEANDIFF
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arizona,50,4,8,4,13,Arizona,Maricopa County,3817117,3817357,3825597,...,7.879873,8.318210,9.123309,4.726436,11.527593,10.390177,11.203958,11.985998,71518.0,3745599.0
Arizona,50,4,8,4,13,Arizona,Maricopa County,3817117,3817357,3825597,...,7.879873,8.318210,9.123309,4.726436,11.527593,10.390177,11.203958,11.985998,131423.0,3685694.0
Arizona,50,4,8,4,13,Arizona,Maricopa County,3817117,3817357,3825597,...,7.879873,8.318210,9.123309,4.726436,11.527593,10.390177,11.203958,11.985998,134421.0,3682696.0
Arizona,50,4,8,4,13,Arizona,Maricopa County,3817117,3817357,3825597,...,7.879873,8.318210,9.123309,4.726436,11.527593,10.390177,11.203958,11.985998,53597.0,3763520.0
Arizona,50,4,8,4,13,Arizona,Maricopa County,3817117,3817357,3825597,...,7.879873,8.318210,9.123309,4.726436,11.527593,10.390177,11.203958,11.985998,37220.0,3779897.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wisconsin,50,2,3,55,79,Wisconsin,Milwaukee County,947735,947736,948301,...,-5.581617,-8.303054,-8.604921,-2.352634,-2.953702,-3.114494,-5.344614,-5.623163,389891.0,557844.0
Wisconsin,50,2,3,55,79,Wisconsin,Milwaukee County,947735,947736,948301,...,-5.581617,-8.303054,-8.604921,-2.352634,-2.953702,-3.114494,-5.344614,-5.623163,52410.0,895325.0
Wisconsin,50,2,3,55,79,Wisconsin,Milwaukee County,947735,947736,948301,...,-5.581617,-8.303054,-8.604921,-2.352634,-2.953702,-3.114494,-5.344614,-5.623163,24496.0,923239.0
Wisconsin,50,2,3,55,79,Wisconsin,Milwaukee County,947735,947736,948301,...,-5.581617,-8.303054,-8.604921,-2.352634,-2.953702,-3.114494,-5.344614,-5.623163,166994.0,780741.0


# APPLYING

### The apply() method in groupby helps you to apply custom functions to each group and then returns a single dataframe for each apply() with the index being preserved