<a href="https://colab.research.google.com/github/Saifullah785/python-data-science-handbook-notes/blob/main/03_08_Aggregation_and_Grouping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Aggregation and Grouping**

In [24]:
import numpy as np
import pandas as pd

In [43]:
# Define a class to display the HTML representation of multiple objects side-by-side
class display(object):
  """Display the HTML representation of multiple objects side by side.

  Each positional argument is a *string* containing a Python expression
  (for example ``'df'`` or ``"df.groupby('key').sum()"``). The string is used
  as the caption and is also evaluated to obtain the object that is shown.

  NOTE: the expressions are run with ``eval``, so any code in them is
  executed. Only pass expressions you wrote yourself, never untrusted text.
  """

  # One floating <div> per object: {0} is the expression text (the caption),
  # {1} is the HTML produced by the evaluated object's own _repr_html_().
  template = """<div style ="float: left; padding: 10px;">
  <p style = 'font-family: "Courier New", Courier, monospace'>{0}</p>{1}
  </div>"""

  def __init__(self, *args):
    """Store the expression strings that will be evaluated when displayed."""
    self.args = args

  def _repr_html_(self):
    """Return the HTML fragments of all expressions, joined by newlines.

    The method must be named ``_repr_html_`` (single leading underscore):
    that is the hook Jupyter/IPython looks up for rich HTML output. A name
    with two leading underscores would be name-mangled by Python and never
    found, so only the plain-text ``__repr__`` would be shown.
    """
    return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                     for a in self.args)

  def __repr__(self):
    """Return the plain-text fallback: each expression followed by its repr."""
    return '\n\n'.join(a + '\n' + repr(eval(a))
                       for a in self.args)

# **Planet Data**

In [26]:
import seaborn as sns
# Load seaborn's bundled 'planets' example dataset; the result is bound to `planets`
# NOTE(review): load_dataset presumably downloads/caches the file on first use, so a fresh run may need network access - confirm
planets = sns.load_dataset('planets')
# Show the (rows, columns) shape of the loaded data as the cell output
planets.shape

(1035, 6)

In [27]:
# Preview the first 5 rows to see which columns the dataset provides
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


# **Simple Aggregation in Pandas**

In [28]:
# Seeded generator: the same five values are produced on every run
rng = np.random.RandomState(seed=42)
# Wrap five uniform samples from [0, 1) in a pandas Series
ser = pd.Series(data=rng.random_sample(5))
# Leave the Series as the last expression so the notebook renders it
ser

Unnamed: 0,0
0,0.37454
1,0.950714
2,0.731994
3,0.598658
4,0.156019


In [29]:
# Aggregate the whole Series into a single number: the sum of its elements
ser.sum()

np.float64(2.811925491708157)

In [30]:
# Aggregate the whole Series into a single number: the mean of its elements
ser.mean()

np.float64(0.5623850983416314)

In [31]:
# Build a 5-row frame with columns 'A' and 'B'; each column takes the next
# five samples from the shared generator ('A' is drawn first, then 'B')
df = pd.DataFrame({label: rng.rand(5) for label in ('A', 'B')})
# Leave the frame as the last expression so the notebook renders it
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [32]:
# On a DataFrame the aggregate is computed within each column by default,
# so this returns one mean per column ('A' and 'B')
df.mean()

Unnamed: 0,0
A,0.477888
B,0.44342


In [33]:
# axis='columns' aggregates across the columns instead, giving one mean per row
df.mean(axis = 'columns')

Unnamed: 0,0
0,0.08829
1,0.513997
2,0.849309
3,0.406727
4,0.444949


In [34]:
# describe() computes several common aggregates (count, mean, std, min, quartiles, max)
# for each numeric column in one call; dropna() first discards rows with missing values
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0



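| Aggregation | Returns |
|---|---|
| `count` | Total number of items |
| `first`, `last` | First and last item |
| `mean`, `median` | Mean and median |
| `min`, `max` | Minimum and maximum |
| `std`, `var` | Standard deviation and variance |
| `mad` | Mean absolute deviation (deprecated in pandas 1.5 and removed in pandas 2.0; use `(x - x.mean()).abs().mean()` instead) |
| `prod` | Product of all items |
| `sum` | Sum of all items |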

# **GroupBy: Split, Apply, Combine**

**Split, Apply, Combine**

In [35]:
# Small example frame for grouping: the keys A, B, C repeat twice and
# 'data' holds the integers 0 through 5
df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C'] * 2,
        'data': range(6),
    },
    columns=['key', 'data'],
)
# Leave the frame as the last expression so the notebook renders it
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [36]:
# Grouping by 'key' does not return a DataFrame: the result is a DataFrameGroupBy
# object, and nothing is displayed as a table until an aggregation is applied to it
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc513c13310>

In [37]:
# Split-apply-combine in one line: split the rows by 'key', sum 'data' within each
# group, and combine the per-group sums into a DataFrame indexed by key
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


# **The GroupBy Object**

In [38]:
# Group the planets by the 'method' column; the result is a DataFrameGroupBy object,
# not yet an aggregated table
planets.groupby('method')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc512fc1910>

In [39]:
# Indexing the GroupBy with a column name selects that column within every group
# and yields a SeriesGroupBy object; still no aggregate has been requested
planets.groupby('method')['orbital_period']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc513af8f90>

In [40]:
# Applying an aggregate to the SeriesGroupBy produces the result:
# one median 'orbital_period' value per 'method' group
planets.groupby('method')['orbital_period'].median()

Unnamed: 0_level_0,orbital_period
method,Unnamed: 1_level_1
Astrometry,631.18
Eclipse Timing Variations,4343.5
Imaging,27500.0
Microlensing,3300.0
Orbital Brightness Modulation,0.342887
Pulsar Timing,66.5419
Pulsation Timing Variations,1170.0
Radial Velocity,360.2
Transit,5.714932
Transit Timing Variations,57.011


In [41]:
# A GroupBy can be iterated directly: every step yields the group label
# together with the sub-DataFrame that holds that group's rows
for method, group in planets.groupby('method'):
  # Label left-aligned in a 30-character field, followed by the group's shape
  print(f"{method:30s} shape={group.shape}")

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


**Dispatch methods**

In [42]:
# describe() is applied to the 'year' values of each 'method' group, giving one row of
# summary statistics per method; unstack() then pivots that table into a single Series
# indexed by (statistic, method)
planets.groupby('method')['year'].describe().unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,method,Unnamed: 2_level_1
count,Astrometry,2.0
count,Eclipse Timing Variations,9.0
count,Imaging,38.0
count,Microlensing,23.0
count,Orbital Brightness Modulation,3.0
...,...,...
max,Pulsar Timing,2011.0
max,Pulsation Timing Variations,2007.0
max,Radial Velocity,2014.0
max,Transit,2014.0
