 # Multi Indexing Grouping and Aggregations

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
data=pd.read_csv('https://gist.githubusercontent.com/justmarkham/19241df07db2b93283fbb0cfe9c572f9/raw/f70931bd9f25c560b330f017f6933073c442c7ed/stocks.csv')
data.head()

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT


- set index on ‘Symbol‘ and 'Date‘, allow inplace changes 

In [4]:
data.set_index('Symbol','Date')

Unnamed: 0_level_0,Date,Close,Volume
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
AAPL,2016-10-03,112.52,21701800
MSFT,2016-10-03,57.42,19189500
AAPL,2016-10-04,113.0,29736800
MSFT,2016-10-04,57.24,20085900
CSCO,2016-10-04,31.35,18460400
MSFT,2016-10-05,57.64,16726400
CSCO,2016-10-05,31.59,11808600
AAPL,2016-10-05,113.05,21453100


In [5]:
# sort indexes 

In [7]:
data.sort_index()

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


In [8]:
# Select data for “AAPL” 

In [23]:
data[["Symbol"]]

Unnamed: 0,Symbol
0,CSCO
1,AAPL
2,MSFT
3,AAPL
4,MSFT
5,CSCO
6,MSFT
7,CSCO
8,AAPL


In [24]:
# Select “Close” value for “AAPL” on “2016-10-03” 

In [26]:
close=pd.DataFrame({'AAPL':[2016/10/3]})

In [27]:
close

Unnamed: 0,AAPL
0,67.2


In [28]:
# Select data for Apple and Microsoft 

In [31]:
data[['Symbol','AAPL']]

KeyError: "['AAPL'] not in index"

In [32]:
# Select data for Apple and Microsoft for a particular Date 

In [33]:
# Select data for apple on multiple dates 

In [34]:
#Selecting all data for '2016-10-03','2016-10-04‘date 

# Grouping and Aggregations 

In [35]:
data=pd.read_csv('https://raw.githubusercontent.com/datagy/data/main/sales.csv')
data.head()

Unnamed: 0,date,gender,region,sales
0,8/22/2022,Male,North-West,20381
1,3/5/2022,Male,North-East,14495
2,2/9/2022,Male,North-East,13510
3,6/22/2022,Male,North-East,15983
4,8/10/2022,Female,North-West,15007


In [None]:
# Load data and group by Region 

In [40]:
data.groupby('region')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002050B9088B0>

In [41]:
g1=data.groupby(["region"]).count()

In [42]:
g1

Unnamed: 0_level_0,date,gender,sales
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
North-East,347,347,347
North-West,322,322,322
South,331,331,331


In [48]:
# Accessing the Groups in a GroupBy object 

In [49]:
print(data.groupby('region').groups)

{'North-East': [1, 2, 3, 12, 13, 14, 15, 18, 19, 23, 24, 25, 26, 28, 31, 35, 40, 41, 45, 50, 51, 58, 59, 60, 63, 64, 67, 68, 72, 75, 76, 77, 80, 82, 83, 85, 87, 92, 94, 100, 105, 108, 109, 112, 113, 114, 115, 120, 121, 125, 127, 130, 134, 138, 139, 140, 141, 142, 146, 149, 155, 157, 164, 165, 176, 177, 179, 180, 188, 191, 193, 195, 199, 201, 205, 207, 211, 216, 218, 222, 224, 228, 231, 236, 242, 243, 246, 249, 254, 256, 259, 264, 265, 271, 272, 280, 283, 287, 288, 289, ...], 'North-West': [0, 4, 7, 11, 16, 17, 21, 29, 30, 34, 36, 38, 39, 42, 43, 44, 46, 48, 49, 52, 55, 56, 62, 66, 70, 71, 74, 79, 88, 89, 90, 91, 95, 98, 99, 103, 106, 116, 117, 119, 122, 123, 128, 129, 132, 137, 143, 144, 147, 148, 151, 153, 154, 156, 160, 161, 169, 171, 175, 178, 184, 185, 186, 198, 204, 206, 209, 212, 217, 221, 223, 225, 230, 233, 234, 235, 237, 240, 244, 251, 252, 253, 255, 260, 261, 263, 266, 267, 277, 278, 281, 285, 290, 291, 296, 297, 300, 318, 320, 321, ...], 'South': [5, 6, 8, 9, 10, 20, 22, 27,

In [43]:
# counting number of groups  

In [47]:
data.groupby('region').ngroups

3

In [50]:
# Selecting a Group of south region 

In [54]:
print(data.groupby('region').get_group('South'))

           date  gender region  sales
5      9/6/2022    Male  South  21792
6     8/21/2022    Male  South  20113
8    11/22/2022    Male  South  14594
9     1/16/2022  Female  South  24114
10   12/21/2022    Male  South  35154
..          ...     ...    ...    ...
972    6/9/2022    Male  South  22254
979  11/24/2022  Female  South  25591
981   12/5/2022    Male  South  34334
985   12/1/2022  Female  South  21282
994   9/29/2022    Male  South  21255

[331 rows x 4 columns]


In [55]:
# Findout average sales per region 

In [59]:
averages=data.groupby('region')['sales'].mean()
print(averages)

region
North-East    17386.072046
North-West    15257.732919
South         24466.864048
Name: sales, dtype: float64


- Applying Multiple Aggregations Using Pandas GroupBy - 
- Find out Mean, median and variance of sales per region 

In [63]:
averages=data.groupby('region')['sales'].mean()
print(averages)
aggregation=data.groupby('region')['sales'].median()
print(aggregation)
averages1=data.groupby('region')['sales'].var()
print(averages1)

region
North-East    17386.072046
North-West    15257.732919
South         24466.864048
Name: sales, dtype: float64
region
North-East    17414.0
North-West    15337.5
South         24492.0
Name: sales, dtype: float64
region
North-East    4.131225e+06
North-West    1.311495e+07
South         2.760139e+07
Name: sales, dtype: float64


In [65]:
# OR
# Applying Multiple Aggregations with .agg()
import numpy as np
aggs = data.groupby('region')['sales'].agg([np.mean, np.std, np.var])
print(aggs)

                    mean          std           var
region                                             
North-East  17386.072046  2032.541552  4.131225e+06
North-West  15257.732919  3621.456493  1.311495e+07
South       24466.864048  5253.702513  2.760139e+07


In [64]:
# Calculate percentage of region's sales per row with respect to total sale of the region 

In [66]:
data['Percent Of Region Sales'] = data['sales'] / data.groupby('region')['sales'].transform('sum')
print(data.head())

        date  gender      region  sales  Percent Of Region Sales
0  8/22/2022    Male  North-West  20381                 0.004148
1   3/5/2022    Male  North-East  14495                 0.002403
2   2/9/2022    Male  North-East  13510                 0.002239
3  6/22/2022    Male  North-East  15983                 0.002649
4  8/10/2022  Female  North-West  15007                 0.003055


In [70]:
# Filtering Rows Where the Group's Average Sale Price is Less Than 20,000 

In [71]:
data = data.groupby('region').filter(lambda x: x['sales'].mean() < 20000)
print(data.head())

        date  gender      region  sales  Percent Of Region Sales
0  8/22/2022    Male  North-West  20381                 0.004148
1   3/5/2022    Male  North-East  14495                 0.002403
2   2/9/2022    Male  North-East  13510                 0.002239
3  6/22/2022    Male  North-East  15983                 0.002649
4  8/10/2022  Female  North-West  15007                 0.003055


In [68]:
# calculate the sum of all sales broken out by 'region' and by 'gender' 

In [72]:
sums = data.groupby(['region', 'gender']).sum()
print(sums.head())

                     sales  Percent Of Region Sales
region     gender                                  
North-East Female  3051132                 0.505743
           Male    2981835                 0.494257
North-West Female  2455899                 0.499879
           Male    2457091                 0.500121


In [69]:
# Ranking Sales by Region and by Gender  

In [73]:
data['rank'] = data.groupby(['region', 'gender'])['sales'].rank(ascending=False)
print(data.head())

        date  gender      region  sales  Percent Of Region Sales   rank
0  8/22/2022    Male  North-West  20381                 0.004148   11.0
1   3/5/2022    Male  North-East  14495                 0.002403  154.0
2   2/9/2022    Male  North-East  13510                 0.002239  168.0
3  6/22/2022    Male  North-East  15983                 0.002649  138.0
4  8/10/2022  Female  North-West  15007                 0.003055   89.5
