# Group

## Groupby
> Pandas - groupby()
- 집단, 그룹별로 데이터를 집계, 요약

> Split => Apply function => Combine
1. 전체 데이터를 그룹별로 나눈다 (split)
2. 각 그룹에 집계 함수를 적용한다 (apply)
3. 그룹별 집계 결과를 하나로 합친다 (combine)

> parameter
- `as_index=False` : 그룹 라벨을 index로 사용하지 않고 일반 컬럼으로 반환 (기본값은 `True`)


In [2]:
# importing libraries
import pandas as pd
import numpy as np

In [3]:
# Load the abalone example data. The file is read without a header row
# (header=None), so the column names are supplied explicitly.
dat = pd.read_csv(
    "groupby_abalone.txt",
    sep=",",
    header=None,
    names=[
        "sex", "length", "diameter", "height",
        "whole_weight", "shucked_weight", "viscera_weight",
        "shell_weight", "rings",
    ],
)
dat

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dat.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [5]:
# Column names, non-null counts and dtypes of the frame
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


### 하나의 변수에 대해서 집계
- 성별(sex) 그룹('F', 'M', 'I')별로 무게 변수(whole_weight)에 대해서 집계
- 집단별 크기는 .size()
- 집단별 합계는 .sum()
- 집단별 평균은 .mean() 

In [6]:
# whole_weight grouped by sex: the key is passed as a Series.
# Displaying the result only shows the SeriesGroupBy object; an aggregation
# method has to be called on it to get values.

grouped = dat['whole_weight'].groupby(dat['sex'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000022B104D70D0>

In [7]:
# Number of rows in each sex group
grouped.size()

sex
F    1307
I    1342
M    1528
Name: whole_weight, dtype: int64

In [8]:
# Total whole_weight per sex group
grouped.sum()

sex
F    1367.8175
I     578.8885
M    1514.9500
Name: whole_weight, dtype: float64

In [9]:
# Mean whole_weight per sex group
grouped.mean()

sex
F    1.046532
I    0.431363
M    0.991459
Name: whole_weight, dtype: float64

### 집계를 하는 key를 제외한 전체 연속형 변수에 대한 집계

In [10]:
# Group key given as a Series; mean of every non-key column per sex
dat.groupby(dat['sex']).mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [11]:
# Group key given as a column label; mean of every non-key column per sex
dat.groupby('sex').mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


### 2개의 key (두 조건으로 분류)

In [12]:
# Build a new categorical column with np.where(): rows whose length is
# strictly greater than the median length are labelled 'long', every other
# row (length equal to or below the median) is labelled 'short'.
dat['length_cat'] = np.where(dat.length > np.median(dat.length),
                            'long', # value where the condition is True
                            'short')# value where the condition is False
dat[['length', 'length_cat']]

Unnamed: 0,length,length_cat
0,0.455,short
1,0.350,short
2,0.530,short
3,0.440,short
4,0.330,short
...,...,...
4172,0.565,long
4173,0.590,long
4174,0.600,long
4175,0.625,long


In [13]:
# Mean whole_weight for every (sex, length_cat) combination.
# Both keys are passed as Series; the result is a Series with a two-level index.
mean_by_sex_length = dat['whole_weight'].groupby([dat['sex'], dat['length_cat']]).mean()
mean_by_sex_length

sex  length_cat
F    long          1.261330
     short         0.589702
I    long          0.923215
     short         0.351234
M    long          1.255182
     short         0.538157
Name: whole_weight, dtype: float64

In [14]:
# Pivot the inner index level (length_cat) into columns: one row per sex
mean_by_sex_length.unstack()

length_cat,long,short
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.26133,0.589702
I,0.923215,0.351234
M,1.255182,0.538157


In [15]:
# One chained expression: group the frame by two column labels, select
# whole_weight, take the mean, then pivot length_cat into columns
dat.groupby(['sex', 'length_cat'])['whole_weight'].mean().unstack()

length_cat,long,short
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.26133,0.589702
I,0.923215,0.351234
M,1.255182,0.538157


In [16]:
# as_index=False returns sex and length_cat as ordinary columns instead of the index
dat.groupby(['sex', 'length_cat'], as_index=False)['whole_weight'].mean()

Unnamed: 0,sex,length_cat,whole_weight
0,F,long,1.26133
1,F,short,0.589702
2,I,long,0.923215
3,I,short,0.351234
4,M,long,1.255182
5,M,short,0.538157


### get_group
- 특정 그룹에 속한 데이터를 확인하고 싶은 경우 `get_group()` 사용

In [17]:
# Rows of dat that belong to the 'F' group only
dat.groupby('sex').get_group('F')

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,length_cat
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,short
6,F,0.530,0.415,0.150,0.7775,0.2370,0.1415,0.3300,20,short
7,F,0.545,0.425,0.125,0.7680,0.2940,0.1495,0.2600,16,short
9,F,0.550,0.440,0.150,0.8945,0.3145,0.1510,0.3200,19,long
10,F,0.525,0.380,0.140,0.6065,0.1940,0.1475,0.2100,14,short
...,...,...,...,...,...,...,...,...,...,...
4160,F,0.585,0.475,0.165,1.0530,0.4580,0.2170,0.3000,11,long
4161,F,0.585,0.455,0.170,0.9945,0.4255,0.2630,0.2845,11,long
4168,F,0.515,0.400,0.125,0.6150,0.2865,0.1230,0.1765,8,short
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,long


### Aggregation
> Aggregation 
- 그룹별로 결과를 얻는 조작
- 원하는 함수를 사용

In [18]:
# Mean of every numeric column per sex group.
# By this point `dat` also holds the string column `length_cat`. pandas >= 2.0
# no longer drops non-numeric columns silently and raises TypeError on them,
# so the numeric columns are selected explicitly before grouping.
# The aggregation is named with the string "mean" because passing the
# callable `np.mean` to agg() is deprecated in pandas 2.1+.
dat.select_dtypes(include="number").groupby(dat["sex"]).agg("mean")

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


### apply

In [19]:
# apply() calls the function once per group; each group's sub-frame carries
# its key in `.name`, so this returns the (sex, length_cat) tuple of every group
dat.groupby(['sex', 'length_cat'], as_index = False).apply(lambda d : d.name)

sex  length_cat
F    long           (F, long)
     short         (F, short)
I    long           (I, long)
     short         (I, short)
M    long           (M, long)
     short         (M, short)
dtype: object