### [ 그룹화 처리 ]
- Series/DataFrame에 groupby() 메서드를 사용
- 결과로 SeriesGroupBy/DataFrameGroupBy 객체를 반환

In [2]:
# [1] 모듈로딩
import pandas as pd
import df_util as util

In [3]:
# [2] Data source: name of the CSV file to analyse
file_name = 'titanic.csv'

In [4]:
# [3] Load the CSV file named by file_name into a DataFrame
dataDF = pd.read_csv(file_name)

In [5]:
# [4] Inspect the loaded data
# NOTE(review): dataDF is passed for BOTH arguments. checkDataFrame comes from
# the project-local df_util module, so confirm the second parameter really
# expects the DataFrame (rather than, e.g., a display name).
util.checkDataFrame(dataDF, dataDF)


[     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

       who  adult_male deck  embark_town alive  a

In [6]:
# [5] Analysis topic ==> age distribution by gender
# - Only the gender ('sex') and age columns are needed
# - The same selection can be written with loc[rows, columns]:
#   dataDF.loc[:, ['sex', 'age']]
analysis_columns = ['sex', 'age']
gender_ageDF = dataDF[analysis_columns]

In [7]:
# [5-1] Preprocessing: missing values
# - Missing entries are detected with isna() (isnull() is the alternative)

# Boolean mask of missing cells, then the per-column count of missing values
missing_mask = gender_ageDF.isna()
missing_mask.sum()

sex      0
age    177
dtype: int64

In [8]:
# -- Missing-value handling ==> replace per gender
# -- First step: group the rows by gender
groupObj = gender_ageDF.groupby(by='sex')

In [9]:
# -- Attributes of the GroupBy object
# -- groups attribute: dict-like mapping from each group key to the index
#    labels of the rows that belong to that group
group_map = groupObj.groups
print(f'성별에 따라 그룹화하여 keys() 연산: {group_map.keys()}')
print(f'성별에 따라 그룹화하여 values() 연산:{group_map.values()}')

성별에 따라 그룹화하여 keys() 연산: dict_keys(['female', 'male'])
성별에 따라 그룹화하여 values() 연산:dict_values([Index([  1,   2,   3,   8,   9,  10,  11,  14,  15,  18,
       ...
       866, 871, 874, 875, 879, 880, 882, 885, 887, 888],
      dtype='int64', length=314), Index([  0,   4,   5,   6,   7,  12,  13,  16,  17,  20,
       ...
       873, 876, 877, 878, 881, 883, 884, 886, 889, 890],
      dtype='int64', length=577)])


In [10]:
# -- indices attribute (not `groups`): dict keyed by group name whose values
#    hold the index information of each group's rows
# Only the keys are printed here; print(index_map) would show the full mapping
index_map = groupObj.indices
print(index_map.keys())

dict_keys(['female', 'male'])


In [11]:
# -- get_group() method: reads the data of one specific group out of the
#                        grouped object and returns it as a DataFrame
# Each group is fetched once per iteration and reused, instead of calling
# get_group(key) separately for ndim, shape, min and max.
for key in groupObj.groups:
    group = groupObj.get_group(key)
    print(f'\n[{key}]')
    print(group.ndim, group.shape)
    print(group.min(), group.max())


[female]
2 (314, 2)
sex    female
age      0.75
dtype: object sex    female
age      63.0
dtype: object

[male]
2 (577, 2)
sex    male
age    0.42
dtype: object sex    male
age    80.0
dtype: object


In [12]:
# - Run the basic aggregations per group and print each result table,
#   in the order count, max, min, mean, median
for stat_name in ('count', 'max', 'min', 'mean', 'median'):
    print(getattr(groupObj, stat_name)())

        age
sex        
female  261
male    453
         age
sex         
female  63.0
male    80.0
         age
sex         
female  0.75
male    0.42
              age
sex              
female  27.915709
male    30.726645
         age
sex         
female  27.0
male    29.0


In [13]:
# Apply several aggregations in a single agg() call; the result has one
# column per statistic
stat_names = ['min', 'max', 'mean', 'median', 'sum']
resultDF = groupObj.agg(stat_names)
resultDF

Unnamed: 0_level_0,age,age,age,age,age
Unnamed: 0_level_1,min,max,mean,median,sum
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
female,0.75,63.0,27.915709,27.0,7286.0
male,0.42,80.0,30.726645,29.0,13919.17


In [14]:
# Column labels of the aggregated result: a MultiIndex of
# (column, statistic) tuples
resultDF.columns

MultiIndex([('age',    'min'),
            ('age',    'max'),
            ('age',   'mean'),
            ('age', 'median'),
            ('age',    'sum')],
           )

In [15]:
# Select several columns by their (column, statistic) tuple labels
wanted_cols = [('age', 'min'), ('age', 'mean')]
resultDF[wanted_cols]

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,min,mean
sex,Unnamed: 1_level_2,Unnamed: 2_level_2
female,0.75,27.915709
male,0.42,30.726645


In [16]:
# Extract one element from a Series: take the ('age', 'min') column first,
# then index that Series by group label
min_age_by_sex = resultDF[('age', 'min')]
min_age_by_sex['female']

0.75

In [17]:
# Series arithmetic: per-group age range (max minus min)
oldest = resultDF[('age', 'max')]
youngest = resultDF[('age', 'min')]
oldest - youngest

sex
female    62.25
male      79.58
dtype: float64

In [18]:
# Sum of the 'survived' column over all rows
dataDF['survived'].sum()

342

- 사용자 정의 함수 지정하기

In [19]:
def myfunc(obj):
    """Print the type and first three entries of *obj*, then return its maximum.

    Used below as a custom aggregation function passed to ``agg()``.
    """
    preview = obj.head(3)
    print(type(obj), preview)
    return obj.max()

In [20]:
# Mix a built-in aggregation name, the user-defined function and a lambda
# in one agg() call
agg_funcs = ['count', myfunc, lambda x: x.mean()]
agg_groupObj = groupObj.agg(agg_funcs)
agg_groupObj

<class 'pandas.core.series.Series'> 1    38.0
2    26.0
3    35.0
Name: age, dtype: float64
<class 'pandas.core.series.Series'> 0    22.0
4    35.0
5     NaN
Name: age, dtype: float64


Unnamed: 0_level_0,age,age,age
Unnamed: 0_level_1,count,myfunc,<lambda_0>
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,261,63.0,27.915709
male,453,80.0,30.726645


In [21]:
# Show the row index (group labels) and then the column labels of the result
print(agg_groupObj.index, agg_groupObj.columns, sep='\n')

Index(['female', 'male'], dtype='object', name='sex')
MultiIndex([('age',      'count'),
            ('age',     'myfunc'),
            ('age', '<lambda_0>')],
           )
