# 연령별 인구분포 비교분석

- https://jumin.mois.go.kr/#
- 연령별 인구분포

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

import pandas as pd
from pandas.plotting import parallel_coordinates as pac
import numpy as np

from hdfs import InsecureClient

In [2]:
# Apply the base style sheet first so the settings below take precedence.
plt.style.use('fivethirtyeight')

# Register the bundled D2Coding font file with matplotlib's font manager.
d2coding_path = './D2Coding-Ver1.3.2-20180524.ttf'  # font file path
fm.fontManager.addfont(d2coding_path)

matplotlib.rcParams.update({
    # Use the font registered above for all text.
    'font.family': 'D2Coding',
    # Render minus signs with the ASCII hyphen instead of the Unicode minus.
    'axes.unicode_minus': False,
})

In [5]:
# WebHDFS client used by the cells below to read the dataset files.
client_hdfs = InsecureClient(url='http://bdata.iptime.org:9999/', user='hadoop')
# Local alternative for peeking at the first 500 characters of a raw file:
# with open('./dataset/연령별인구현황_연간_2008-2013.csv', 'r', encoding='euc-kr') as _file:
#     print(_file.read()[:500])

In [6]:

# Read each yearly population CSV (EUC-KR encoded) from HDFS into a DataFrame.
data_dir = '/user/hadoop/dataset'
paths = [
    '연령별인구현황_연간_2008-2013.csv',
    '연령별인구현황_연간_2014-2019.csv',
    '연령별인구현황_연간_2020-2023.csv',
]
data = []
for path in paths:
    with client_hdfs.read(f'{data_dir}/{path}', encoding='euc-kr') as reader:
        data += [pd.read_csv(reader)]

# Show the column labels of the first file as a sanity check.
print(data[0].columns)

Index(['행정구역', '2008년_계_총인구수', '2008년_계_연령구간인구수', '2008년_계_0~9세',
       '2008년_계_10~19세', '2008년_계_20~29세', '2008년_계_30~39세', '2008년_계_40~49세',
       '2008년_계_50~59세', '2008년_계_60~69세',
       ...
       '2013년_여_10~19세', '2013년_여_20~29세', '2013년_여_30~39세', '2013년_여_40~49세',
       '2013년_여_50~59세', '2013년_여_60~69세', '2013년_여_70~79세', '2013년_여_80~89세',
       '2013년_여_90~99세', '2013년_여_100세 이상'],
      dtype='object', length=235)


In [None]:
# Scratch cell: print 1 and 2 (range(1, 3) excludes the upper bound).
for i in (1, 2):
    print(i)

## Pandas의 merge와 concat 함수

### merge

In [None]:
import pandas as pd

# Two small frames sharing a 'key' column; only 'A' and 'B' occur in both.
test1 = dict(key=['A', 'B', 'C'], val1=[1, 2, 3])
test2 = dict(key=['A', 'B', 'E'], val2=[2, 4, 8])
df1 = pd.DataFrame(test1)
df2 = pd.DataFrame(test2)

# inner: keep only keys present in both frames.
mdf_in = df1.merge(df2, how='inner', on='key')
# left: keep every key of df1; unmatched rows get NaN in val2.
mdf_le = df1.merge(df2, how='left', on='key')
print(mdf_in, mdf_le, sep='\n')

### concat

In [None]:
import pandas as pd

test1 = dict(key=['A', 'B', 'C'], val1=[1, 2, 3])
test2 = dict(key=['A', 'B', 'E'], val2=[2, 4, 8])
df1 = pd.DataFrame(test1)
df2 = pd.DataFrame(test2)
frames = [df1, df2]

# Defaults (axis=0, ignore_index=False): stack rows and keep each frame's index.
cdf = pd.concat(frames)
# axis=1: place the frames side by side, aligned on the index.
cdf_a = pd.concat(frames, axis='columns')
# ignore_index=True: stack rows and renumber the index from 0.
cdf_b = pd.concat(frames, ignore_index=True)
print(cdf, cdf_a, cdf_b, sep='\n')

In [None]:
# import pandas as pd (already imported)

def merge(data, on):
    """Left-join a series of DataFrames onto the first one.

    Args:
        data: Non-empty iterable of DataFrames. The first frame is the base;
            every following frame is left-merged onto the running result, so
            only keys present in the first frame are kept.
        on: Column name (or list of names) to join on; passed to ``pd.merge``.

    Returns:
        The merged DataFrame. When ``data`` holds a single frame, that frame
        is returned as-is (not copied).

    Raises:
        ValueError: If ``data`` is empty.
    """
    frames = list(data)
    if not frames:
        raise ValueError('At least one DataFrame is required.')
    res = frames[0]
    for frame in frames[1:]:
        res = pd.merge(res, frame, how='left', on=on)
    return res

In [None]:
# Join the per-period frames on the region column ('행정구역'); the first
# file is the base of the left joins.
# Alternative that was considered: data_c = pd.concat(data, axis=1)
data_m = merge(data=data, on='행정구역')
data_m

In [None]:
# Number of rows in the merged frame.
print(data_m.shape[0])

In [None]:
# Rename the '행정구역' column to 'region'. inplace=True modifies data_m
# itself instead of returning a renamed copy.
data_m.rename({'행정구역': 'region'}, axis='columns', inplace=True)
data_m.head(10)

In [None]:
# Keep 'region' plus the per-gender age columns by dropping every column
# whose label contains '인구수' (population totals) or '계_' (both-gender sums).
# `~` negates the boolean mask. An equivalent positive filter would be
# str.contains('region|년_남_|년_여_').
is_aggregate = data_m.columns.str.contains('인구수|계_')
data_g = data_m[data_m.columns[~is_aggregate]]
data_g

In [None]:
# Wide -> long: one row per (region, original column label) pair, with the
# label stored in 'type' and the cell content in 'value'.
data_r = pd.melt(data_g, id_vars=['region'], var_name='type', value_name='value')

In [None]:
# Split the melted column label on '_' into year / gender / age columns
# (labels look like '2008년_남_0~9세'), then drop the combined column.
label_parts = data_r['type'].str.split('_', expand=True)
data_r[['year', 'gender', 'age']] = label_parts
del data_r['type']

# Convert the counts to float. Cells are normally strings with thousands
# separators ('1,234'), but a CSV column without any separator is parsed as
# numeric, and `.str.replace` would turn those non-string cells into NaN.
# Going through str first keeps them; cells that were missing stay NaN.
raw_value = data_r['value']
value_text = raw_value.astype(str).str.replace(',', '', regex=False)
data_r['value'] = value_text.where(raw_value.notna()).astype(float)

# Remove the parenthesised numeric code from the region name and trim the
# whitespace that is left around it.
data_r['region'] = data_r['region'].str.replace(r'\(\d+\)', '', regex=True).str.strip()

# Parse labels such as '2008년' into timestamps.
data_r['year'] = pd.to_datetime(data_r['year'], format='%Y년')

In [None]:
# Remove the national-total ('전국') rows so only individual regions remain.
# data_r is in long format after melt, so the national total is spread over
# one row per year/gender/age combination; drop(index=0) would remove only
# the single row labelled 0. Filter on the region name instead.
is_national = data_r['region'].str.contains('전국', na=False)
data_r = data_r[~is_national]
data_r.head()

In [None]:
# Work on a copy so data_r stays untouched.
data1 = data_r.copy()

# Boolean mask selecting the rows whose year is 2008.
f1 = data1['year'].dt.year.eq(2008)

# Sum the values per (region, age), which adds the gender rows together.
totals = data1.loc[f1].groupby(['region', 'age'])['value'].sum()
data1 = totals.reset_index()
data1.head()

In [None]:
# Express each age group's value as a percentage of its region's total.
region_total = data1.groupby('region')['value'].transform('sum')
data1['value'] = 100 * data1['value'] / region_total
data1.head()

In [None]:
# Replace NaN with 0 (the alternative, data1.dropna(), would remove those rows).
# NOTE(review): NaN presumably comes from regions whose total is 0 (0/0) - confirm.
data1 = data1.fillna(value=0)
data1.head(110)

In [None]:
# Region x age-group table of percentages.
piv = data1.pivot(index='region', columns='age', values='value')

# Age labels sort lexicographically, so order the columns by the first
# number in each label instead (e.g. '0~9세' -> 0, '100세 이상' -> 100).
age_start = piv.columns.str.extract(r'(\d+)', expand=False).astype(int)
args = np.argsort(age_start)
print(args)
piv = piv.iloc[:, args]
piv.head()

In [None]:
# Show only the three youngest age groups.
piv.loc[:, ['0~9세', '10~19세', '20~29세']]

In [None]:
import os

# savefig does not create missing directories, so make sure the output
# folder exists before drawing.
os.makedirs('image', exist_ok=True)

# Heatmap of the region x age-group percentage table.
plt.figure(figsize=(10, 10))
plt.title('2008년 지역별 연령 분포 비교')
sns.heatmap(piv, cmap='Oranges')
# Save before show().
plt.savefig('image/2008년지역별연령분포비교.png')
plt.show()

In [None]:
# Create a 4x4 grid of axes and a flat iterator over them.
fig, ax = plt.subplots(4, 4, figsize=(20, 20))
axs = ax.flat

# NOTE(review): DataFrame.items() iterates over columns, so `r` is an
# age-group label and `v` is that column across all regions. If one subplot
# per region was intended, piv.iterrows() is probably what is wanted - confirm.
# `axs` is not used in the visible part of this cell.
for r, v in piv.items():
    print(f'{r} / {v}')
    