In [87]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [88]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Plot configuration: draw text with the "Malgun Gothic" font family so the
# Korean labels used below render, and use the ASCII hyphen for minus signs
# instead of the Unicode minus glyph.
plt.rcParams['font.family'] = "Malgun Gothic"
plt.rc('axes', unicode_minus=False)

# Load the codebook (variable name / description / value labels) and the
# survey data itself.
codebook = pd.read_excel(io="Koweps_Codebook.xlsx")
df = pd.read_csv(filepath_or_buffer="Koweps_hpc10_2015_beta1.csv")

In [89]:
# Inspect the column names of the survey frame as loaded from the CSV.
df.columns

Index(['Unnamed: 0', 'h10_id', 'h10_ind', 'h10_sn', 'h10_merkey', 'h_new',
       'h10_cobf', 'h10_reg5', 'h10_reg7', 'h10_din',
       ...
       'c1007_4aq19', 'c1007_4aq20', 'c1007_4aq21', 'c1007_4aq22',
       'c1007_4aq23', 'h10_pers_income1', 'h10_pers_income2',
       'h10_pers_income3', 'h10_pers_income4', 'h10_pers_income5'],
      dtype='object', length=958)

In [90]:
# Build a {variable name -> description} lookup from the codebook and use it
# to rename the matching survey columns in place.
new_column = codebook.set_index('변수명')['설명'].to_dict()
df.rename(columns=new_column, inplace=True)

In [91]:
# Collect the renamed (descriptive) column labels and show only those columns.
vals = [*new_column.values()]
df.loc[:, vals]

Unnamed: 0,성별,태어난 연도,혼인상태,종교,직종,일한달의 월 평균 임금,7개 권역별 지역구분
0,2,1936,2,2,,,1
1,2,1945,2,2,,,1
2,1,1948,2,2,942.0,120.0,1
3,1,1942,3,1,762.0,200.0,1
4,2,1923,2,1,,,1
...,...,...,...,...,...,...,...
16659,2,1967,1,1,,,5
16660,2,1992,5,1,314.0,302.5,5
16661,1,1995,5,1,,,5
16662,2,1998,0,1,,,5


In [92]:
# Decode the numeric gender codes into the labels listed in the codebook.
# The first codebook row holds the "code.label" pairs for the gender column.
# Parse them with a regex instead of fixed character positions so that
# multi-digit codes and multi-character labels are handled too:
#   (\d+)            the numeric code
#   \.               the literal dot separating code and label
#   (.*?)            the label, matched as briefly as possible
#   (?=\s\d+\.|$)    stop right before the next "<space>code." or at the end
gender_data = {
    int(code): label.strip()
    for code, label in re.findall(r'(\d+)\.(.*?)(?=\s\d+\.|$)', codebook['내용'][0])
}
# Only decode while the column still holds numeric codes; re-running this cell
# on already-decoded labels would otherwise turn every value into NaN.
if pd.api.types.is_numeric_dtype(df['성별']):
    df['성별'] = df['성별'].map(gender_data)

In [93]:
import re

# Decode the numeric marital-status codes into the labels listed in the
# codebook. The third codebook row holds all "code.label" pairs in one string.
marriage_data = codebook['내용'][2]
pattern = r'((\d+)\.(.*?)(?=\s\d+\.|$))'
# Pattern walkthrough:
#   (\d+)           one or more digits: the numeric code (captured)
#   \.              a literal dot
#   (.*?)           any characters, zero or more, as few as possible: the label
#   (?=\s\d+\.|$)   lookahead: succeed only if followed by
#                   whitespace + digits + dot (the next entry) or end of string
# The outer parentheses additionally capture the whole "code.label" entry, so
# findall returns (entry, code, label) triples.
marriage_data = re.findall(pattern=pattern, string=marriage_data)
print(marriage_data)
# Only decode while the column still holds numeric codes; re-running this cell
# on already-decoded labels would otherwise turn every value into NaN.
if pd.api.types.is_numeric_dtype(df['혼인상태']):
    df['혼인상태'] = df['혼인상태'].map(
        {int(code): label.strip() for _entry, code, label in marriage_data}
    )


[('0.비해당(18세 미만)', '0', '비해당(18세 미만)'), ('1.유배우        ', '1', '유배우        '), ('2.사별        ', '2', '사별        '), ('3.이혼         ', '3', '이혼         '), ('4.별거          ', '4', '별거          '), ('5.미혼(18세이상, 미혼모 포함)  ', '5', '미혼(18세이상, 미혼모 포함)  '), ('6.기타(사망 등)', '6', '기타(사망 등)')]


In [94]:
# Show the gender and marital-status columns to check the decoded labels.
df[['성별', '혼인상태']]

Unnamed: 0,성별,혼인상태
0,여,사별
1,여,사별
2,남,사별
3,남,이혼
4,여,사별
...,...,...
16659,여,유배우
16660,여,"미혼(18세이상, 미혼모 포함)"
16661,남,"미혼(18세이상, 미혼모 포함)"
16662,여,비해당(18세 미만)


In [100]:
# Count respondents per (gender, marital status) and pivot the statuses into
# columns. fill_value=0 keeps a missing gender/status combination as a count of
# zero instead of NaN, which would otherwise propagate into the derived columns.
gender_marriage_info = (
    df.groupby(['성별', '혼인상태'])
    .size()
    .unstack('혼인상태', fill_value=0)
)
# Total head count per gender (sum over all status columns).
gender_marriage_info['총 인원'] = gender_marriage_info.sum(axis=1)
# Ever-married count: everyone except the "under 18 / not applicable" and
# "never married" groups.
# NOTE(review): this keeps the "other (deceased etc.)" group in the
# denominator -- confirm that is intended.
gender_marriage_info['혼인한 사람'] = (
    gender_marriage_info['총 인원']
    - gender_marriage_info['비해당(18세 미만)']
    - gender_marriage_info['미혼(18세이상, 미혼모 포함)']
)
# Divorce rate in percent of the ever-married count, rounded to two decimals.
gender_marriage_info['이혼율'] = (
    gender_marriage_info['이혼'] / gender_marriage_info['혼인한 사람'] * 100
).round(2)

In [101]:
# Display the per-gender marital-status counts together with the derived
# total, ever-married and divorce-rate columns.
gender_marriage_info

혼인상태,기타(사망 등),"미혼(18세이상, 미혼모 포함)",별거,비해당(18세 미만),사별,유배우,이혼,총 인원,혼인한 사람,이혼율
성별,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
남,18,1327,31,1454,229,4197,322,7578,4797,6.71
여,8,1106,53,1407,1888,4234,390,9086,6573,5.93


In [None]:
# Bar chart of the divorce rate per gender.
# '성별' is the index of gender_marriage_info, so reset_index() exposes it as a
# regular column for seaborn. The bars are coloured by gender (the category)
# rather than by the plotted rate itself, and dodge=False keeps each bar
# centred on its x tick.
sns.barplot(
    gender_marriage_info.reset_index(),
    x='성별',
    y='이혼율',
    hue='성별',
    dodge=False,
)
plt.title("남/여 이혼율 비교(18세 미만 제외)")
plt.show()