# [sklearn] 운동 종목 분류 모델

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-whitegrid')
plt.rc('font', family = 'Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

## 데이터 불러오기

In [2]:
raw_df = pd.read_excel('pydata/2017 장애인실태조사_최종공개.xlsx', sheet_name=1)
raw_df

Unnamed: 0,조사표 종류,가구원 일련번호(장애인),가구원 일련번호(응답자),조사지역(시도),응답자 유형,대리응답이유,장애유형확인1,장애유형확인2,장애유형확인3,개인번호,...,"보육,교육비","보호,간병비",재활기관이용료,통신비,"장애인보조기기 구입,유지비",부모사후 및 노후대비비,기타.4,월평균 총 추가비용,wg_p,ws_p
0,2,1,1,21,,,1,,,1,...,0,0,0,0,0,0,0,0,444.377794,1.132070
1,2,1,1,21,,,1,,,1,...,0,0,0,0,0,0,0,0,444.377794,1.132070
2,2,1,1,21,,,1,,,1,...,0,0,0,0,0,0,0,0,444.377794,1.132070
3,2,2,2,21,,,1,,,2,...,0,0,0,0,0,0,0,28,444.377794,1.132070
4,2,3,3,21,,,8,,,3,...,0,0,25,0,0,0,0,33,575.147902,1.465212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2,2,2,31,,,1,,,2,...,0,0,0,0,0,0,0,77,61.800775,0.157440
6545,2,1,1,31,,,1,,,1,...,0,0,0,0,0,0,0,13,61.800775,0.157440
6546,2,1,1,31,,,1,,,1,...,0,0,0,0,0,0,0,6,61.800775,0.157440
6547,2,1,1,31,,,9,,,1,...,0,0,0,0,0,0,0,35,55.733914,0.141984


---
## 데이터 정제(전처리)

### 필요한 정보만 추출
#### 1) 지체장애인 + 뇌병변장애인
#### 2) 현재 운동 중인 사람만 추출(**건강관리운동 == 1**)

In [3]:
cond1 = (raw_df['지체장애여부']== 1) # 1) 지체장애
cond2 = (raw_df['뇌병변장애여부'] == 1) # 1) 뇌병변장애
cond3 = (raw_df['지체장애여부'] != 1) & (raw_df['뇌병변장애여부'] != 1)

df = raw_df.loc[cond1 | cond2 | cond3].loc[raw_df['건광관리운동'] == 1] # 2) 운동하는 사람만 추출
df.head()

Unnamed: 0,조사표 종류,가구원 일련번호(장애인),가구원 일련번호(응답자),조사지역(시도),응답자 유형,대리응답이유,장애유형확인1,장애유형확인2,장애유형확인3,개인번호,...,"보육,교육비","보호,간병비",재활기관이용료,통신비,"장애인보조기기 구입,유지비",부모사후 및 노후대비비,기타.4,월평균 총 추가비용,wg_p,ws_p
0,2,1,1,21,,,1,,,1,...,0,0,0,0,0,0,0,0,444.377794,1.13207
2,2,1,1,21,,,1,,,1,...,0,0,0,0,0,0,0,0,444.377794,1.13207
3,2,2,2,21,,,1,,,2,...,0,0,0,0,0,0,0,28,444.377794,1.13207
4,2,3,3,21,,,8,,,3,...,0,0,25,0,0,0,0,33,575.147902,1.465212
5,2,2,2,21,,,9,,,2,...,0,0,0,0,0,0,0,346,427.939532,1.090193


In [4]:
df.shape

(3901, 1192)

In [4]:
# 장애유형 빈도 확인
df[['지체장애여부', '뇌병변장애여부']].value_counts()

지체장애여부  뇌병변장애여부
1       2          1951
2       2          1595
        1           341
1       1            14
dtype: int64

#### 3) 최대 빈도수를 갖는 만성질환 변수만 사용

In [5]:
# 만성질환 칼럼만 추출
disease_df = df[['01)만성질환명(고혈압)', '02)만성질환명(뇌졸중,중풍)', '03)만성질환명(심근경색증)', '04)만성질환명(협심증)', '05)만성질환명(이상지혈증)', '06)만성질환명(당뇨병)', '07)만성질환명(갑상선장애)', '08)만성질환명(천식)', '09)만성질환명(폐결핵)', '10)만성질환명(폐질환(만성기관 지염,폐기종)', '11)만성질환명(위십이지장궤양)', '12)만성질환명(B형간염)', '13)만성질환명(C형간염)', '14)만성질환명(간경변증)', '15)만성질환명(신부전)', '16)만성질환명(골관절염(퇴행성\n관절염))', '17)만성질환명(류마티스 관절염)', '18)만성질환명(골다공증)', '19)만성질환명(척추측만증)', '20)만성질환명(허리목통증)', '21)만성질환명(피부염)', '22)만성질환명(백내장)', '23)만성질환명(우울증)', '24)만성질환명(암)', '25)만성질환명(기타)']]
disease_df.head()

Unnamed: 0,01)만성질환명(고혈압),"02)만성질환명(뇌졸중,중풍)",03)만성질환명(심근경색증),04)만성질환명(협심증),05)만성질환명(이상지혈증),06)만성질환명(당뇨병),07)만성질환명(갑상선장애),08)만성질환명(천식),09)만성질환명(폐결핵),"10)만성질환명(폐질환(만성기관 지염,폐기종)",...,16)만성질환명(골관절염(퇴행성\n관절염)),17)만성질환명(류마티스 관절염),18)만성질환명(골다공증),19)만성질환명(척추측만증),20)만성질환명(허리목통증),21)만성질환명(피부염),22)만성질환명(백내장),23)만성질환명(우울증),24)만성질환명(암),25)만성질환명(기타)
0,2,2,2,2,2,2,1,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,1,2,2,2,2,1
4,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,1,2,1
5,1,2,2,2,1,1,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [7]:
# 2-> 0 변경
disease_lst = disease_df.columns.tolist()
for i in range(len(disease_lst)):
    disease_df.replace({disease_lst[i]: 2}, {disease_lst[i]: 0}, inplace = True)

disease_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,01)만성질환명(고혈압),"02)만성질환명(뇌졸중,중풍)",03)만성질환명(심근경색증),04)만성질환명(협심증),05)만성질환명(이상지혈증),06)만성질환명(당뇨병),07)만성질환명(갑상선장애),08)만성질환명(천식),09)만성질환명(폐결핵),"10)만성질환명(폐질환(만성기관 지염,폐기종)",...,16)만성질환명(골관절염(퇴행성\n관절염)),17)만성질환명(류마티스 관절염),18)만성질환명(골다공증),19)만성질환명(척추측만증),20)만성질환명(허리목통증),21)만성질환명(피부염),22)만성질환명(백내장),23)만성질환명(우울증),24)만성질환명(암),25)만성질환명(기타)
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
5,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# 열별로 각 만성질환 합 출력
# 고혈압, 허리목통증, 골관절염, 당뇨병
pd.DataFrame(disease_df.sum(axis = 0)).reset_index().sort_values(by = 0, ascending=False).head(4)

Unnamed: 0,index,0
0,01)만성질환명(고혈압),1737
19,20)만성질환명(허리목통증),1206
15,16)만성질환명(골관절염(퇴행성\n관절염)),893
5,06)만성질환명(당뇨병),838


### 필요한 변수만 추출

In [9]:
print(df.columns.tolist())

['조사표 종류', '가구원 일련번호(장애인)', '가구원 일련번호(응답자)', '조사지역(시도)', '응답자 유형', '대리응답이유', '장애유형확인1', '장애유형확인2', '장애유형확인3', '개인번호', '가구주와의 관계(가구주)', '성별', '생년', '생월', '만 나이', '장애등록 여부', '장애등록 연도', '등록장애유형(1순위)\xa0', '등록장애유형(2순위)', '장애등급', '본인을 포함한 총 가구원수', '본인을 포함한 총 장애인수', '가구 유형\xa0', '월 평균 총가구소득', '가구 주된 수입원', '가구 월평균 지출액', '주택 형태', '장애차별에 대한 인식', '장애인차별금지법 인지', '지체장애여부', '장애부위(상지)', '장애부위(하지)', '장애부위(척추)', '가장 불편한부위', '장애발생시기', '장애발생시 연령', '장애주된 원인', '질병명', '산업재해인정여부', '주된 진단명', '뇌병변장애여부', '장애부위(상지).1', '장애부위(하지).1', '장애부위(척추).1', '①동반증상-경직', '②동반증상-관절구축', '③동반증상-배변배뇨장애', '④동반증상-통증', '⑤동반증상-연하장애', '⑥동반증상-기타', '⑦동반장애-시각장애', '⑧동반장애-청각장애', '⑨동반장애-언어장애', '⑩동반장애-지적장애', '◯1 동반장애-뇌전증(간질)장애', '◯12동반장애-기타', '주된의사소통방법', '장애발생시기.1', '장애발생시 연령.1', '장애 주된 원인', '질병명\xa0', '산업재해인정여부.1', '주된 진단명(1)', '출생장소', '출산방법', '시각장애여부', '시력인지여부', '점자해독여부', '장애발생시기.2', '장애발생시 연령.2', '장애 주된 원인.1', '질병명\xa0.1', '산업재해인정여부.2', '청각장애애여부- 1순위', '청각장애애여부- 2순위', '보청기사용여부', '인공와우수술 여부', '수화가능여부', '수화가능자-주된의사전달수단\xa0', '수화이외사용 이유', 

In [6]:
# 총 20개 변수
data = df[['성별', '생년', '장애등급', '지체장애여부', '뇌병변장애여부', '01)만성질환명(고혈압)', '20)만성질환명(허리목통증)', 
    '16)만성질환명(골관절염(퇴행성\n관절염))', '06)만성질환명(당뇨병)', '흡연 여부', '음주횟수', '키(센티)', '몸무게(kg)', 'EQ-5D (통증/불편)',
    '운동 주기', '운동 시간(분)', '월 소득 합계-개인', '월평균 총 추가비용', '참여 운동 종목 (1순위)', '참여 운동 종목 (2순위)']]
data.head()

Unnamed: 0,성별,생년,장애등급,지체장애여부,뇌병변장애여부,01)만성질환명(고혈압),20)만성질환명(허리목통증),16)만성질환명(골관절염(퇴행성\n관절염)),06)만성질환명(당뇨병),흡연 여부,음주횟수,키(센티),몸무게(kg),EQ-5D (통증/불편),운동 주기,운동 시간(분),월 소득 합계-개인,월평균 총 추가비용,참여 운동 종목 (1순위),참여 운동 종목 (2순위)
0,1,1970,6,1,2,2,2,2,2,3,4,178.0,77.0,1,4,90,300,0,9,0
2,1,1976,6,1,2,0,0,0,0,1,2,186.0,98.0,1,1,60,604,0,9,0
3,2,1958,5,1,2,2,1,2,2,4,1,158.0,63.0,2,1,60,50,28,2,0
4,2,1984,3,2,2,2,2,2,2,4,1,158.0,58.0,1,2,30,0,33,2,0
5,1,1972,2,2,2,1,2,2,1,3,3,168.0,89.0,2,2,30,0,346,18,2


In [11]:
data.shape

(3901, 20)

### 운동 종목 1, 2순위, 한 셀로 합치기

In [7]:
# 수치 -> 문자형으로 변환
kinds_list1 = ['호흡운동', '걷기/조깅', '맨손체조/스트레칭', '균형잡기 운동', '수중운동', '자전거', '웨이트 트레이닝', '등산',
             '수영', '요가', '보치아', '배드민턴', '게이트볼', '탁구', '볼링', '당구', '(파크)골프', '기타']
kinds_list2 = ['', '호흡운동', '걷기/조깅', '맨손체조/스트레칭', '균형잡기 운동', '수중운동', '자전거', '웨이트 트레이닝', '등산',
             '수영', '요가', '보치아', '배드민턴', '게이트볼', '탁구', '볼링', '당구', '(파크)골프', '기타']

for i in range(len(kinds_list1)):
    data.replace({'참여 운동 종목 (1순위)': i+1}, {'참여 운동 종목 (1순위)': kinds_list1[i]}, inplace=True)

for j in range(len(kinds_list2)):
    data.replace({'참여 운동 종목 (2순위)': j}, {'참여 운동 종목 (2순위)': kinds_list2[j]}, inplace=True)
    
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,성별,생년,장애등급,지체장애여부,뇌병변장애여부,01)만성질환명(고혈압),20)만성질환명(허리목통증),16)만성질환명(골관절염(퇴행성\n관절염)),06)만성질환명(당뇨병),흡연 여부,음주횟수,키(센티),몸무게(kg),EQ-5D (통증/불편),운동 주기,운동 시간(분),월 소득 합계-개인,월평균 총 추가비용,참여 운동 종목 (1순위),참여 운동 종목 (2순위)
0,1,1970,6,1,2,2,2,2,2,3,4,178.0,77.0,1,4,90,300,0,수영,
2,1,1976,6,1,2,0,0,0,0,1,2,186.0,98.0,1,1,60,604,0,수영,
3,2,1958,5,1,2,2,1,2,2,4,1,158.0,63.0,2,1,60,50,28,걷기/조깅,
4,2,1984,3,2,2,2,2,2,2,4,1,158.0,58.0,1,2,30,0,33,걷기/조깅,
5,1,1972,2,2,2,1,2,2,1,3,3,168.0,89.0,2,2,30,0,346,기타,걷기/조깅
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6537,1,1941,4,2,1,2,2,2,2,3,2,171.0,74.0,2,2,90,31,184,웨이트 트레이닝,
6538,2,1951,5,1,2,1,2,1,2,4,1,167.0,59.0,3,1,60,171,129,걷기/조깅,
6542,1,1952,5,1,2,2,2,2,2,3,5,165.0,62.0,1,3,60,90,60,걷기/조깅,
6547,1,1958,5,2,2,1,1,2,2,3,1,165.0,64.0,2,2,60,150,35,걷기/조깅,배드민턴


In [8]:
cols = ['참여 운동 종목 (1순위)', '참여 운동 종목 (2순위)']
data['운동종목'] =data[cols].apply(lambda row: '&'.join(row.values.astype(str)), axis=1)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['운동종목'] =data[cols].apply(lambda row: '&'.join(row.values.astype(str)), axis=1)


Unnamed: 0,성별,생년,장애등급,지체장애여부,뇌병변장애여부,01)만성질환명(고혈압),20)만성질환명(허리목통증),16)만성질환명(골관절염(퇴행성\n관절염)),06)만성질환명(당뇨병),흡연 여부,...,키(센티),몸무게(kg),EQ-5D (통증/불편),운동 주기,운동 시간(분),월 소득 합계-개인,월평균 총 추가비용,참여 운동 종목 (1순위),참여 운동 종목 (2순위),운동종목
0,1,1970,6,1,2,2,2,2,2,3,...,178.0,77.0,1,4,90,300,0,수영,,수영&
2,1,1976,6,1,2,0,0,0,0,1,...,186.0,98.0,1,1,60,604,0,수영,,수영&
3,2,1958,5,1,2,2,1,2,2,4,...,158.0,63.0,2,1,60,50,28,걷기/조깅,,걷기/조깅&
4,2,1984,3,2,2,2,2,2,2,4,...,158.0,58.0,1,2,30,0,33,걷기/조깅,,걷기/조깅&
5,1,1972,2,2,2,1,2,2,1,3,...,168.0,89.0,2,2,30,0,346,기타,걷기/조깅,기타&걷기/조깅


In [9]:
# 끝 문자가 '&'로 끝나는 칼럼값 정제
y_list = []
for i in range(len(data)):
    if data['운동종목'].tolist()[i][-1] == '&':
        y_list.append(data['운동종목'].tolist()[i].replace('&', ''))
    else:
        y_list.append(data['운동종목'].tolist()[i])

data.drop(columns='운동종목', inplace = True) # 기존의 '운동종목' 칼럼 제거
data['운동종목'] = y_list # 새로 정제한 리스트를 '운동종목' 칼럼으로 재생성

data.head() # 정제 확인

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['운동종목'] = y_list # 새로 정제한 리스트를 '운동종목' 칼럼으로 재생성


Unnamed: 0,성별,생년,장애등급,지체장애여부,뇌병변장애여부,01)만성질환명(고혈압),20)만성질환명(허리목통증),16)만성질환명(골관절염(퇴행성\n관절염)),06)만성질환명(당뇨병),흡연 여부,...,키(센티),몸무게(kg),EQ-5D (통증/불편),운동 주기,운동 시간(분),월 소득 합계-개인,월평균 총 추가비용,참여 운동 종목 (1순위),참여 운동 종목 (2순위),운동종목
0,1,1970,6,1,2,2,2,2,2,3,...,178.0,77.0,1,4,90,300,0,수영,,수영
2,1,1976,6,1,2,0,0,0,0,1,...,186.0,98.0,1,1,60,604,0,수영,,수영
3,2,1958,5,1,2,2,1,2,2,4,...,158.0,63.0,2,1,60,50,28,걷기/조깅,,걷기/조깅
4,2,1984,3,2,2,2,2,2,2,4,...,158.0,58.0,1,2,30,0,33,걷기/조깅,,걷기/조깅
5,1,1972,2,2,2,1,2,2,1,3,...,168.0,89.0,2,2,30,0,346,기타,걷기/조깅,기타&걷기/조깅


### 지체장애여부/뇌병변장애여부 + 만성질환 4개 변수 → 0 or 1로 변경
고혈압, 허리목통증, 골관절염, 당뇨병

In [10]:
# 2-> 0 변경
disease_lst4 = data.columns.tolist()[3:9]
for i in range(len(disease_lst4)):
    data.replace({disease_lst4[i]: 2}, {disease_lst4[i]: 0}, inplace = True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,성별,생년,장애등급,지체장애여부,뇌병변장애여부,01)만성질환명(고혈압),20)만성질환명(허리목통증),16)만성질환명(골관절염(퇴행성\n관절염)),06)만성질환명(당뇨병),흡연 여부,...,키(센티),몸무게(kg),EQ-5D (통증/불편),운동 주기,운동 시간(분),월 소득 합계-개인,월평균 총 추가비용,참여 운동 종목 (1순위),참여 운동 종목 (2순위),운동종목
0,1,1970,6,1,0,0,0,0,0,3,...,178.0,77.0,1,4,90,300,0,수영,,수영
2,1,1976,6,1,0,0,0,0,0,1,...,186.0,98.0,1,1,60,604,0,수영,,수영
3,2,1958,5,1,0,0,1,0,0,4,...,158.0,63.0,2,1,60,50,28,걷기/조깅,,걷기/조깅
4,2,1984,3,0,0,0,0,0,0,4,...,158.0,58.0,1,2,30,0,33,걷기/조깅,,걷기/조깅
5,1,1972,2,0,0,1,0,0,1,3,...,168.0,89.0,2,2,30,0,346,기타,걷기/조깅,기타&걷기/조깅


### 성별 0 or 1로 변경
남자: 1 → 0  
여자: 2 → 1

In [11]:
data = data.replace({'성별': 1}, {'성별': 0}) # 남자
data = data.replace({'성별': 2}, {'성별': 1}) # 여자
data.head()

Unnamed: 0,성별,생년,장애등급,지체장애여부,뇌병변장애여부,01)만성질환명(고혈압),20)만성질환명(허리목통증),16)만성질환명(골관절염(퇴행성\n관절염)),06)만성질환명(당뇨병),흡연 여부,...,키(센티),몸무게(kg),EQ-5D (통증/불편),운동 주기,운동 시간(분),월 소득 합계-개인,월평균 총 추가비용,참여 운동 종목 (1순위),참여 운동 종목 (2순위),운동종목
0,0,1970,6,1,0,0,0,0,0,3,...,178.0,77.0,1,4,90,300,0,수영,,수영
2,0,1976,6,1,0,0,0,0,0,1,...,186.0,98.0,1,1,60,604,0,수영,,수영
3,1,1958,5,1,0,0,1,0,0,4,...,158.0,63.0,2,1,60,50,28,걷기/조깅,,걷기/조깅
4,1,1984,3,0,0,0,0,0,0,4,...,158.0,58.0,1,2,30,0,33,걷기/조깅,,걷기/조깅
5,0,1972,2,0,0,1,0,0,1,3,...,168.0,89.0,2,2,30,0,346,기타,걷기/조깅,기타&걷기/조깅


---
## 모델 생성 및 학습
1. SVC
1. KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import multiprocessing

In [13]:
print(data.columns)
print(len(data.columns))

Index(['성별', '생년', '장애등급', '지체장애여부', '뇌병변장애여부', '01)만성질환명(고혈압)',
       '20)만성질환명(허리목통증)', '16)만성질환명(골관절염(퇴행성\n관절염))', '06)만성질환명(당뇨병)', '흡연 여부',
       '음주횟수', '키(센티)', '몸무게(kg)', 'EQ-5D (통증/불편)', '운동 주기', '운동 시간(분)',
       '월 소득 합계-개인', '월평균 총 추가비용', '참여 운동 종목 (1순위)', '참여 운동 종목 (2순위)', '운동종목'],
      dtype='object')
21


### X, y 데이터 구분

In [39]:
X = data[['성별', '생년', '장애등급', '지체장애여부', '뇌병변장애여부', '01)만성질환명(고혈압)',
       '20)만성질환명(허리목통증)', '16)만성질환명(골관절염(퇴행성\n관절염))', '06)만성질환명(당뇨병)', '흡연 여부',
       '음주횟수', '키(센티)', '몸무게(kg)', 'EQ-5D (통증/불편)', '운동 주기', '운동 시간(분)', '월 소득 합계-개인', '월평균 총 추가비용']]
y = data['운동종목']

In [40]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state = 777)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2730, 18)
(1171, 18)
(2730,)
(1171,)


### 범주형/연속형 변수 인코딩 및 스케일링 실시
[참고](https://lovelydiary.tistory.com/419)

In [41]:
ct = ColumnTransformer([
    ('scaling', StandardScaler(), ['생년', '키(센티)', '몸무게(kg)', '운동 시간(분)', '월 소득 합계-개인', '월평균 총 추가비용']),
    ('onehot', OneHotEncoder(), ['성별', '장애등급', '지체장애여부', '뇌병변장애여부', '01)만성질환명(고혈압)',
       '20)만성질환명(허리목통증)', '16)만성질환명(골관절염(퇴행성\n관절염))', '06)만성질환명(당뇨병)', '흡연 여부',
       '음주횟수', 'EQ-5D (통증/불편)', '운동 주기'])
])

X_train_pre = ct.fit_transform(X_train)
X_test_pre = ct.transform(X_test)

In [42]:
print(X_train_pre.shape)
print(X_test_pre.shape)

(2730, 49)
(1171, 49)


### 모델 학습 및 평가 점수 확인

In [17]:
# 모델 생성
svc_model = SVC()
svc_model.fit(X_train_pre, y_train)

# 평가 점수 확인
print('학습 데이터 점수: {:.3f}'.format(svc_model.score(X_train_pre, y_train)))
print('평가 데이터 점수: {:.3f}'.format(svc_model.score(X_test_pre, y_test)))

  y = column_or_1d(y, warn=True)


학습 데이터 점수: 0.465
평가 데이터 점수: 0.483


### 교차 검증

In [30]:
cross_val_score(estimator = svc_model, 
               X = X, y = y, cv=5, 
               n_jobs=multiprocessing.cpu_count(),
               verbose=True) # 상세한 설명 출력

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:    5.3s remaining:    8.0s
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:    8.7s finished


array([0.46222791, 0.46410256, 0.46282051, 0.46282051, 0.46282051])

### GridSearchCV

In [123]:
# GridSearchCV
scale_tuple = ('scaler', StandardScaler(), ['생년', '키(센티)', '몸무게(kg)', '운동 시간(분)', '월 소득 합계-개인', '월평균 총 추가비용'])
onehot_tuple = ('onehot', OneHotEncoder(), ['성별', '장애등급', '지체장애여부', '뇌병변장애여부', '01)만성질환명(고혈압)',
       '20)만성질환명(허리목통증)', '16)만성질환명(골관절염(퇴행성\n관절염))', '06)만성질환명(당뇨병)', '흡연 여부',
       '음주횟수', 'EQ-5D (통증/불편)', '운동 주기'])
svc_tuple = ('svm', SVC())

pipeline = Pipeline([scale_tuple, onehot_tuple, svc_tuple])

param_grid = {'gamma': [0.001, 0.01, 0.1, 1, 10, 100], # 값이 클수록 결정 경계의 곡률이 커짐 -> 오버피팅
             'C': [0.001, 0.01, 0.1, 1, 10, 100], # 값이 클수록 이상치의 존재 가능성을 작게 봐서 좀 더 세심하게 결정 경계를 찾아낸다. -> 오버피팅(하드마진)
             'kernel': ['rbf', 'ploy', 'sigmoid'] # 가장 성능이 좋은 것: 가우시안 RBF 커널
             }
svc_gs = GridSearchCV(estimator = SVC(), 
                    param_grid=param_grid, scoring='accuracy', cv = 5, 
                    n_jobs=multiprocessing.cpu_count())

result = svc_gs.fit(X_train, y_train)

print('최적 파라미터: {}'.format(svc_gs.best_params_))
print('최적 점수: {}'.format(svc_gs.best_score_))
print(result.best_estimator_)

180 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 255, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 315, in _dense_fit
    ) = libsvm.fit(
  File "sklearn\svm\_libsvm.pyx", line 173, in sklearn.svm._libsvm.fit
ValueError: 'ploy' is not in list

 0.45714286        nan 0.45714286

최적 파라미터: {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'}
최적 점수: 0.45714285714285713
SVC(C=0.001, gamma=0.001)


In [125]:
# 최적 파라미터를 이용해 새로운 모델 생성
svc_model = svc_gs.best_estimator_

# 학습 및 평가 데이터 점수
print('학습 데이터 점수: {:.3f}'.format(svc_model.score(X_train, y_train)))
print('평가 데이터 점수: {:.3f}'.format(svc_model.score(X_test, y_test)))

학습 데이터 점수: 0.457
평가 데이터 점수: 0.477


In [126]:
# 실제 y_test 값과 예측 y_predict 값 비교
y_predict = svc_model.predict(X_test)
y_predict

array(['걷기/조깅', '걷기/조깅', '걷기/조깅', ..., '걷기/조깅', '걷기/조깅', '걷기/조깅'],
      dtype=object)

In [60]:
y_test_df = y_test.reset_index()
y_test_df.head()

Unnamed: 0,index,운동종목
0,370,등산
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,걷기/조깅&등산
4,776,걷기/조깅


In [61]:
y_pred_df = pd.concat([pd.DataFrame(y_test.index).rename(columns={0:'index'}), pd.DataFrame(y_predict)], axis=1).rename(columns={0:'운동종목(예측)'})
y_pred_df.head()

Unnamed: 0,index,운동종목(예측)
0,370,걷기/조깅
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,걷기/조깅
4,776,걷기/조깅


In [62]:
pd.merge(y_test_df, y_pred_df, on='index')

Unnamed: 0,index,운동종목,운동종목(예측)
0,370,등산,걷기/조깅
1,2580,걷기/조깅,걷기/조깅
2,5139,걷기/조깅,걷기/조깅
3,3827,걷기/조깅&등산,걷기/조깅
4,776,걷기/조깅,걷기/조깅
...,...,...,...
1166,1445,등산,걷기/조깅
1167,3810,걷기/조깅,걷기/조깅
1168,3764,걷기/조깅,걷기/조깅
1169,1417,걷기/조깅,걷기/조깅


In [63]:
pd.merge(y_test_df, y_pred_df, on='index')[['운동종목(예측)']].value_counts()

운동종목(예측)
걷기/조깅       1171
dtype: int64

In [79]:
len(pd.merge(y_test_df, y_pred_df, on='index')['운동종목'].unique())

85

**평가지표**  
[참고1](https://magicode.tistory.com/38)  
[참고2](https://leedakyeong.tistory.com/entry/%EB%B6%84%EB%A5%98-%EB%AA%A8%EB%8D%B8-%EC%84%B1%EB%8A%A5-%ED%8F%89%EA%B0%80-%EC%A7%80%ED%91%9C-Confusion-Matrix%EB%9E%80-%EC%A0%95%ED%99%95%EB%8F%84Accuracy-%EC%A0%95%EB%B0%80%EB%8F%84Precision-%EC%9E%AC%ED%98%84%EB%8F%84Recall-F1-Score)

In [127]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(accuracy_score(y_test, y_predict)) # 정확도
print(recall_score(y_test, y_predict, average='weighted')) # 재현도
print(precision_score(y_test, y_predict, average='weighted')) # 정밀도
print(f1_score(y_test, y_predict, average='weighted')) # F1 Score

0.47651579846285225
0.47651579846285225
0.22706730618468965
0.3075717935711643


  _warn_prf(average, modifier, msg_start, len(result))


---
### KNN

In [73]:
# GridSearchCV: 교차 검증과 하이퍼 파라미터 튜닝을 동시에 수행
ct = ColumnTransformer([
    ('scaling', StandardScaler(), ['생년', '키(센티)', '몸무게(kg)', '운동 시간(분)', '월 소득 합계-개인', '월평균 총 추가비용']),
    ('onehot', OneHotEncoder(), ['성별', '장애등급', '지체장애여부', '뇌병변장애여부', '01)만성질환명(고혈압)',
       '20)만성질환명(허리목통증)', '16)만성질환명(골관절염(퇴행성\n관절염))', '06)만성질환명(당뇨병)', '흡연 여부',
       '음주횟수', 'EQ-5D (통증/불편)', '운동 주기'])
])

X_train_pre = ct.fit_transform(X_train)
X_test_pre = ct.transform(X_test)


param_grid = {'n_neighbors': range(1, 16),
             'weights': ['uniform', 'distance'],
             'algorithm': ['ball_tree', 'kd_tree', 'brute'],
             'p': [1,2] # 1: 멘하튼 거리, 2: 유클리디안 거리
             }

knn_gs = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, n_jobs=multiprocessing.cpu_count(), cv = 5)

result = knn_gs.fit(X_train_pre, y_train)

print('최적 파라미터: {}'.format(knn_gs.best_params_))
print('최적 점수: {}'.format(knn_gs.best_score_))
print(result.best_estimator_)



최적 파라미터: {'algorithm': 'ball_tree', 'n_neighbors': 14, 'p': 2, 'weights': 'uniform'}
최적 점수: 0.4523809523809524
KNeighborsClassifier(algorithm='ball_tree', n_neighbors=14)


In [113]:
# 실제 y_test 값과 예측 y_predict 값 비교
knn_model = knn_gs.best_estimator_
knn_model.fit(X_train_pre, y_train)

y_predict = knn_model.predict(X_test_pre)
y_predict

array(['걷기/조깅', '걷기/조깅', '걷기/조깅', ..., '걷기/조깅', '걷기/조깅', '걷기/조깅'],
      dtype=object)

In [80]:
y_test_df = y_test.reset_index()
y_test_df.head()

Unnamed: 0,index,운동종목
0,370,등산
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,걷기/조깅&등산
4,776,걷기/조깅


In [81]:
y_pred_df = pd.concat([pd.DataFrame(y_test.index).rename(columns={0:'index'}), pd.DataFrame(y_predict)], axis=1).rename(columns={0:'운동종목(예측)'})
y_pred_df.head()

Unnamed: 0,index,운동종목(예측)
0,370,걷기/조깅
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,걷기/조깅
4,776,걷기/조깅


In [82]:
# 실제 y값과 예측 y값 데이터 결합
pd.merge(y_test_df, y_pred_df, on='index')

Unnamed: 0,index,운동종목,운동종목(예측)
0,370,등산,걷기/조깅
1,2580,걷기/조깅,걷기/조깅
2,5139,걷기/조깅,걷기/조깅
3,3827,걷기/조깅&등산,걷기/조깅
4,776,걷기/조깅,걷기/조깅
...,...,...,...
1166,1445,등산,걷기/조깅
1167,3810,걷기/조깅,걷기/조깅
1168,3764,걷기/조깅,걷기/조깅
1169,1417,걷기/조깅,걷기/조깅


In [83]:
pd.merge(y_test_df, y_pred_df, on='index')[['운동종목(예측)']].value_counts()

운동종목(예측)       
걷기/조깅              1095
걷기/조깅&맨손체조/스트레칭      56
등산                    8
기타                    3
맨손체조/스트레칭             2
수영                    2
등산&걷기/조깅              1
웨이트 트레이닝              1
웨이트 트레이닝&걷기/조깅        1
자전거                   1
호흡운동&걷기/조깅            1
dtype: int64

In [114]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(accuracy_score(y_test, y_predict)) # 정확도
print(recall_score(y_test, y_predict, average='weighted')) # 재현도
print(precision_score(y_test, y_predict, average='weighted')) # 정밀도
print(f1_score(y_test, y_predict, average='weighted')) # F1 Score

0.46883005977796754
0.46883005977796754
0.27881461254061657
0.3282621467095164


  _warn_prf(average, modifier, msg_start, len(result))


---
### 랜덤포레스트

In [87]:
# GridSearchCV
## 중요 매개변수: n_estimators, max_features이고, max_depth 같은 사전 가지치기 옵션이 있음

from sklearn.ensemble import RandomForestClassifier
param_grid = {'n_estimators': [10, 100, 200], # 결정트리 개수
             'max_features': ['sqrt', 'log'], # 각 노드에거 랜덤하게 선택할 '후보 특성' 개수 (분류: sqrt, 회귀: log)
             'max_depth': [3, 5, 7]
             }

rfc_gs = GridSearchCV(estimator = RandomForestClassifier(), cv= 5,
                 param_grid=param_grid, n_jobs = -1)

result = rfc_gs.fit(X_train, y_train)

print('최적 파라미터: {}'.format(rfc_gs.best_params_)) # 최적의 파라미터 출력
print('최적 점수: {}'.format(rfc_gs.best_score_)) # 최적 파라미터의 평가 점수 출력
print(result.best_estimator_) # 최적 파라미터로 학습된 estimator 출력

45 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\user\anaconda3\l

최적 파라미터: {'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 100}
최적 점수: 0.4593406593406593
RandomForestClassifier(max_depth=7, max_features='sqrt')


In [115]:
# 실제 y_test 값과 예측 y_predict 값 비교
rfc_model = rfc_gs.best_estimator_
rfc_model.fit(X_train, y_train)

y_predict = rfc_model.predict(X_test)
y_predict

array(['걷기/조깅', '걷기/조깅', '걷기/조깅', ..., '걷기/조깅', '걷기/조깅', '걷기/조깅'],
      dtype=object)

In [89]:
y_test_df = y_test.reset_index()
y_test_df.head()

Unnamed: 0,index,운동종목
0,370,등산
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,걷기/조깅&등산
4,776,걷기/조깅


In [90]:
y_pred_df = pd.concat([pd.DataFrame(y_test.index).rename(columns={0:'index'}), pd.DataFrame(y_predict)], axis=1).rename(columns={0:'운동종목(예측)'})
y_pred_df.head()

Unnamed: 0,index,운동종목(예측)
0,370,걷기/조깅
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,걷기/조깅
4,776,걷기/조깅


In [91]:
# 실제 y값과 예측 y값 데이터 결합
pd.merge(y_test_df, y_pred_df, on='index')

Unnamed: 0,index,운동종목,운동종목(예측)
0,370,등산,걷기/조깅
1,2580,걷기/조깅,걷기/조깅
2,5139,걷기/조깅,걷기/조깅
3,3827,걷기/조깅&등산,걷기/조깅
4,776,걷기/조깅,걷기/조깅
...,...,...,...
1166,1445,등산,걷기/조깅
1167,3810,걷기/조깅,걷기/조깅
1168,3764,걷기/조깅,걷기/조깅
1169,1417,걷기/조깅,걷기/조깅


In [92]:
pd.merge(y_test_df, y_pred_df, on='index')[['운동종목(예측)']].value_counts()

운동종목(예측)       
걷기/조깅              1162
등산                    7
걷기/조깅&맨손체조/스트레칭       1
자전거&걷기/조깅             1
dtype: int64

In [116]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(accuracy_score(y_test, y_predict)) # 정확도
print(recall_score(y_test, y_predict, average='weighted')) # 재현도
print(precision_score(y_test, y_predict, average='weighted')) # 정밀도
print(f1_score(y_test, y_predict, average='weighted')) # F1 Score

0.4807856532877882
0.4807856532877882
0.40056206905244957
0.3159556399500407


  _warn_prf(average, modifier, msg_start, len(result))


---
### Soft Voting

In [93]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# 약한 학습기 생성
svc_model = SVC(probability=True) # soft 방식을 위해서 probability 옵션 지정 필요
knn_model = KNeighborsClassifier()
dt_model = DecisionTreeClassifier()

# 앙상블 모델 구축(soft voting)
voting_model = VotingClassifier(
    estimators = [('svc', svc_model),('knn', knn_model), ('dt', dt_model)], # 3개의 약한 학습기
    voting = 'soft', # 간접 투표
    n_jobs = -1
)

# 앙상블 모델 학습
voting_model.fit(X_train, y_train)

# 모델 비교
for model in (svc_model, knn_model, dt_model, voting_model):
    model.fit(X_train, y_train)
    print(model.__class__.__name__, '학습 데이터 점수: {}'.format(model.score(X_train, y_train)))
    print(model.__class__.__name__, '평가 데이터 점수: {}'.format(model.score(X_test, y_test)), '\n')

SVC 학습 데이터 점수: 0.45714285714285713
SVC 평가 데이터 점수: 0.47651579846285225 

KNeighborsClassifier 학습 데이터 점수: 0.5014652014652015
KNeighborsClassifier 평가 데이터 점수: 0.426131511528608 

DecisionTreeClassifier 학습 데이터 점수: 1.0
DecisionTreeClassifier 평가 데이터 점수: 0.27241673783091375 

VotingClassifier 학습 데이터 점수: 0.9802197802197802
VotingClassifier 평가 데이터 점수: 0.34244235695986336 



In [117]:
# 실제 y_test 값과 예측 y_predict 값 비교
y_predict = voting_model.predict(X_test)
y_predict

array(['자전거&걷기/조깅', '걷기/조깅', '걷기/조깅', ..., '걷기/조깅&맨손체조/스트레칭', '걷기/조깅&자전거',
       '걷기/조깅'], dtype=object)

In [95]:
y_test_df = y_test.reset_index()
y_test_df.head()

Unnamed: 0,index,운동종목
0,370,등산
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,걷기/조깅&등산
4,776,걷기/조깅


In [96]:
y_pred_df = pd.concat([pd.DataFrame(y_test.index).rename(columns={0:'index'}), pd.DataFrame(y_predict)], axis=1).rename(columns={0:'운동종목(예측)'})
y_pred_df.head()

Unnamed: 0,index,운동종목(예측)
0,370,자전거&걷기/조깅
1,2580,걷기/조깅
2,5139,걷기/조깅
3,3827,맨손체조/스트레칭
4,776,걷기/조깅


In [97]:
# 실제 y값과 예측 y값 데이터 결합
pd.merge(y_test_df, y_pred_df, on='index')

Unnamed: 0,index,운동종목,운동종목(예측)
0,370,등산,자전거&걷기/조깅
1,2580,걷기/조깅,걷기/조깅
2,5139,걷기/조깅,걷기/조깅
3,3827,걷기/조깅&등산,맨손체조/스트레칭
4,776,걷기/조깅,걷기/조깅
...,...,...,...
1166,1445,등산,등산
1167,3810,걷기/조깅,걷기/조깅&맨손체조/스트레칭
1168,3764,걷기/조깅,걷기/조깅&맨손체조/스트레칭
1169,1417,걷기/조깅,걷기/조깅&자전거


In [98]:
pd.merge(y_test_df, y_pred_df, on='index')[['운동종목(예측)']].value_counts()

운동종목(예측)       
걷기/조깅              673
걷기/조깅&맨손체조/스트레칭    170
맨손체조/스트레칭           43
자전거                 29
맨손체조/스트레칭&걷기/조깅     22
                  ... 
요가&수영                1
요가&걷기/조깅             1
등산&기타                1
수영&자전거               1
호흡운동&맨손체조/스트레칭       1
Length: 64, dtype: int64

In [99]:
pd.merge(y_test_df, y_pred_df, on='index')[['운동종목']].value_counts()

운동종목           
걷기/조깅              558
걷기/조깅&맨손체조/스트레칭    188
맨손체조/스트레칭           57
자전거                 33
걷기/조깅&자전거           21
                  ... 
수영&등산                1
수영&볼링                1
수영&자전거               1
수중운동&호흡운동            1
호흡운동&웨이트 트레이닝        1
Length: 85, dtype: int64

In [118]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(accuracy_score(y_test, y_predict)) # 정확도
print(recall_score(y_test, y_predict, average='weighted')) # 재현도
print(precision_score(y_test, y_predict, average='weighted')) # 정밀도
print(f1_score(y_test, y_predict, average='weighted')) # F1 Score

0.34244235695986336
0.34244235695986336
0.29754568612627347
0.31696742212991413


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


---
### Gradient Boosting: 여러 개의 결정 트리를 묶어 강력한 모델을 만드는 또 다른 앙상블 방법

In [119]:
from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier()
gbrt.fit(X_train, y_train)

print('학습 데이터 점수: {}'.format(gbrt.score(X_train, y_train)))
print('평가 데이터 점수: {}'.format(gbrt.score(X_test, y_test)))

학습 데이터 점수: 0.7677655677655678
평가 데이터 점수: 0.42442356959863364


In [128]:
y_predict = gbrt.predict(X_test)

In [129]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(accuracy_score(y_test, y_predict)) # 정확도
print(recall_score(y_test, y_predict, average='weighted')) # 재현도
print(precision_score(y_test, y_predict, average='weighted')) # 정밀도
print(f1_score(y_test, y_predict, average='weighted')) # F1 Score

0.42442356959863364
0.4244235695986336
0.30021133073389816
0.3289930985784192


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


---
### 범주형 변수 전처리
#### 가변수 만들기(흡연여부, 음주횟수, 통증/불편 운동 주기)

In [165]:
# factor형 변환
variableNames = ['장애등급', '흡연 여부', '음주횟수', '운동 주기', 'EQ-5D (통증/불편)']
for i in range(len(variableNames)):
    data[variableNames[i]] = data[variableNames[i]].astype('object')

In [166]:
data[['장애등급', '흡연 여부', '음주횟수', '운동 주기', 'EQ-5D (통증/불편)']].info() # 변환 결과 확인

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2260 entries, 0 to 6249
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   장애등급           2260 non-null   object
 1   흡연 여부          2260 non-null   object
 2   음주횟수           2260 non-null   object
 3   운동 주기          2260 non-null   object
 4   EQ-5D (통증/불편)  2260 non-null   object
dtypes: object(5)
memory usage: 105.9+ KB


In [167]:
catego_df = pd.get_dummies(data[['장애등급', '흡연 여부', '음주횟수', '운동 주기', 'EQ-5D (통증/불편)']])
data_result = pd.concat([data, catego_df], axis=1)
data_result.head()

Unnamed: 0,성별,생년,장애등급,지체장애여부,뇌병변장애여부,01)만성질환명(고혈압),20)만성질환명(허리목통증),16)만성질환명(골관절염(퇴행성\n관절염)),06)만성질환명(당뇨병),흡연 여부,...,운동 주기_1,운동 주기_2,운동 주기_3,운동 주기_4,운동 주기_5,운동 주기_6,EQ-5D (통증/불편)_0,EQ-5D (통증/불편)_1,EQ-5D (통증/불편)_2,EQ-5D (통증/불편)_3
0,0,1970,6,1,0,0,0,0,0,3,...,0,0,0,1,0,0,0,1,0,0
2,0,1976,6,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,1,1958,5,1,0,0,1,0,0,4,...,1,0,0,0,0,0,0,0,1,0
8,1,1955,5,1,0,1,1,0,0,4,...,0,0,1,0,0,0,0,0,1,0
13,1,1952,6,0,1,1,0,0,0,4,...,1,0,0,0,0,0,0,0,1,0
