In [1]:
import pandas as pd
from konlpy.tag import Kkma
from konlpy.utils import pprint

In [2]:
# Instantiate the KoNLPy Kkma tagger.
# NOTE(review): `kkma` is not referenced by any cell visible in this notebook —
# confirm it is still needed before keeping this cell.
kkma = Kkma()

In [3]:
# Location of the input CSV (vol1).
# NOTE(review): this is an absolute path on one machine; it must be edited before
# the notebook can run anywhere else.
RAW_CSV_PATH = r'D:\pycharmProject\BigdataProject_AbandonedAnimal\lostAnimal_20150101_20151231_vol1.csv'

# The file is decoded as CP949.
df = pd.read_csv(RAW_CSV_PATH, encoding='CP949')

In [4]:
# List the column names of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'age(before)', 'age(after)', 'careAddr', 'careNm',
       'careTel', 'chargeNm', 'colorCd', 'desertionNo', 'filename', 'happenDt',
       'happenPlace', 'kindCd', 'neuterYn', 'noticeComment', 'noticeEdt',
       'noticeNo', 'noticeSdt', 'officetel', 'orgNm', 'popfile',
       'processState', 'sexCd', 'specialMark', 'weight(before)',
       'weight(after)'],
      dtype='object')

In [5]:
# Summarize dtypes and non-null counts per column to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79745 entries, 0 to 79744
Data columns (total 26 columns):
Unnamed: 0        79745 non-null int64
age(before)       79745 non-null object
age(after)        79745 non-null float64
careAddr          79745 non-null object
careNm            79745 non-null object
careTel           79745 non-null object
chargeNm          76820 non-null object
colorCd           79745 non-null object
desertionNo       79745 non-null float64
filename          79745 non-null object
happenDt          79745 non-null int64
happenPlace       79745 non-null object
kindCd            79745 non-null object
neuterYn          79745 non-null object
noticeComment     449 non-null object
noticeEdt         79745 non-null int64
noticeNo          79745 non-null object
noticeSdt         79745 non-null int64
officetel         77970 non-null object
orgNm             79745 non-null object
popfile           79745 non-null object
processState      79745 non-null object
sexCd          

## kindCd, happenDt, happenPlace, colorCd

### kindCd

In [6]:
# Preview the raw kindCd strings, which look like '[species] breed' (e.g. '[개] 믹스견').
df['kindCd'].head()

0    [개] 믹스견
1    [개] 말티즈
2    [개] 믹스견
3    [개] 믹스견
4    [개] 믹스견
Name: kindCd, dtype: object

In [7]:
# Extract the species label that sits directly before the closing bracket of kindCd
# (e.g. '[개] 믹스견' -> '개'). The character class matches Hangul syllables only.
# The pattern is a raw string so that `\]` reaches the regex engine as an escaped
# bracket instead of being an invalid Python string escape.
df['kind'] = df['kindCd'].str.extract(r'([가-힣]+)\]', expand=False)
df['kind'].head()

0    개
1    개
2    개
3    개
4    개
Name: kind, dtype: object

In [8]:
# Integer code assigned to each species label found in kindCd.
kind_mapping = {'개': 0, '고양이': 1, '기타축종': 2}

# Derive the codes from kindCd rather than from df['kind'] itself, so the cell is
# idempotent: mapping df['kind'] in place would turn every value into NaN the second
# time the cell runs, because the already-numeric codes are not keys of kind_mapping.
df['kind'] = df['kindCd'].str.extract(r'([가-힣]+)\]', expand=False).map(kind_mapping)

In [9]:
# Count rows per species code (the integers assigned by kind_mapping).
df['kind'].value_counts()

0    58220
1    20602
2      923
Name: kind, dtype: int64

In [10]:
# Split kindCd on '] ' and keep the second piece of each resulting list as the
# breed string (e.g. '[개] 믹스견' -> '믹스견').
kind_parts = df['kindCd'].str.split('] ')
df['breed'] = kind_parts.str.get(1)

In [11]:
# Preview the breed names split out of kindCd.
df['breed'].head()

0    믹스견
1    말티즈
2    믹스견
3    믹스견
4    믹스견
Name: breed, dtype: object

In [12]:
# Frequency of each breed label, most common first; kept in `breed` and displayed.
breed = df['breed'].value_counts()
breed

믹스견                26492
고양이                20584
말티즈                 7741
푸들                  4873
시츄                  3736
진도견                 2604
요크셔 테리어             2316
포메라니안               1047
코카 스파니엘              978
스피츠                  893
슈나우져                 745
골든 리트리버              744
미니어쳐 핀셔              658
치와와                  535
닥스훈트                 468
비글                   430
페키니즈                 390
시베리안 허스키             320
토끼                   298
알라스칸 말라뮤트            198
잡종                   180
보스턴 테리어              167
혼종                   166
발바리                  161
셰퍼드                  159
포인터                  155
사모예드                 143
고슴도치                 136
그레이트 피레니즈            125
웰시 코기 펨브로크           110
                   ...  
아메리칸스테포드셔테리어           1
믹스 또는 라사압소             1
브레타니 스파니엘              1
푸들.말티즈 믹스              1
요크 믹스                  1
아프간 하운드(추정)            1
티베탄테리어                 1
제이크랜드테리어               1
시바견?                   1


### happenDt

In [13]:
# Preview happenDt before conversion; values are 8-digit integers such as 20151231.
df['happenDt'].head()

0    20151231
1    20151231
2    20151231
3    20151231
4    20151231
Name: happenDt, dtype: int64

In [14]:
# Convert the 8-digit YYYYMMDD values into datetime values.
df['happenDt'] = pd.to_datetime(df['happenDt'], format='%Y%m%d')

In [15]:
# Confirm happenDt now holds datetime64 values.
df['happenDt'].head()

0   2015-12-31
1   2015-12-31
2   2015-12-31
3   2015-12-31
4   2015-12-31
Name: happenDt, dtype: datetime64[ns]

In [16]:
# Store the weekday as its English name (e.g. 'Thursday').
# For a numeric weekday use `.dt.dayofweek` instead, where Monday is 0 and Sunday is 6.
df['happenWd'] = df['happenDt'].dt.day_name()

In [17]:
# Preview the derived weekday names.
df['happenWd'].head()

0    Thursday
1    Thursday
2    Thursday
3    Thursday
4    Thursday
Name: happenWd, dtype: object

In [18]:
# Count records per weekday name.
df['happenWd'].value_counts()

Monday       17591
Tuesday      13327
Wednesday    12857
Friday       12414
Thursday     12399
Saturday      7108
Sunday        4049
Name: happenWd, dtype: int64

In [19]:
# Take the month number (1-12) from each happenDt value, using the same `.dt`
# accessor as the weekday column.
df['happenMth'] = df['happenDt'].dt.month

In [20]:
# Preview the derived month numbers.
df['happenMth'].head()

0    12
1    12
2    12
3    12
4    12
Name: happenMth, dtype: int64

In [21]:
# Count records per month number.
df['happenMth'].value_counts()

7     8312
6     8271
8     7996
5     7683
9     7159
10    6946
4     6864
11    6088
3     5674
12    5410
1     5079
2     4263
Name: happenMth, dtype: int64

### weight (size category)

In [22]:
# Preview the weight(after) column that the size labels are derived from.
df['weight(after)'].head()

0    6.0
1    3.5
2    5.0
3    6.0
4    4.0
Name: weight(after), dtype: float64

In [23]:
# Bucket each record by weight(after) into a size label:
#   <= 3 -> '초소형', (3, 9] -> '소형', (9, 25] -> '중형', > 25 -> '대형'.
# Rows whose weight satisfies none of the comparisons (e.g. NaN) are left untouched.
weight_after = df['weight(after)']
size_rules = [
    ('초소형', weight_after <= 3),
    ('소형', (weight_after > 3) & (weight_after <= 9)),
    ('중형', (weight_after > 9) & (weight_after <= 25)),
    ('대형', weight_after > 25),
]
for size_label, size_mask in size_rules:
    df.loc[size_mask, 'size'] = size_label

In [24]:
# Preview the derived size labels.
df['size'].head()

0    소형
1    소형
2    소형
3    소형
4    소형
Name: size, dtype: object

### age (age group)

In [25]:
# Preview the age(after) column that the age-group labels are derived from.
df['age(after)'].head()

0    0.0
1    4.0
2    5.0
3    5.0
4    1.0
Name: age(after), dtype: float64

In [26]:
# Bucket each record by age(after) into an age-group label:
#   <= 1 -> '유견기', (1, 9] -> '성견기', > 9 -> '노견기'.
# Rows whose age satisfies none of the comparisons (e.g. NaN) are left untouched.
age_after = df['age(after)']
age_rules = [
    ('유견기', age_after <= 1),
    ('성견기', (age_after > 1) & (age_after <= 9)),
    ('노견기', age_after > 9),
]
for age_label, age_mask in age_rules:
    df.loc[age_mask, 'age_u'] = age_label

In [27]:
# Preview the derived age-group labels.
df['age_u'].head()

0    유견기
1    성견기
2    성견기
3    성견기
4    유견기
Name: age_u, dtype: object

## Save

In [28]:
# List all columns, including the derived ones, before saving.
df.columns

Index(['Unnamed: 0', 'age(before)', 'age(after)', 'careAddr', 'careNm',
       'careTel', 'chargeNm', 'colorCd', 'desertionNo', 'filename', 'happenDt',
       'happenPlace', 'kindCd', 'neuterYn', 'noticeComment', 'noticeEdt',
       'noticeNo', 'noticeSdt', 'officetel', 'orgNm', 'popfile',
       'processState', 'sexCd', 'specialMark', 'weight(before)',
       'weight(after)', 'kind', 'breed', 'happenWd', 'happenMth', 'size',
       'age_u'],
      dtype='object')

In [30]:
# Write the enriched frame to a CSV in the working directory.
# The input was decoded as CP949, which is a superset of EUC-KR. Encoding the output
# as 'euc-kr' raises UnicodeEncodeError for any Hangul syllable that exists only in
# CP949, so the output uses the same codec as the input. Text that EUC-KR can
# represent is encoded to the same bytes either way.
# NOTE(review): the index is still written (pandas default), which adds an unnamed
# leading column to the file.
df.to_csv('lostAnimal_20150101_20151231_vol2.csv', encoding='cp949')