In [1]:
import pandas as pd
import numpy
import os
import datetime,time
import sys
from konlpy.tag import Kkma
from konlpy.utils import pprint
from collections import Counter

In [2]:
df = pd.read_csv(r'D:\pycharmProject\BigdataProject_AbandonedAnimal\__result__\crawling\lostAnimal_20150101_20151231_pre.csv', encoding='CP949')

In [3]:
df.columns

Index(['Unnamed: 0', 'age(before)', 'age(after)', 'careAddr', 'careNm',
       'careTel', 'chargeNm', 'colorCd', 'desertionNo', 'filename', 'happenDt',
       'happenPlace', 'kindCd', 'neuterYn', 'noticeComment', 'noticeEdt',
       'noticeNo', 'noticeSdt', 'officetel', 'orgNm', 'popfile',
       'processState', 'sexCd', 'specialMark', 'weight'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79749 entries, 0 to 79748
Data columns (total 25 columns):
Unnamed: 0       79749 non-null int64
age(before)      79749 non-null object
age(after)       79749 non-null int64
careAddr         79749 non-null object
careNm           79749 non-null object
careTel          79749 non-null object
chargeNm         76824 non-null object
colorCd          79749 non-null object
desertionNo      79749 non-null float64
filename         79749 non-null object
happenDt         79749 non-null int64
happenPlace      79749 non-null object
kindCd           79749 non-null object
neuterYn         79749 non-null object
noticeComment    449 non-null object
noticeEdt        79749 non-null int64
noticeNo         79749 non-null object
noticeSdt        79749 non-null int64
officetel        77974 non-null object
orgNm            79749 non-null object
popfile          79749 non-null object
processState     79749 non-null object
sexCd            79749 non-null object


## kindCd, happenDt, happenPlace, colorCd

### kindCd

In [5]:
df['kindCd'].head()

0    [개] 믹스견
1    [개] 말티즈
2    [개] 믹스견
3    [개] 믹스견
4    [개] 믹스견
Name: kindCd, dtype: object

In [6]:
df['kind'] = df['kindCd'].str.extract('([가-힣]+)\]', expand=False) # 한글 정규식
df['kind'].head()

0    개
1    개
2    개
3    개
4    개
Name: kind, dtype: object

In [7]:
kind_mapping = {'개': 0, '고양이':1, '기타축종':2}
df['kind'] = df['kind'].map(kind_mapping)

In [8]:
df['kind'].value_counts()

0    58223
1    20603
2      923
Name: kind, dtype: int64

In [9]:
df['breed'] = df['kindCd'].str.split('] ').str[1]
# split으로 분리하여 얻은 list에서 2번째 값을 string으로 받음

In [10]:
df['breed'].head()

0    믹스견
1    말티즈
2    믹스견
3    믹스견
4    믹스견
Name: breed, dtype: object

### happenDt

In [11]:
df['happenDt'].head()

0    20151231
1    20151231
2    20151231
3    20151231
4    20151231
Name: happenDt, dtype: int64

In [12]:
df['happenDt'] = pd.to_datetime(df['happenDt'], format='%Y%m%d')
# 8자리를 날짜형식으로 바꿈

In [13]:
df['happenDt'].head()

0   2015-12-31
1   2015-12-31
2   2015-12-31
3   2015-12-31
4   2015-12-31
Name: happenDt, dtype: datetime64[ns]

In [14]:
# df['happenWd'] = df['happenDt'].dt.dayofweekday  # 요일을 숫자로 표현함 "0 = Sunday"
df['happenWd'] = df['happenDt'].dt.day_name()

In [15]:
df['happenWd'].head()

0    Thursday
1    Thursday
2    Thursday
3    Thursday
4    Thursday
Name: happenWd, dtype: object

In [16]:
df['happenMth'] = pd.DatetimeIndex(df['happenDt']).month
# 날짜에서 '월'값을 받음

In [17]:
df['happenMth'].head()

0    12
1    12
2    12
3    12
4    12
Name: happenMth, dtype: int64

### colorCd

In [18]:
kkma = Kkma()

In [19]:
colors = ' '.join(df['colorCd'])

In [20]:
pos = kkma.pos(colors)  # 형태소 분석 및 품사 태깅

In [21]:
cnt = Counter(pos)
# 가장 빈번하게 등장하는 형태소가 무엇인지 찾고자 할 때 사용 

In [22]:
cnt.most_common()  # 가장 많은 것부터 보여줌

[(('흰색', 'NNG'), 25917),
 (('갈색', 'NNG'), 17983),
 (('/', 'SP'), 12982),
 (('ㄴ', 'ETD'), 12587),
 (('희', 'VA'), 12090),
 (('ㄹ', 'ETD'), 10248),
 (('갈', 'VV'), 9970),
 ((',', 'SP'), 7497),
 (('검정', 'NNG'), 7191),
 (('검', 'NNG'), 6958),
 (('+', 'SW'), 6403),
 (('백색', 'NNG'), 2794),
 (('황색', 'NNG'), 2772),
 (('검정색', 'NNG'), 2630),
 (('백', 'NNG'), 2551),
 (('회색', 'NNG'), 2161),
 (('검은색', 'NNG'), 2098),
 (('황', 'NNG'), 1743),
 (('흑', 'NNG'), 1547),
 (('삼색', 'NNG'), 1500),
 (('백', 'NR'), 1320),
 (('&', 'SW'), 1191),
 (('색', 'NNG'), 1186),
 (('연갈색', 'NNG'), 1082),
 (('노랑', 'NNG'), 1028),
 (('연', 'NNG'), 982),
 (('회', 'NNG'), 951),
 (('은', 'ETD'), 740),
 (('흑백', 'NNG'), 626),
 (('.', 'SF'), 573),
 (('기타', 'NNG'), 552),
 (('검', 'VA'), 510),
 (('황백', 'NNG'), 473),
 (('흑색', 'NNG'), 471),
 (('줄', 'NNG'), 424),
 (('치즈', 'NNG'), 408),
 (('고등어', 'NNG'), 376),
 (('화이트', 'NNG'), 370),
 (('노란색', 'NNG'), 349),
 (('줄무늬', 'NNG'), 329),
 (('블랙', 'NNG'), 314),
 (('하양', 'NNG'), 303),
 (('노', 'NNG'), 301),
 ((

### specialMark

In [23]:
# special = ' '.join(df['specialMark'])

TypeError: sequence item 57: expected str instance, float found

In [24]:
for mark in df['specialMark']:
    special_pos = kkma.pos(str(mark))
    

In [25]:
special_cnt = Counter(special_pos)

In [26]:
special_cnt.most_common()

[(('순', 'NNG'), 1), (('하', 'XSV'), 1), (('ㅁ', 'ETN'), 1)]