# 도매가 데이터 결측값 보간 전처리

### import

In [1]:
import pandas as pd
import numpy as np
import datetime
import copy 

### 데이터셋 input

In [2]:
# Load the wholesale price CSV; ',' thousands separators are parsed as part of
# the number and the first CSV column is used as the index.
도매가격 = pd.read_csv(
    '../data/도매가/종합/도매_가격_완성본_13_21년도.csv',
    index_col=0,
    thousands=',',
)

In [3]:
# Rename the generic '가격' (price) column to '도매가격' (wholesale price).
도매가격 = 도매가격.rename(columns={'가격': '도매가격'})

## 도매 비어있는 날짜 추가 및 결측치 보간

### 기간 내 날짜 전체 생성

In [4]:
# One timestamp per calendar day across the study period, both ends inclusive.
start_date = pd.Timestamp('2013-01-01')  # first day
end_date = pd.Timestamp('2021-12-31')  # last day
dates = pd.date_range(start=start_date, end=end_date, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10',
               ...
               '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
               '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
               '2021-12-30', '2021-12-31'],
              dtype='datetime64[ns]', length=3287, freq='D')

In [5]:
# Wrap the daily index in a one-column frame; the column keeps the default
# label 0 until it is renamed later.
dates = pd.DataFrame({0: dates})
dates

Unnamed: 0,0
0,2013-01-01
1,2013-01-02
2,2013-01-03
3,2013-01-04
4,2013-01-05
...,...
3282,2021-12-27
3283,2021-12-28
3284,2021-12-29
3285,2021-12-30


In [6]:
# Stack five copies of the daily calendar (one copy per region) so that every
# date ends up appearing five times.
# pd.concat replaces DataFrame.append, which is deprecated (it raises a
# FutureWarning) and was removed in pandas 2.0. Concatenating whole copies also
# removes the hard-coded ':3286' slice that tied the code to one date range.
dates = pd.DataFrame(dates)
dates = pd.concat([dates] * 5, ignore_index=True)
dates

  dates = dates.append(dates.loc[:3286], ignore_index = True)


Unnamed: 0,0
0,2013-01-01
1,2013-01-02
2,2013-01-03
3,2013-01-04
4,2013-01-05
...,...
16430,2021-12-27
16431,2021-12-28
16432,2021-12-29
16433,2021-12-30


In [7]:
# Label the single column '날짜' (date); until now it carried the default label 0.
dates.columns = ['날짜']

In [8]:
# Order rows chronologically so the copies of each date sit next to each
# other, and renumber the rows from 0.
dates = dates.sort_values('날짜', ignore_index=True)

In [9]:
## Build the frame covering every date for every region

# After sorting, each date occupies len(regions) consecutive rows, so cycling
# through the region list labels every (date, region) pair exactly once.
# The repeat count is derived from the frame length rather than the literal
# 3287, so a different date range keeps working; a length that is not a
# multiple of the region count fails loudly on assignment.
regions = ['서울', '대전', '대구', '부산', '광주']
dates["지역"] = regions * (len(dates) // len(regions))
dates

Unnamed: 0,날짜,지역
0,2013-01-01,서울
1,2013-01-01,대전
2,2013-01-01,대구
3,2013-01-01,부산
4,2013-01-01,광주
...,...,...
16430,2021-12-31,서울
16431,2021-12-31,대전
16432,2021-12-31,대구
16433,2021-12-31,부산


### 도매가 보간

In [10]:
# Parse the '날짜' column as datetimes using the strict YYYY-MM-DD format.
parsed_dates = pd.to_datetime(도매가격['날짜'], format='%Y-%m-%d')
도매가격 = 도매가격.assign(날짜=parsed_dates)
도매가격.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12145 entries, 0 to 4254
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      12145 non-null  datetime64[ns]
 1   지역      12145 non-null  object        
 2   품명      12145 non-null  object        
 3   도매가격    12033 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 474.4+ KB


### 겹치는 날짜 처리를 위한 거래량 분석

In [11]:
## Convert the datetime column back to str

# Dates are rendered as 'YYYY-MM-DD' text, the form used for the later
# equality comparisons and merge keys.
date_text = 도매가격["날짜"].dt.strftime('%Y-%m-%d')
도매가격["날짜"] = date_text
도매가격.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12145 entries, 0 to 4254
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   날짜      12145 non-null  object 
 1   지역      12145 non-null  object 
 2   품명      12145 non-null  object 
 3   도매가격    12033 non-null  float64
dtypes: float64(1), object(3)
memory usage: 474.4+ KB


In [12]:
# Discard every row that has a missing value in any column.
도매가격 = 도매가격.dropna()
도매가격.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12033 entries, 0 to 4254
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   날짜      12033 non-null  object 
 1   지역      12033 non-null  object 
 2   품명      12033 non-null  object 
 3   도매가격    12033 non-null  float64
dtypes: float64(1), object(3)
memory usage: 470.0+ KB


In [13]:
# Renumber rows 0..n-1 so that index labels and row positions coincide.
도매가격 = 도매가격.reset_index(drop=True)

In [14]:
## Collect the dates on which a region has prices for more than one 품명

# For every row, count the distinct 품명 values that share its (날짜, 지역)
# pair; a count above one means another item was quoted for the same region
# on the same day. One grouped pass replaces filtering the whole frame once
# per row (quadratic in the number of rows) and yields the same dates, in row
# order, with one entry per qualifying row.
# The frame has already been through dropna(), so 품명 is never NaN here and
# the old NaN-item branch could not fire.
names_per_day = 도매가격.groupby(['날짜', '지역'])['품명'].transform('nunique')
date_list = 도매가격.loc[names_per_day > 1, '날짜'].tolist()

In [15]:
# Display the collected dates; a date is listed once per matching row, so values repeat.
date_list

['2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-02',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-03',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-01-04',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-19',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-22',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-23',
 '2013-04-24',
 '2013-04-24',
 '2013-04-24',
 '2013-04-24',
 '2013-04-24',
 '2013-04-24',
 '2013-04-

In [16]:
# Remove duplicate dates, then order them; 'YYYY-MM-DD' strings sort
# chronologically.
temp = list(set(date_list))
new_date = list(temp)
new_date.sort()

In [17]:
# Second name for the sorted unique dates; both names refer to the same list object.
res_date = new_date

In [18]:
# Load the traded-volume CSV; the first CSV column is used as the index.
df_temp = pd.read_csv(
    "../data/거래량/13_21년도_5대시장_거래량_종합.csv",
    index_col=0,
)
df_temp

Unnamed: 0,날짜,시장,품명,거래량
0,2013-01-02,광주,가을,19514
1,2013-01-02,대전,가을,20390
2,2013-01-02,광주,겨울,24668
3,2013-01-02,부산,겨울,8000
4,2013-01-03,광주,가을,14006
...,...,...,...,...
14624,2021-12-31,광주,가을,18700
14625,2021-12-31,대전,가을,21690
14626,2021-12-31,부산,가을,25000
14627,2021-12-31,서울,가을,263060


In [19]:
## For each overlapping date, record the 품명 with the largest total traded volume

item_list = []
for day in res_date:
    day_rows = df_temp[df_temp["날짜"] == day]
    # Sum only 거래량. Summing the whole grouped frame also aggregates the text
    # columns (날짜, 시장): older pandas dropped them with a deprecation
    # warning, and pandas 2 concatenates the strings instead.
    vol = day_rows.groupby('품명')['거래량'].sum()
    top = vol.max()
    # Store the Index of winning labels. It is empty when the date has no
    # volume rows, which keeps the shape the next cell expects.
    item_list.append(vol[vol == top].index)
item_list

[Index(['가을'], dtype='object', name='품명'),
 Index(['겨울'], dtype='object', name='품명'),
 Index(['겨울'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['봄'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['가을'], dtype='object', name='품명'),
 Index(['겨울'], dtype='object', name='품명'),
 Index(['겨울'], dtype='object', name='품명'),
 Index(['겨울'], dtype='object', name='품명'),
 Index(['겨울'], dtyp

In [20]:
# Turn the per-date winners into a one-column frame, one row per Index in
# item_list. Dates without volume data presumably produce the NaN rows that
# are inspected in the next cell.
# NOTE(review): a tie would yield an Index holding two labels, which likely
# does not fit the single '품명' column — confirm ties cannot occur.
우세품 = pd.DataFrame(item_list, columns = ["품명"])
우세품

Unnamed: 0,품명
0,가을
1,겨울
2,겨울
3,봄
4,봄
...,...
209,가을
210,가을
211,가을
212,가을


In [21]:
# Show the rows for which no dominant 품명 could be determined.
우세품.loc[우세품['품명'].isna()]

Unnamed: 0,품명
95,
96,
129,
141,


In [22]:
## Where no volume data exists, fill with the same value as the preceding entry

# NOTE(review): the row labels are hard-coded from the isna() inspection of
# this particular data set; re-check them if the inputs change.
for label, rows in {'가을': [95, 96, 141], '봄': [129]}.items():
    우세품.loc[rows, "품명"] = label

In [23]:
## Build a frame of the dates that have overlapping items

dup_date = pd.DataFrame({'날짜': res_date})
dup_date

Unnamed: 0,날짜
0,2013-01-02
1,2013-01-03
2,2013-01-04
3,2013-04-19
4,2013-04-22
...,...
209,2021-12-15
210,2021-12-16
211,2021-12-17
212,2021-12-20


In [24]:
# Put each overlapping date side by side with its dominant 품명 (aligned on the
# row index).
우세거래품 = pd.concat((dup_date, 우세품), axis='columns')

In [25]:
## Build the list of row labels to remove (price rows whose 품명 is not the dominant one)

idx_ls = []
for date, item in zip(우세거래품.iloc[:, 0], 우세거래품.iloc[:, 1]):
    print(date, item)
    # NOTE(review): the filter matches on date only, so non-dominant rows are
    # selected for every region on that date — confirm this is intended.
    losers = 도매가격.날짜.eq(date) & 도매가격.품명.ne(item)
    idx_ls.extend(도매가격.loc[losers].index)
    print('---------------------------------------------------------')

2013-01-02 가을
---------------------------------------------------------
2013-01-03 겨울
---------------------------------------------------------
2013-01-04 겨울
---------------------------------------------------------
2013-04-19 봄
---------------------------------------------------------
2013-04-22 봄
---------------------------------------------------------
2013-04-23 봄
---------------------------------------------------------
2013-04-24 봄
---------------------------------------------------------
2013-04-25 봄
---------------------------------------------------------
2013-04-26 봄
---------------------------------------------------------
2013-04-29 봄
---------------------------------------------------------
2013-04-30 봄
---------------------------------------------------------
2013-10-29 가을
---------------------------------------------------------
2013-10-30 가을
---------------------------------------------------------
2013-10-31 가을
---------------------------------------------------------


---------------------------------------------------------
2020-12-17 가을
---------------------------------------------------------
2020-12-18 가을
---------------------------------------------------------
2020-12-21 가을
---------------------------------------------------------
2020-12-22 가을
---------------------------------------------------------
2020-12-23 가을
---------------------------------------------------------
2020-12-24 가을
---------------------------------------------------------
2020-12-28 가을
---------------------------------------------------------
2020-12-29 가을
---------------------------------------------------------
2020-12-30 가을
---------------------------------------------------------
2020-12-31 가을
---------------------------------------------------------
2021-05-03 봄
---------------------------------------------------------
2021-05-04 봄
---------------------------------------------------------
2021-05-06 봄
---------------------------------------------------------
2021-05-0

In [26]:
# Remove the collected rows and renumber what remains from 0.
remaining_rows = 도매가격.drop(index=idx_ls)
도매중복수정 = remaining_rows.reset_index(drop=True)
도매중복수정

Unnamed: 0,날짜,지역,품명,도매가격
0,2013-01-02,서울,가을,13000.0
1,2013-01-02,부산,가을,12000.0
2,2013-01-02,대구,가을,12000.0
3,2013-01-02,광주,가을,12000.0
4,2013-01-02,대전,가을,13000.0
...,...,...,...,...
11013,2021-12-31,서울,겨울,8000.0
11014,2021-12-31,부산,겨울,9000.0
11015,2021-12-31,대구,겨울,9000.0
11016,2021-12-31,광주,겨울,9120.0


In [27]:
# Render the calendar dates as 'YYYY-MM-DD' text, the same form the price
# table uses for its '날짜' column.
dates = dates.assign(날짜=dates['날짜'].dt.strftime('%Y-%m-%d'))
dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16435 entries, 0 to 16434
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   날짜      16435 non-null  object
 1   지역      16435 non-null  object
dtypes: object(2)
memory usage: 256.9+ KB


In [28]:
## Attach the available price data to the full date x region grid

# Left join: every (날짜, 지역) row of the grid is kept, and rows without a
# matching price record get NaN in the price-table columns.
temp = dates.merge(도매중복수정, how='left', on=['날짜', '지역'])
temp

Unnamed: 0,날짜,지역,품명,도매가격
0,2013-01-01,서울,,
1,2013-01-01,대전,,
2,2013-01-01,대구,,
3,2013-01-01,부산,,
4,2013-01-01,광주,,
...,...,...,...,...
16430,2021-12-31,서울,겨울,8000.0
16431,2021-12-31,대전,겨울,9700.0
16432,2021-12-31,대구,겨울,9000.0
16433,2021-12-31,부산,겨울,9000.0


In [29]:
## Split the merged frame into one frame per region

# Each slice is renumbered from 0 so that row 0 is that region's first day.
seoul_d, daejeon_d, daegu_d, busan_d, gwangju_d = (
    temp.loc[temp['지역'] == region_name].reset_index(drop=True)
    for region_name in ('서울', '대전', '대구', '부산', '광주')
)

# 전처리 - padding by ffill

In [30]:
## Fill price gaps separately for each region

# Work on independent copies so the *_d frames keep the un-filled data.
pad_ls = [
    frame.copy()
    for frame in (seoul_d, daejeon_d, daegu_d, busan_d, gwangju_d)
]
seoul_p, daejeon_p, daegu_p, busan_p, gwangju_p = pad_ls

for value in pad_ls:
    # Carry the last known price forward and assign it back explicitly.
    # The previous chained 'value.도매가격.fillna(..., inplace=True)' modifies
    # a temporary Series under pandas Copy-on-Write (so the frame stays
    # unchanged), relied on the deprecated fillna(method=...) argument, and
    # stored its None result in 'temp', overwriting the merged frame.
    value['도매가격'] = value['도매가격'].ffill()
    # Print whatever is still missing (a leading gap has nothing to copy from).
    print(value[value.도매가격.isna()])

           날짜  지역   품명  도매가격
0  2013-01-01  서울  NaN   NaN
           날짜  지역   품명  도매가격
0  2013-01-01  대전  NaN   NaN
           날짜  지역   품명  도매가격
0  2013-01-01  대구  NaN   NaN
           날짜  지역   품명  도매가격
0  2013-01-01  부산  NaN   NaN
           날짜  지역   품명  도매가격
0  2013-01-01  광주  NaN   NaN


In [31]:
## Replace the missing start-date value with the mean of the previous day (2012-12-31) and the following day

pr = [13000, 13000, 12000, 12000, 11500]
for first_price, region_frame in zip(pr, pad_ls):
    # Row 0 / column 3 is the first day's 도매가격 cell; pr follows pad_ls order.
    region_frame.iloc[0, 3] = first_price

In [32]:
## Fill in 품명

for item in pad_ls:
    # Carry the most recent values forward across gaps of at most 30 rows.
    # DataFrame.ffill is the supported spelling of the deprecated
    # interpolate(method='pad').
    item.ffill(limit=30, inplace=True)
    # Fill the start-date 품명 with '가을'. Only the 품명 column is touched, so
    # the string can never end up in another column.
    # NOTE(review): any 품명 gap longer than 30 rows is also set to '가을' here
    # (as before) — confirm that is intended.
    item['품명'] = item['품명'].fillna('가을')

### 데이터셋 파일 저장

In [33]:
# Write each region's gap-filled table to its own CSV, encoded as utf-8-sig.
for region_frame, out_path in (
    (seoul_p, '../data/도매가/보간/13_21_서울_도매가보간.csv'),
    (daejeon_p, '../data/도매가/보간/13_21_대전_도매가보간.csv'),
    (daegu_p, '../data/도매가/보간/13_21_대구_도매가보간.csv'),
    (busan_p, '../data/도매가/보간/13_21_부산_도매가보간.csv'),
    (gwangju_p, '../data/도매가/보간/13_21_광주_도매가보간.csv'),
):
    region_frame.to_csv(path_or_buf=out_path, encoding='utf-8-sig')