In [None]:
import math, sys, os
import pandas as pd
import numpy as np
import sklearn
import itertools
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb
import tensorflow as tf

In [2]:
col_dtype = {
    'REG_YYMM': 'str',
    'CARD_SIDO_NM': 'str',
    'CARD_CCG_NM': 'str',
    'STD_CLSS_NM': 'str',
    'HOM_SIDO_NM': 'str',
    'HOM_CCG_NM': 'str',
    'AGE': 'str',
    'SEX_CTGO_CD': 'str',
    'FLC': 'str',
    'CSTMR_CNT': 'int64',
    'AMT': 'int64',
    'CNT': 'int64'
}

In [3]:
# 날짜 처리
df = pd.read_csv('input/201901-202003.csv', dtype=col_dtype)
df = df.fillna('세종시')

df['REG_YEAR'] = df['REG_YYMM'].str[:4]
df['REG_MONTH'] = df['REG_YYMM'].str[4:]

In [6]:
# template 생성
id_reg = ['201901','201902','201903','201904','201905', '201906', '201907', '201908', '201909', '201910', '201911', '201912', '202001', '202002', '202003', '202004']

id_SIDO = df['CARD_SIDO_NM'].unique()
id_CCG = df['CARD_CCG_NM'].unique()
id_CLSS = df['STD_CLSS_NM'].unique()
id_AGE = df['AGE'].unique()
id_SEX_CTGO_CD = df['SEX_CTGO_CD'].unique()
id_FLC = df['FLC'].unique()

# candi = np.array(list(itertools.product(id_reg, id_SIDO, id_CLSS, id_AGE, id_SEX_CTGO_CD, id_FLC)))
candi = np.array(list(itertools.product(id_reg, id_SIDO, id_CLSS)))

In [8]:
# Left Parts
# df_temp = pd.DataFrame(candi, columns=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC'])
df_temp = pd.DataFrame(candi, columns=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'])
df_temp['REG_YEAR'] = df_temp['REG_YYMM'].str[:4]
df_temp['REG_MONTH'] = df_temp['REG_YYMM'].str[4:]
df_temp.drop('REG_YYMM', axis=1, inplace=True)

In [9]:
# Right Parts
# Group By
# cols_index = ['REG_YEAR', 'REG_MONTH', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AGE', 'SEX_CTGO_CD', 'FLC']
cols_index = ['REG_YEAR', 'REG_MONTH', 'CARD_SIDO_NM', 'STD_CLSS_NM']
# cols_value = ['CSTMR_CNT', 'CNT', 'AMT']
cols_value = ['AMT']

# gb = df[cols_index+cols_value].groupby(by=cols_index+cols_value).agg({'AMT':'sum', 'CNT':'sum', 'CSTMR_CNT':'sum'})
gb = df[cols_index+cols_value].groupby(by=cols_index).agg({'AMT':'sum'})
gb.reset_index(inplace=True)

In [10]:
# Merge!
df_merged = pd.merge(df_temp, gb, how='left', on=cols_index)
df_merged['AMT'].fillna(0, inplace=True)

In [11]:
# 인코딩
encoders = dict()
for col in cols_index:
    encoder = LabelEncoder()
    encoder.fit(df_merged[col])
    encoders[col] = encoder

df_num = df_merged.copy()        
for col in cols_index:
    encoder = encoders[col]
    df_num[col] = encoder.transform(df_merged[col])

In [13]:
def build_dataset(tts):
    t1, t2, t3, t4, ty = tts
    x1 = df_num[(df_merged['REG_YEAR'] + df_merged['REG_MONTH']) == t1]['AMT'].values
    x2 = df_num[(df_merged['REG_YEAR'] + df_merged['REG_MONTH']) == t2]['AMT'].values
    x3 = df_num[(df_merged['REG_YEAR'] + df_merged['REG_MONTH']) == t3]['AMT'].values
    x4 = df_num[(df_merged['REG_YEAR'] + df_merged['REG_MONTH']) == t4]['AMT'].values
    y = df_num[(df_merged['REG_YEAR'] + df_merged['REG_MONTH']) == ty]['AMT'].values
    
    return np.array([x1,x2,x3,x4,y])

In [14]:
reg_comb = [
    ['201901', '201902', '201903', '201904', '201907'],
    ['201902', '201903', '201904', '201905', '201908'],
    ['201903', '201904', '201905', '201906', '201909'],
    ['201904', '201905', '201906', '201907', '201910'],
    ['201905', '201906', '201907', '201908', '201911'],
    ['201906', '201907', '201908', '201909', '201912'],
    ['201907', '201908', '201909', '201910', '202001'],
    ['201908', '201909', '201910', '201911', '202002'],
    ['201909', '201910', '201911', '201912', '202003'],
]

In [15]:
reg_test = ['201910', '201911', '201912', '202001', '202004']

In [16]:
ds = []
for reg in reg_comb:
    xy = build_dataset(reg)
    ds.append(xy)
    
ds = np.array(ds)

In [17]:
xy = build_dataset(reg_test)

In [18]:
x_te = np.array([np.log1p(xy[:4])])

In [28]:
from tensorflow.keras import backend as k
from tensorflow.keras.layers import Input, Dense, Dropout, concatenate
from tensorflow.keras.layers import Bidirectional, LSTM, Attention
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam, RMSprop

In [29]:
x_trn = np.log1p(ds[:-1,:4])
y_trn = np.log1p(ds[:-1,-1:])

x_val = np.log1p(ds[-1:, :4])
y_val = np.log1p(ds[-1:, -1:])

In [30]:
k.clear_session()
# xYear = Input(batch_shape=(None, 1))
# xMonth = Input(batch_shape=(None, 12))
# xYearEmb = Dense(5)(xYear)
# xMonthEmb = Dense(5)(xMonth)

xInput = Input(batch_shape=(None, 4, 697))
xDrop = Dropout(0.3)(xInput)
xLstm = Bidirectional(LSTM(64, return_sequences=True))(xDrop)
xLstm = Bidirectional(LSTM(64))(xLstm)
xDense = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(xLstm)

# xConcat = concatenate([xLstm, xYearEmb, xMonthEmb])
xOutput = Dense(697)(xDense)
xOutput = k.expand_dims(xOutput, axis=1)


model = Model(xInput, xOutput)
model.compile(
    loss='mean_squared_error',
    optimizer=Adam(learning_rate=0.005))

print(model.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4, 697)]          0         
_________________________________________________________________
dropout (Dropout)            (None, 4, 697)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 4, 128)            390144    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 697)               89913     
_________________________________________________________________
tf_op_layer_ExpandDims (Tens [(None, 1, 697)]          0     

In [32]:
model.fit(x_trn, y_trn, epochs=500, validation_data=[x_val, y_val], verbose=1)

Train on 8 samples, validate on 1 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch

<tensorflow.python.keras.callbacks.History at 0x26ef3da1388>

# Real Test

In [42]:
pred = model.predict([x_te])

In [43]:
pred_decode = 10**pred

In [47]:
(pred_decode[0,0]<0).sum()

0

In [48]:
df_rt = df_merged[(df_merged['REG_YEAR'] + df_merged['REG_MONTH']) == '202004']

In [50]:
df_rt['AMT'] = pred_decode[0, 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [51]:
gb_rt = df_rt.groupby(['CARD_SIDO_NM', 'STD_CLSS_NM']).agg({'AMT':'sum'})

In [53]:
# 제출 파일 만들기
subm = pd.read_csv('input/submission.csv', index_col=0)

In [54]:
subm['AMT'] = np.concatenate([gb_rt['AMT'].values, gb_rt['AMT'].values])

In [55]:
subm.index.name = 'id'
subm.to_csv('submission_bigcat.csv', encoding='utf-8-sig')
subm.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,2.501078e+20
1,202004,강원,골프장 운영업,4.059351e+23
2,202004,강원,과실 및 채소 소매업,5.246135e+22
3,202004,강원,관광 민예품 및 선물용품 소매업,1.356714e+19
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,70.0


In [56]:
subm

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,2.501078e+20
1,202004,강원,골프장 운영업,4.059351e+23
2,202004,강원,과실 및 채소 소매업,5.246135e+22
3,202004,강원,관광 민예품 및 선물용품 소매업,1.356714e+19
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,7.000000e+01
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,8.075007e+22
1390,202007,충북,한식 음식점업,4.584817e+25
1391,202007,충북,호텔업,1.431037e+19
1392,202007,충북,화장품 및 방향제 소매업,1.020669e+22
