In [1]:
# 1. 라이브러리 가져오기

import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
# 2. 데이터 전처리
# Data Cleansing & Pre-Processing

def grap_year(data):
    """Return the first four characters of ``str(data)`` as an int.

    Used on REG_YYMM values (e.g. ``201901`` -> ``2019``).
    """
    return int(str(data)[:4])

def grap_month(data):
    """Return everything after the fourth character of ``str(data)`` as an int.

    Used on REG_YYMM values (e.g. ``201912`` -> ``12``, ``202003`` -> ``3``).
    """
    return int(str(data)[4:])

In [3]:
# Date handling: load the raw CSV, blank out missing values, and split
# REG_YYMM into separate integer `year` / `month` columns.
data = pd.read_csv('./dummies/201901-202003.csv').fillna('')
reg_yymm = data['REG_YYMM']
data['year'] = reg_yymm.apply(grap_year)
data['month'] = reg_yymm.apply(grap_month)
# REG_YYMM is fully represented by year + month from here on.
data = data.drop(columns=['REG_YYMM'])

In [4]:

# Data cleansing: drop the CARD_CCG_NM / HOM_CCG_NM columns, then sum every
# remaining (non-key) column per combination of the grouping keys below.
# `drop` returns a new frame, so `data` itself is left untouched.
df = data.drop(columns=['CARD_CCG_NM', 'HOM_CCG_NM'])

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
grouped = df.groupby(columns).sum()
# Move the grouping keys back from the index into ordinary columns.
df = grouped.reset_index()

In [5]:
# Encoding: fit one LabelEncoder per object-dtype column of `df` and keep the
# fitted encoders by column name so the codes can be decoded again later.
dtypes = df.dtypes
encoders = {
    name: LabelEncoder().fit(df[name])
    for name in df.columns
    if str(dtypes[name]) == 'object'
}

# Numeric copy of `df`: encoded columns are replaced by their integer codes,
# all other columns are carried over unchanged.
df_num = df.copy()
for column, encoder in encoders.items():
    df_num[column] = encoder.transform(df[column])


In [6]:
# 3. Exploratory Data Analysis
# (this section contains no code)


# 4. Feature Engineering & Initial Modeling

# Feature / target setup
# train_num = df_num.sample(frac=1, random_state=0)
non_feature_cols = ['CSTMR_CNT', 'AMT', 'CNT']
# Features: every column except the three aggregated measures.
x = df_num.drop(columns=non_feature_cols)
# Target: log(1 + AMT); predictions are mapped back with expm1 later on.
y = df_num['AMT'].pipe(np.log1p)

In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train / validation / test partition is the same on every
# Restart & Run All. Without it each run fits and evaluates on different rows
# and the reported test metric cannot be reproduced.
SPLIT_SEED = 0

# First hold out 20 % of the rows as the test set ...
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SPLIT_SEED
)
# ... then hold out 20 % of the remainder for validation
# (overall: 64 % train, 16 % validation, 20 % test).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=SPLIT_SEED
)

In [8]:
# 5. Model Tuning & Evaluation
# Training

# model_7
# without Dropout
# batch_size=128, epochs=10
# loss, metrics = 'mean_squared_logarithmic_error'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# One input per feature column (same value as x_train.iloc[0].shape).
n_features = x_train.shape[1]

# Fully connected stack: two ReLU layers at each width 16 -> 512, then a
# single linear output unit for the regression target.
model7 = tf.keras.Sequential()
model7.add(layers.Dense(16, activation='relu', input_shape=(n_features,)))
model7.add(layers.Dense(16, activation='relu'))
for units in (32, 64, 128, 256, 512):
    model7.add(layers.Dense(units, activation='relu'))
    model7.add(layers.Dense(units, activation='relu'))
model7.add(layers.Dense(1))

# `metrics` is passed as a list: newer Keras releases reject a bare string
# here, while a list is accepted by every version.
# NOTE(review): y is already np.log1p(AMT), so a mean-squared-*logarithmic*
# loss applies a second logarithm to the target — confirm this is intended.
model7.compile(optimizer='Adam',
              loss='mean_squared_logarithmic_error',
              metrics=['mean_squared_logarithmic_error'])

hist7 = model7.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val))

# model7.evaluate(x_test, y_test, batch_size=1)

# y_pred7 = model7.predict(x_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Score the held-out test split; the result is [loss, metric] as configured in compile().
test_scores = model7.evaluate(x=x_test, y=y_test, batch_size=128)
test_scores



[0.025682156905531883, 0.025682156905531883]

In [11]:
# 6. Conclusion & Discussion

# Build the prediction template: one row for every combination of the
# category values seen in `df`, for each target year / month.
import itertools

CARD_SIDO_NMs = df['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df['HOM_SIDO_NM'].unique()
AGEs          = df['AGE'].unique()
SEX_CTGO_CDs  = df['SEX_CTGO_CD'].unique()
FLCs          = df['FLC'].unique()
years         = [2020]
months        = [4, 7]

# itertools.product yields the rows in the same order as the equivalent nested
# for-loops (last factor varies fastest). Building the DataFrame straight from
# the tuples keeps each column's own dtype; going through np.array() first
# would coerce every value -- including year and month -- to a string.
template_rows = itertools.product(
    CARD_SIDO_NMs, STD_CLSS_NMs, HOM_SIDO_NMs, AGEs,
    SEX_CTGO_CDs, FLCs, years, months,
)
# Column order must match the feature frame the model was trained on.
temp = pd.DataFrame(list(template_rows), columns=x_train.columns)

In [12]:
# Encoding
# Reuse the LabelEncoders that were fitted on the training frame (`encoders`)
# instead of fitting fresh ones on the template. Refitting only reproduces the
# training codes when the template happens to contain exactly the same set of
# labels, and re-running a refitting cell would silently fit on values that
# are already integer codes, which breaks the later inverse_transform.
# With the training encoders, a label the model never saw should raise an
# error in transform() rather than being mapped to an arbitrary code.
temp = temp.copy()
for column, encoder in encoders.items():
    if column in temp.columns:
        temp[column] = encoder.transform(temp[column])

In [13]:
# Select the model inputs in training order and make sure they are integers.
feature_cols = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
temp = temp[feature_cols].astype('int')

# The model predicts log1p(AMT); expm1 maps predictions back to amounts.
pred = np.expm1(model7.predict(temp))
temp['AMT'] = np.round(pred, 0)

# Rebuild the YYYYMM key and total the predicted amount per
# (REG_YYMM, CARD_SIDO_NM, STD_CLSS_NM).
temp['REG_YYMM'] = temp['year']*100 + temp['month']
group_keys = ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']
temp = temp[group_keys + ['AMT']]
temp = temp.groupby(group_keys).sum().reset_index()

In [14]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [15]:
# Build the submission file: take the key rows from the provided template,
# replace its AMT column with the predicted totals, and write the result.
merge_keys = ['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']
submission = (
    pd.read_csv('./dummies/submission.csv', index_col=0)
    .drop(columns=['AMT'])
    .merge(temp, left_on=merge_keys, right_on=merge_keys, how='left')
)
submission.index.name = 'id'
submission.to_csv('./jeju_2_submission.csv', encoding='utf-8-sig')
submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,2164303000.0
1,202004,강원,골프장 운영업,2185802000.0
2,202004,강원,과실 및 채소 소매업,2207477000.0
3,202004,강원,관광 민예품 및 선물용품 소매업,2229158000.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,2251018000.0
