# 테마파크 방문객 예측 코드
- MLP(multi-layer perceptron)를 이용한 예측 코드

In [1]:
import os
import random
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm 
from tqdm import tqdm
import datetime
from datetime import datetime
from datetime import timedelta

import warnings
warnings.filterwarnings("ignore")

import matplotlib
import matplotlib.font_manager as fm


In [2]:
"""기존 데이터"""
# Working directory that contains the `input` folder (placeholder: set before running).
os.chdir(r"디렉토리로 지정할 경로")

# Build paths with os.path.join so the notebook also runs outside Windows;
# on Windows this resolves to the same `.\input\...` locations as before.
input_dir = os.path.join(".", "input")

# Existing (historical) macro table used as the training base.
macro_raw = pd.read_csv(os.path.join(input_dir, "macrodata.csv"), encoding='euc-kr')

# Newly added data for the month being appended.
weather = pd.read_excel(os.path.join(input_dir, "날씨.xlsx"))
ride = pd.read_excel(os.path.join(input_dir, "놀이기구운휴정보.xlsx"))
# header=5: the column header of the search-volume sheet is on the 6th row.
search = pd.read_excel(os.path.join(input_dir, "검색량.xlsx"), header=5)

### 전체 입장객 데이터 클렌징 및 병합 함수 

In [4]:
# Holiday-pattern codes that depend only on the day's own classification.
_FIXED_HOLIDAY_CODES = {
    '금요일': '평휴',
    '토요일': '휴휴',
    '일요일': '휴평',
    # NOTE(review): the pre-holiday weekday shares '휴평' with Sunday. Kept
    # unchanged because existing training data may use this encoding --
    # confirm it is intended rather than '평휴'.
    '공휴일전일': '휴평',
    '공휴일': '휴휴',
}
# Classifications that count as a day off when they appear on a neighbouring day.
_DAY_OFF_KINDS = ('토요일', '일요일', '공휴일')


def _classify_holiday(kind, prev_kind, next_kind):
    """Return the holiday-pattern code for one day given its neighbours' kinds."""
    fixed = _FIXED_HOLIDAY_CODES.get(kind)
    if fixed is not None:
        return fixed
    if prev_kind in _DAY_OFF_KINDS and next_kind in _DAY_OFF_KINDS:
        # A day sandwiched between two days off: plain weekdays get '휴평휴',
        # any other (unlisted) classification gets '휴휴휴'.
        return '휴평휴' if kind == '평일' else '휴휴휴'
    return '평평'


def macro_cleansing(admin=weather,
                    ride=ride,
                    search=search,
                    if_majeon=False,
                    dayahead='평일',
                    dayafter='공휴일'
                    ):
    """Clean one month of daily data and merge it into a single feature table.

    NOTE: the DataFrame defaults are bound to the module-level ``weather``,
    ``ride`` and ``search`` objects at definition time; pass the frames
    explicitly if they are reloaded afterwards.

    Parameters
    ----------
    admin : DataFrame
        Daily table with the columns '시간:일자', '일자:성/비성수기',
        '일자:휴일구분', '일자:날씨', '날씨:최저온도', '날씨:최고온도'. A row whose
        date cell equals '전체 결과' is discarded.
    ride : DataFrame
        Ride outage records with '운휴일자', 'RIDE명', '운휴구분'.
    search : DataFrame
        Search-volume table with '날짜', '테마파크', '테마파크맛집', '코로나'.
    if_majeon : DataFrame or False
        When a DataFrame, its '전체 결과' column (all rows but the last) is
        used as the visitor count; otherwise the count is set to -999.
    dayahead, dayafter : str
        Holiday classification of the day before the first row and of the day
        after the last row, used for the neighbour-based holiday pattern.

    Returns
    -------
    DataFrame
        One row per day with the cleaned features, '년', '월', 'covid' (always
        0) and '전체입장객'.
    """
    # rename() returns a new frame, so the caller's `admin` is left untouched.
    admin2 = admin.rename(columns={'시간:일자': '일시', '일자:성/비성수기': '비성수기',
                                   '일자:휴일구분': '휴일구분', '일자:날씨': '날씨',
                                   '날씨:최저온도': '최저온도', '날씨:최고온도': '최고온도'})
    # Drop the summary row and renumber, so later positional/index alignment
    # does not depend on where that row was located.
    admin2 = admin2[admin2['일시'] != '전체 결과'].reset_index(drop=True)
    admin2['일시'] = pd.to_datetime(admin2['일시'])

    # Day of week (Monday first, matching Series.dt.weekday).
    weekday_names = ['월요일', '화요일', '수요일', '목요일', '금요일', '토요일', '일요일']
    admin2['요일'] = admin2['일시'].dt.weekday.map(dict(enumerate(weekday_names)))

    # Holiday pattern: pad with the neighbouring months' boundary days and
    # walk (previous, current, next) triples by position.
    padded = [dayahead, *admin2['휴일구분'].tolist(), dayafter]
    admin2['휴일구분_전처리'] = [
        _classify_holiday(current, previous, following)
        for previous, current, following in zip(padded, padded[1:], padded[2:])
    ]

    # Weather: collapse to coarse categories. na=False keeps missing values
    # out of the boolean mask instead of breaking the .loc assignment.
    admin2.loc[admin2['날씨'].str.contains('눈|비', na=False), '날씨'] = '눈비'
    admin2.loc[admin2['날씨'].str.contains('구름|흐림', na=False), '날씨'] = '흐림'

    # Ride outages: number of distinct inspection-type outages per date and ride.
    outages = ride.loc[
        ride['운휴구분'].isin(['일점검', '정기점검', '정기운휴']),
        ['운휴일자', 'RIDE명', '운휴구분'],
    ].drop_duplicates()
    ride2 = (
        outages.groupby(['운휴일자', 'RIDE명']).size()
        .unstack('RIDE명')
        # reindex (not plain selection) so a ride without any outage in the
        # period yields a zero column instead of a KeyError.
        .reindex(columns=['놀이기구1', '놀이기구2'])
        .fillna(0)
        .rename(columns={'놀이기구1': '놀이기구1점검', '놀이기구2': '놀이기구2점검'})
        .rename_axis(index='일시', columns=None)
        .reset_index()
    )
    ride2['일시'] = pd.to_datetime(ride2['일시'])

    # Search volume (lagged): skip the first row, then stamp the remaining
    # rows with this month's dates by position so they merge one row later.
    search = (
        search.loc[1:, ['날짜', '테마파크', '테마파크맛집', '코로나']]
        .reset_index(drop=True)
        .rename(columns={'날짜': '일시'})
    )
    search['일시'] = admin2['일시']

    # Merge everything on the date; days without a match are filled with 0.
    admin2 = admin2.merge(ride2, how='left', on='일시')
    admin2 = admin2.merge(search, how='left', on='일시')
    admin2 = admin2.drop(columns=['휴일구분']).fillna(0)

    admin2['년'] = admin2['일시'].dt.year
    admin2['월'] = admin2['일시'].dt.month
    admin2['covid'] = 0

    if isinstance(if_majeon, pd.DataFrame):
        # Work on a copy so the caller's frame is not renamed in place.
        # iloc[:-1, [0, 5]] drops the trailing total row and keeps the first
        # and sixth columns only.
        majeon = (
            if_majeon.rename(columns={'유형별 입장객분류': '일시'})
            .iloc[:-1, [0, 5]]
            .copy()
        )
        majeon['일시'] = pd.to_datetime(majeon['일시'])
        # NOTE(review): aligned by row index, not by date -- assumes both
        # frames list the same days in the same order; confirm with the data.
        admin2['전체입장객'] = majeon['전체 결과']
    else:
        # Placeholder for months without actual visitor counts (prediction).
        admin2['전체입장객'] = -999
    return admin2


In [None]:
"""함수 실행 결과"""
# Preview the cleansed table; as the cell's trailing expression, the returned
# DataFrame is rendered by the notebook.
macro_cleansing(admin=weather, 
                    ride=ride, 
                    search=search, 
                    if_majeon=False, # training: pass the majeon DataFrame, prediction: False
                    dayahead='평일', # entered by hand: holiday class of the day before the first row
                    dayafter='공휴일' # entered by hand: holiday class of the day after the last row
                    )

### macro 데이터와 병합하여 훈련데이터 완성
- 위의 함수를 사용하여 만들어진 n월 데이터를 macro 데이터와 concat 하여 훈련 데이터 완성 

In [38]:
"""위의 통합데이터 macro와 병합하여 사용"""

"""#사용예시"""
# Usage example: cleanse the new month and append it to the macro table.
data_2212 = macro_cleansing(admin=weather,
                            ride=ride,
                            search=search,
                            if_majeon=False,
                            dayahead='평일',
                            dayafter='공휴일')
# ignore_index=True: both frames start their index at 0, so a plain concat
# would leave duplicate labels and label-based operations downstream
# (e.g. DataFrame.drop by index) would hit unrelated rows.
macro_raw = pd.concat([macro_raw, data_2212], ignore_index=True)


# 해당 월이 병합된 macro data 완성 

### 전체 데이터 예측 함수

In [6]:
def predict_total_visitor(macro_raw, year, month, validation=False, *,
                          seed=42, n_iterations=1000, batch_size=256,
                          learning_rate=0.001):
    """Train a small MLP on every other month and predict one month's visitors.

    Rows with ``covid == 1`` are excluded. The target is the visitor count
    (capped at 25000 on training rows), divided by 100 and log-transformed.
    The model is a single hidden layer of 8 tanh units with an ELU output,
    trained with Adam on the mean absolute error in log space.

    Parameters
    ----------
    macro_raw : DataFrame
        Macro table containing at least '일시', '년', '월', 'covid',
        '전체입장객', '테마파크', '테마파크맛집', '코로나', '최저온도', '최고온도',
        '비성수기', '요일', '휴일구분_전처리' and '날씨'. It is not modified.
    year, month : int
        The month held out as the test set.
    validation : bool
        True: return one row comparing the month's actual and predicted totals.
        False: return one predicted value per test row.
    seed, n_iterations, batch_size, learning_rate
        Training settings; the defaults reproduce the previous hard-coded values.

    Returns
    -------
    DataFrame
        Columns ['년','월','실제값','예측치','정확도'] when ``validation`` is true,
        otherwise ['년','월','일','예측치'].

    Raises
    ------
    ValueError
        If no training rows remain outside the requested month.
    """
    # Renumber first and filter with a boolean mask: frames assembled with
    # pd.concat can carry duplicate index labels, and a label-based drop
    # would then remove unrelated rows as well.
    macro = macro_raw.reset_index(drop=True)
    macro = macro[macro['covid'] != 1].reset_index(drop=True)

    # Train/test split as boolean masks (keeps rows in their original order).
    is_test = ((macro['년'] == year) & (macro['월'] == month)).to_numpy()
    is_train = ~is_test
    n_train = int(is_train.sum())
    if n_train == 0:
        raise ValueError('no training rows remain outside the requested year/month')

    # Target: strip thousands separators, cap outliers on training rows only,
    # rescale by 1/100 and take the log.
    visitors = (
        macro['전체입장객'].astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    visitors.loc[is_train] = visitors.loc[is_train].clip(upper=25000)
    macro['log_전체입장객'] = np.log(visitors / 100)
    macro = macro.drop(columns='전체입장객')

    # Cap extreme search volumes, then log-transform the two park-related ones.
    # NOTE(review): a search volume of 0 becomes -inf here and would poison the
    # standardisation below -- confirm the inputs are strictly positive.
    macro['코로나'] = macro['코로나'].clip(upper=3000000)
    macro['log_테마파크'] = np.log(macro['테마파크'].clip(upper=70000))
    macro = macro.drop(columns='테마파크')
    macro['log_테마파크맛집'] = np.log(macro['테마파크맛집'])
    macro = macro.drop(columns='테마파크맛집')

    # Reproducibility.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # One-hot encode the nominal variables. Cast to float so bool dummies
    # (the pandas >= 2.0 default) do not turn the matrix into an object array.
    features = pd.get_dummies(
        macro.drop(columns=['일시', 'log_전체입장객', '년']),
        columns=['월', '비성수기', '요일', '휴일구분_전처리', '날씨'],
    ).astype(float)

    x_train = features[is_train].copy()
    x_test = features[is_test].copy()

    # Standardise continuous variables with training statistics only.
    continuous = ['최저온도', '최고온도', '코로나', 'log_테마파크', 'log_테마파크맛집']
    mean = x_train[continuous].mean()
    # A constant column has std 0; divide by 1 instead so it maps to 0.
    std = x_train[continuous].std().replace(0, 1)
    x_train[continuous] = (x_train[continuous] - mean) / std
    x_test[continuous] = (x_test[continuous] - mean) / std

    np_x_train = x_train.to_numpy()
    np_x_test = x_test.to_numpy()
    log_target = macro['log_전체입장객'].to_numpy()
    np_y_train = log_target[is_train]
    np_y_test = log_target[is_test]

    # Model definition.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=8, activation='tanh',
                              input_shape=(np_x_train.shape[1],)),
        tf.keras.layers.Dense(units=1, activation='elu')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Sampling without replacement cannot draw more rows than exist.
    step_size = min(batch_size, n_train)
    for iteration in range(n_iterations):
        idx = np.random.choice(n_train, step_size, replace=False)
        x = tf.cast(np_x_train[idx], tf.float32)
        y = tf.cast(np_y_train[idx][:, tf.newaxis], tf.float32)

        with tf.GradientTape() as tape:
            predictions = model(x)
            # Mean absolute error in log space.
            loss = tf.reduce_mean(tf.abs(predictions - y))

        # MAPE on the original scale, for monitoring only.
        mape = tf.stop_gradient(
            tf.reduce_mean(tf.abs((tf.exp(predictions) - tf.math.exp(y)) / tf.math.exp(y))))
        if tf.math.is_inf(mape):
            # Predictions overflowed; stop before applying this update.
            break

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        if iteration % 10 == 0:
            print('Iteration: {:02d}, Loss: {:.4f}, MAPE: {:.4f}%'.format(iteration, loss, mape * 100))

    # Predict the held-out month (log scale).
    y = tf.cast(np_y_test, tf.float32)
    yhat = model(tf.cast(np_x_test, tf.float32))

    if validation:
        # Compare monthly totals on the original scale.
        real = np.array([np.sum((tf.exp(y) * 100).numpy())])
        predict = np.array([np.sum((tf.exp(yhat) * 100).numpy())])
        accuracy = np.array(1 - abs(real - predict) / real)
        years = np.array([year], dtype='float32')
        months = np.array([month], dtype='float32')
        tmp = np.concatenate([years, months, real, predict, accuracy], axis=0)

        return pd.DataFrame([tmp], columns=['년', '월', '실제값', '예측치', '정확도'])

    # Per-row predictions on the original scale.
    predict = (tf.exp(yhat) * 100).numpy()
    n_days = len(predict)
    years = np.full((n_days, 1), year)
    months = np.full((n_days, 1), month)
    # NOTE(review): '일' is the 1-based row position, which assumes the test
    # rows cover consecutive days starting on the 1st -- confirm with the data.
    days = np.arange(1, n_days + 1)[:, None]
    tmp = np.concatenate([years, months, days, predict], axis=1)

    return pd.DataFrame(tmp, columns=['년', '월', '일', '예측치'])

### 예측용 코드

In [8]:
# Forecast December 2022; validation=False returns one predicted value per test row.
prediction=predict_total_visitor(macro_raw=macro_raw,year=2022,month=12,validation=False)
# Bare trailing expression so the notebook renders the resulting DataFrame.
prediction

Iteration: 00, Loss: 4.0472, MAPE: 97.6774%
Iteration: 10, Loss: 3.9529, MAPE: 97.3991%
Iteration: 20, Loss: 3.7777, MAPE: 96.7241%
Iteration: 30, Loss: 3.6076, MAPE: 95.9698%
Iteration: 40, Loss: 3.4643, MAPE: 95.5522%
Iteration: 50, Loss: 3.2427, MAPE: 94.2405%
Iteration: 60, Loss: 3.0190, MAPE: 92.9043%
Iteration: 70, Loss: 2.7801, MAPE: 91.3426%
Iteration: 80, Loss: 2.5682, MAPE: 89.1404%
Iteration: 90, Loss: 2.2011, MAPE: 85.2087%
Iteration: 100, Loss: 2.1361, MAPE: 84.0312%
Iteration: 110, Loss: 1.7172, MAPE: 76.8936%
Iteration: 120, Loss: 1.5477, MAPE: 72.7968%
Iteration: 130, Loss: 1.3248, MAPE: 67.5761%
Iteration: 140, Loss: 1.2509, MAPE: 65.2734%
Iteration: 150, Loss: 0.9340, MAPE: 54.7971%
Iteration: 160, Loss: 0.8243, MAPE: 50.2565%
Iteration: 170, Loss: 0.6687, MAPE: 46.3882%
Iteration: 180, Loss: 0.5997, MAPE: 42.2914%
Iteration: 190, Loss: 0.4894, MAPE: 41.4336%
Iteration: 200, Loss: 0.4371, MAPE: 39.6958%
Iteration: 210, Loss: 0.4220, MAPE: 39.1742%
Iteration: 220, Loss

Unnamed: 0,년,월,일,예측치
0,2022.0,12.0,1.0,15066.099609
1,2022.0,12.0,2.0,17291.5
2,2022.0,12.0,3.0,18433.6875
3,2022.0,12.0,4.0,13500.967773
4,2022.0,12.0,5.0,8788.854492
5,2022.0,12.0,6.0,8846.322266
6,2022.0,12.0,7.0,8917.71582
7,2022.0,12.0,8.0,8747.450195
8,2022.0,12.0,9.0,11765.447266
9,2022.0,12.0,10.0,14426.413086
