In [23]:
import ast
import torch
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, MinMaxScaler, StandardScaler

# 여행 데이터 전처리

In [3]:
# Read the travel CSV into a DataFrame; the preprocessing cell below copies it before encoding.
travel_csv_path = '../data/VL_csv/tn_travel_여행_E_COST_cleaned.csv'
travel_df = pd.read_csv(travel_csv_path)

In [21]:
# Work on a copy so the frame loaded from CSV is left untouched.
travel_data = travel_df.copy()

# TRAVEL_PURPOSE holds string literals; parse each one into a Python object.
# NOTE(review): presumably each value is a list-like literal such as "[1, 2]" -- confirm against the CSV.
travel_data['TRAVEL_PURPOSE_LIST'] = travel_data['TRAVEL_PURPOSE'].map(ast.literal_eval)

# 1. Multi-hot: one indicator column per distinct travel purpose.
mlb = MultiLabelBinarizer()
purpose_encoded = mlb.fit_transform(travel_data['TRAVEL_PURPOSE_LIST'])
purpose_columns = [f'purpose_{label}' for label in mlb.classes_]

# 2. One-hot: MVMN_NM and mission.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros at transform time.
categorical_cols = ['MVMN_NM', 'mission']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_encoded = ohe.fit_transform(travel_data[categorical_cols])
onehot_columns = ohe.get_feature_names_out(categorical_cols)

# 3. Cost feature: standardize TOTAL_COST (zero mean, unit variance).
# The fitted scaler has to be kept around so later data is scaled with the same statistics.
scaler = StandardScaler()
cost_frame = travel_data[['TOTAL_COST']]
cost_scaled = scaler.fit_transform(cost_frame)
cost_column = ['total_cost_scaled']


# 4. Merge every travel feature block column-wise: purposes, one-hot categories, then cost.
travel_features_np = np.concatenate(
    (purpose_encoded, onehot_encoded, cost_scaled),
    axis=1,
)

In [22]:
# Convert the merged NumPy feature matrix into a float32 tensor.
travel_features = torch.tensor(travel_features_np, dtype=torch.float32)

# Summary values: the tensor's shape and its first three rows.
feature_shape = travel_features.size()
feature_preview = travel_features[0:3]

# Cell output: (source frame shape, feature tensor shape, preview rows).
(travel_df.shape, feature_shape, feature_preview)

((320, 17),
 torch.Size([320, 27]),
 tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  1.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000,  0.0000, -0.5763],
         [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000,  0.0000, -0.5652],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000,  0.0000,  0.1245]]))

In [24]:
# Persist every fitted travel preprocessor, not only the cost scaler.
# Rebuilding the same feature layout for new data needs all three fitted objects:
#   - mlb    : fixes the order of the purpose_* multi-hot columns
#   - ohe    : fixes the MVMN_NM_* / mission_* one-hot categories
#   - scaler : holds the mean / std used to standardize TOTAL_COST
# Re-fitting any of them on new data could change the column layout or the scaling.
joblib.dump(mlb, "mlb_for_travel_purpose.pkl")
joblib.dump(ohe, "ohe_for_travel_category.pkl")

# Kept as the last expression so the cell still displays the scaler's output path.
joblib.dump(scaler, "scaler_for_travel_cost.pkl")

['scaler_for_travel_cost.pkl']

| 내용                 | 컬럼                                        |
| ------------------ | ----------------------------------------- |
| 여행 목적 (multi-hot)  | `purpose_*` (예: `purpose_2`, `purpose_6` 등) |
| 이동수단, 미션 (one-hot) | `MVMN_NM_*`, `mission_*`                  |
| 총 예산 표준화 (`TOTAL_COST`, StandardScaler) | `total_cost_scaled`                       |
| **총 feature 차원 수** | 27개                                       |


# 여행객 데이터 전처리

In [13]:
# Load the traveller master CSV.
traveller_path = '../data/VL_csv/tn_traveller_master_여행객 Master_E.csv'
traveller_df = pd.read_csv(filepath_or_buffer=traveller_path)

# Print the column index, then display the first three records transposed
# so each column name becomes a row of the preview.
print(traveller_df.columns)
traveller_df.iloc[:3].transpose()

Index(['TRAVELER_ID', 'RESIDENCE_SGG_CD', 'GENDER', 'AGE_GRP', 'EDU_NM',
       'EDU_FNSH_SE', 'MARR_STTS', 'FAMILY_MEMB', 'JOB_NM', 'JOB_ETC',
       'INCOME', 'HOUSE_INCOME', 'TRAVEL_TERM', 'TRAVEL_NUM',
       'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SGG_1', 'TRAVEL_LIKE_SIDO_2',
       'TRAVEL_LIKE_SGG_2', 'TRAVEL_LIKE_SIDO_3', 'TRAVEL_LIKE_SGG_3',
       'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
       'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
       'TRAVEL_STATUS_RESIDENCE', 'TRAVEL_STATUS_DESTINATION',
       'TRAVEL_STATUS_ACCOMPANY', 'TRAVEL_STATUS_YMD', 'TRAVEL_MOTIVE_1',
       'TRAVEL_MOTIVE_2', 'TRAVEL_MOTIVE_3', 'TRAVEL_COMPANIONS_NUM'],
      dtype='object')


Unnamed: 0,0,1,2
TRAVELER_ID,e009688,e003836,e000653
RESIDENCE_SGG_CD,41,41,11
GENDER,남,여,남
AGE_GRP,20,30,30
EDU_NM,4,5,5
EDU_FNSH_SE,1,1,1
MARR_STTS,1,1,1
FAMILY_MEMB,5,3,1
JOB_NM,12,3,3
JOB_ETC,,,


## 전처리할 Feature

| 컬럼명                 | 설명                    | 활용 방식                   |
| ------------------- | --------------------- | ----------------------- |
| `TRAVELER_ID`       | 여행객 고유 ID             | 고유 식별자, 노드 키            |
| `AGE_GRP`           | 연령대 (10, 20, 30, ...) | One-hot                 |
| `GENDER`            | 성별 (남/여)              | One-hot                 |
| `RESIDENCE_SGG_CD`  | 거주지 시군구 코드            | 수치형 or 임베딩 (아래 전처리 셀에서는 아직 사용하지 않음) |
| `TRAVEL_STYL_1~8`   | 여행 스타일 선택값            | 그대로 사용 |
| `TRAVEL_MOTIVE_1~3` | 여행 동기 선택값             | 다중 선택형 → Multi-hot      |


In [27]:
# Column groups used by this cell, named once so selection and encoding stay in sync.
demographic_cols = ['AGE_GRP', 'GENDER']
style_cols = [f'TRAVEL_STYL_{i}' for i in range(1, 9)]
motive_cols = [f'TRAVEL_MOTIVE_{i}' for i in range(1, 4)]

# Keep only the ID plus the columns encoded below, taken from a copy of the raw frame.
user_data = traveller_df.copy()[['TRAVELER_ID', *demographic_cols, *style_cols, *motive_cols]]

# 1. AGE_GRP, GENDER -> one-hot.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros at transform time.
ohe_user = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_encoded = ohe_user.fit_transform(user_data[demographic_cols])
ohe_columns = ohe_user.get_feature_names_out(demographic_cols)

# 2. Travel style (TRAVEL_STYL_1~8): used as-is, with missing values replaced by 0.
style_features = user_data[style_cols].fillna(0).to_numpy()

# 3. Travel motive (TRAVEL_MOTIVE_1~3): multi-hot encoding,
#    so a traveller who picked several motives gets several active columns.
#    Missing picks are filled with 0 and then dropped from each row's set,
#    so 0 never becomes a motive class.
motive_data = user_data[motive_cols].fillna(0).astype(int).to_numpy()
motive_set = [set(row).difference({0}) for row in motive_data]
mlb_user = MultiLabelBinarizer()
motive_encoded = mlb_user.fit_transform(motive_set)

In [29]:
# 4. Join all user feature blocks column-wise: one-hot demographics, style scores, motive multi-hot.
user_features_np = np.concatenate(
    (ohe_encoded, style_features, motive_encoded),
    axis=1,
)

# Cell output: (raw traveller frame shape, feature matrix shape, first three feature rows).
(traveller_df.shape, user_features_np.shape, user_features_np[0:3])

((320, 36),
 (320, 25),
 array([[1., 0., 0., 0., 0., 1., 0., 6., 4., 6., 2., 3., 5., 3., 2., 1.,
         1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 1., 1., 2., 2., 1., 3., 1., 3., 4., 1.,
         1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 1., 0., 2., 1., 1., 2., 4., 6., 1., 7., 1.,
         1., 1., 0., 0., 0., 0., 0., 0., 0.]]))