In [1]:
import ast
import torch
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, MinMaxScaler, StandardScaler

# 여행 데이터 전처리

In [2]:
# Load the cleaned travel table; the path is bound to a name first, the same
# way the traveller and visit-area tables are loaded further down.
travel_path = '../data/VL_csv/tn_travel_여행_E_COST_cleaned.csv'
travel_df = pd.read_csv(travel_path)

In [3]:
# Work on a copy so the frame loaded from CSV is left untouched.
travel_data = travel_df.copy()

# Each TRAVEL_PURPOSE cell is parsed with ast.literal_eval and the parsed
# object is stored in a new TRAVEL_PURPOSE_LIST column.
travel_data['TRAVEL_PURPOSE_LIST'] = travel_data['TRAVEL_PURPOSE'].map(ast.literal_eval)

# 1. Multi-hot: one indicator column per class found by the binarizer.
mlb = MultiLabelBinarizer()
purpose_encoded = mlb.fit_transform(travel_data['TRAVEL_PURPOSE_LIST'])
purpose_columns = [f'purpose_{label}' for label in mlb.classes_]

# 2. One-hot: MVMN_NM and mission. Categories unseen at fit time are ignored
#    at transform time (handle_unknown='ignore').
onehot_source_cols = ['MVMN_NM', 'mission']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_encoded = ohe.fit_transform(travel_data[onehot_source_cols])
onehot_columns = ohe.get_feature_names_out(onehot_source_cols)

# 3. Cost feature: TOTAL_COST passed through StandardScaler.
#    The fitted scaler must be reused on any later data; a later cell
#    persists it with joblib for that purpose.
scaler = StandardScaler()
cost_scaled = scaler.fit_transform(travel_data[['TOTAL_COST']])
cost_column = ['total_cost_scaled']


# 4. Join the three blocks column-wise: purpose | one-hot | scaled cost.
travel_features_np = np.concatenate(
    [purpose_encoded, onehot_encoded, cost_scaled],
    axis=1,
)

In [4]:
# Build a float32 tensor from the stacked travel feature matrix.
travel_features = torch.tensor(travel_features_np, dtype=torch.float32)

# Summary pieces: the tensor's shape and its first three rows.
feature_shape = travel_features.size()
feature_preview = travel_features[0:3]

# Displayed as (raw table shape, feature tensor shape, preview rows).
(travel_df.shape, feature_shape, feature_preview)

((2560, 17),
 torch.Size([2560, 27]),
 tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000,  0.0000, -0.4149],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
           0.0000,  1.0000, -0.3105],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
           0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
           1.0000,  0.0000,  0.3791]]))

In [5]:
# Persist every fitted travel-side transformer for later preprocessing.
# Saving only the cost scaler is not enough: new travel rows can only be
# encoded into the same column layout if the fitted MultiLabelBinarizer and
# OneHotEncoder are reloaded as well.
#
# Guard against out-of-order execution: the name `ohe` is rebound further down
# to an encoder fitted on VISIT_AREA_TYPE_CD, so check that it still refers to
# the travel encoder before writing it under a travel-specific filename.
assert list(ohe.feature_names_in_) == ['MVMN_NM', 'mission'], (
    "`ohe` is not the travel encoder; re-run the travel preprocessing cell first"
)
joblib.dump(mlb, "mlb_for_travel_purpose.pkl")
joblib.dump(ohe, "ohe_for_travel_mvmn_mission.pkl")

# Kept as the last expression so the cell's displayed output is unchanged.
joblib.dump(scaler, "scaler_for_travel_cost.pkl")

['scaler_for_travel_cost.pkl']

| 내용                 | 컬럼 / 개수                                   |
| ------------------ | ----------------------------------------- |
| 여행 목적 (multi-hot)  | `purpose_*` (예: purpose\_2, purpose\_6 등) |
| 이동수단, 미션 (one-hot) | `MVMN_NM_*`, `mission_*`                  |
| 총 비용 표준화 (StandardScaler) | `total_cost_scaled`                       |
| **총 feature 차원 수** | 27개                          |


# 여행객 데이터 전처리

In [6]:
# Read the traveller master table from CSV.
traveller_path = '../data/VL_csv/tn_traveller_master_여행객 Master_E.csv'
traveller_df = pd.read_csv(traveller_path)

# Print the column index, then display the first three rows transposed
# (one output column per traveller).
print(traveller_df.columns)
traveller_df.iloc[:3].transpose()

Index(['TRAVELER_ID', 'RESIDENCE_SGG_CD', 'GENDER', 'AGE_GRP', 'EDU_NM',
       'EDU_FNSH_SE', 'MARR_STTS', 'FAMILY_MEMB', 'JOB_NM', 'JOB_ETC',
       'INCOME', 'HOUSE_INCOME', 'TRAVEL_TERM', 'TRAVEL_NUM',
       'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SGG_1', 'TRAVEL_LIKE_SIDO_2',
       'TRAVEL_LIKE_SGG_2', 'TRAVEL_LIKE_SIDO_3', 'TRAVEL_LIKE_SGG_3',
       'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
       'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
       'TRAVEL_STATUS_RESIDENCE', 'TRAVEL_STATUS_DESTINATION',
       'TRAVEL_STATUS_ACCOMPANY', 'TRAVEL_STATUS_YMD', 'TRAVEL_MOTIVE_1',
       'TRAVEL_MOTIVE_2', 'TRAVEL_MOTIVE_3', 'TRAVEL_COMPANIONS_NUM'],
      dtype='object')


Unnamed: 0,0,1,2
TRAVELER_ID,e004720,e000914,e003564
RESIDENCE_SGG_CD,41,30,41
GENDER,여,여,여
AGE_GRP,60,20,30
EDU_NM,4,6,7
EDU_FNSH_SE,1.0,1.0,1.0
MARR_STTS,3,1,2
FAMILY_MEMB,3,1,4
JOB_NM,11,3,2
JOB_ETC,,,


## 전처리할 Feature

| 컬럼명                 | 설명                    | 활용 방식                   |
| ------------------- | --------------------- | ----------------------- |
| `TRAVELER_ID`       | 여행객 고유 ID             | 고유 식별자, 노드 키            |
| `AGE_GRP`           | 연령대 (10, 20, 30, ...) | One-hot                 |
| `GENDER`            | 성별 (남/여)              | One-hot                 |
| `RESIDENCE_SGG_CD`  | 거주지 시군구 코드            | 수치형 or 임베딩              |
| `TRAVEL_STYL_1~8`   | 여행 스타일 선택값            | 그대로 사용 |
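| `TRAVEL_MOTIVE_1~3` | 여행 동기 선택값             | 다중 선택형 → Multi-hot      |

> **참고:** 위 표는 계획된 활용 방식이며, 바로 아래 셀의 현재 구현과는 차이가 있습니다. 현재 코드는 `AGE_GRP`와 `TRAVEL_MOTIVE_1~3`을 One-hot/Multi-hot으로 변환하지 않고 정수 코드 그대로 사용하고, `GENDER`는 0/1 라벨 인코딩(남=0, 여=1)을 적용하며, `RESIDENCE_SGG_CD`는 feature 벡터에 포함하지 않습니다.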

In [7]:
# 1. Mean INCOME per JOB_NM, sorted ascending; a job's rank is its position
#    in that ordering (0 = lowest mean income).
job_income_mean = traveller_df.groupby('JOB_NM')['INCOME'].mean().sort_values()
job_income_rank = dict(zip(job_income_mean.index, range(len(job_income_mean))))

# 2. Encoded columns are added to a fresh copy of the traveller table:
#    - JOB_NM_ENC:    JOB_NM replaced by its income rank
#    - GENDER_ENC:    '남' -> 0, '여' -> 1
#    - MARR_STTS_ENC: MARR_STTS carried over unchanged
gender_map = {'남': 0, '여': 1}
user_data = traveller_df.assign(
    JOB_NM_ENC=traveller_df['JOB_NM'].map(job_income_rank),
    GENDER_ENC=traveller_df['GENDER'].map(gender_map),
    MARR_STTS_ENC=traveller_df['MARR_STTS'],
)

# 3. Feature columns, in the order they appear in the output matrix.
base_columns = [
    'AGE_GRP', 'GENDER_ENC', 'MARR_STTS_ENC', 'EDU_NM', 'EDU_FNSH_SE', 'INCOME',
    'HOUSE_INCOME', 'TRAVEL_TERM', 'TRAVEL_NUM', 'FAMILY_MEMB', 'TRAVEL_COMPANIONS_NUM',
    'JOB_NM_ENC', 'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SIDO_2', 'TRAVEL_LIKE_SIDO_3',
]
style_columns = [f'TRAVEL_STYL_{i}' for i in range(1, 9)]
motive_columns = [f'TRAVEL_MOTIVE_{i}' for i in range(1, 4)]
columns_to_use = base_columns + style_columns + motive_columns

# 4. Final integer feature matrix.
#    NOTE: missing values become 0 before the integer cast, so an unmapped
#    GENDER or JOB_NM ends up with the same code as '남' / income rank 0.
user_features_np = user_data[columns_to_use].fillna(0).astype(int).to_numpy()

# Show the matrix shape and its first three rows.
(user_features_np.shape, user_features_np[:3])


((2560, 26),
 array([[60,  1,  3,  4,  1,  4,  9,  2,  2,  3,  1,  0, 11, 47, 41,  2,
          4,  4,  4,  4,  4,  5,  5,  2,  6,  0],
        [20,  1,  1,  6,  1,  4,  0,  3,  2,  1,  0,  8, 45, 26, 11,  3,
          6,  2,  3,  2,  2,  4,  1,  1,  7, 10],
        [30,  1,  2,  7,  1,  7, 12,  2,  2,  4,  3,  9, 42, 48, 41,  1,
          1,  1,  3,  2,  1,  1,  7,  8,  3,  7]]))

In [8]:
# Persist the state the traveller preprocessing actually fitted: the
# income-rank mapping for JOB_NM, the gender mapping, and the feature column
# order. Without these, later traveller rows cannot be encoded consistently.
joblib.dump(
    {
        "job_income_rank": job_income_rank,
        "gender_map": gender_map,
        "columns_to_use": columns_to_use,
    },
    "encoders_for_traveler.pkl",
)

# NOTE(review): `scaler` is the StandardScaler fitted on the travel table's
# TOTAL_COST; the traveller preprocessing fits no scaler of its own, so this
# file is a second copy of the travel-cost scaler under a traveller name.
# It is still written (and kept as the last expression, so the displayed
# output is unchanged) in case other code loads this filename. Confirm whether
# it is needed before removing it.
joblib.dump(scaler, "scaler_for_traveler_cost.pkl")

['scaler_for_traveler_cost.pkl']

# 방문지 데이터 전처리

In [9]:
# Read the visit-area table from CSV.
visit_path = '../data/VL_csv/tn_visit_area_info_방문지정보_Cleaned_E.csv'
visit_df = pd.read_csv(visit_path)

# Print the column index, then display the first three rows transposed
# (one output column per visit record).
print(visit_df.columns)
visit_df.iloc[:3].transpose()

Index(['VISIT_AREA_ID', 'TRAVEL_ID', 'VISIT_ORDER', 'VISIT_AREA_NM',
       'VISIT_START_YMD', 'VISIT_END_YMD', 'ROAD_NM_ADDR', 'LOTNO_ADDR',
       'X_COORD', 'Y_COORD', 'ROAD_NM_CD', 'LOTNO_CD', 'POI_ID', 'POI_NM',
       'RESIDENCE_TIME_MIN', 'VISIT_AREA_TYPE_CD', 'REVISIT_YN',
       'VISIT_CHC_REASON_CD', 'LODGING_TYPE_CD', 'DGSTFN', 'REVISIT_INTENTION',
       'RCMDTN_INTENTION', 'SGG_CD'],
      dtype='object')


Unnamed: 0,0,1,2
VISIT_AREA_ID,2304300002,2304300003,2304300004
TRAVEL_ID,e_e000004,e_e000004,e_e000004
VISIT_ORDER,2,3,4
VISIT_AREA_NM,화성 관광열차 안내소 연무대 매표소,창룡문,수원 화성 화홍문
VISIT_START_YMD,2023-04-30,2023-04-30,2023-04-30
VISIT_END_YMD,2023-04-30,2023-04-30,2023-04-30
ROAD_NM_ADDR,경기 수원시 팔달구 창룡대로103번길 20,,
LOTNO_ADDR,경기 수원시 팔달구 매향동 3-32,경기 수원시 팔달구 남수동,경기 수원시 팔달구 북수동 9000-1
X_COORD,127.023339,127.025143,127.017626
Y_COORD,37.287878,37.287791,37.287546


In [10]:
# Keep only the id and the columns that feed the feature matrix.
visit_columns = [
    'VISIT_AREA_ID', 'VISIT_AREA_TYPE_CD', 'REVISIT_YN',
    'RESIDENCE_TIME_MIN', 'DGSTFN', 'REVISIT_INTENTION', 'RCMDTN_INTENTION',
]
visit_data = visit_df[visit_columns].copy()

# 1. REVISIT_YN as an integer flag: 'Y' -> 1, 'N' -> 0; any other value
#    (including NaN) is unmapped by the dict and then filled with 0.
revisit_flag = visit_data['REVISIT_YN'].map({'Y': 1, 'N': 0})
visit_data['REVISIT_YN'] = revisit_flag.fillna(0).astype(int)

# 2. One-hot encode VISIT_AREA_TYPE_CD.
#    NOTE: this rebinds the name `ohe`, which the travel preprocessing also uses.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
type_encoded = ohe.fit_transform(visit_data[['VISIT_AREA_TYPE_CD']])

# 3. Missing values in the numeric columns are replaced with 0.
numeric_cols = ['RESIDENCE_TIME_MIN', 'DGSTFN', 'REVISIT_INTENTION', 'RCMDTN_INTENTION']
visit_data[numeric_cols] = visit_data[numeric_cols].fillna(0)

# 4. Join column-wise: one-hot type | revisit flag | numeric columns.
visit_features_np = np.concatenate(
    [
        type_encoded,
        visit_data[['REVISIT_YN']].to_numpy(),
        visit_data[numeric_cols].to_numpy(),
    ],
    axis=1,
)

# Show the matrix shape and its first three rows.
(visit_features_np.shape, visit_features_np[:3])

((15941, 22),
 array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0., 60.,  4.,  3.,  4.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0., 30.,  4.,  4.,  4.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0., 60.,  4.,  3.,  3.]]))

In [11]:
# Persist the one-hot encoder fitted on VISIT_AREA_TYPE_CD. `ohe` must still be
# the encoder from the visit-area cell when this runs, because the same name is
# also used for the travel encoder earlier in the notebook.
visit_type_encoder_path = "ohe_for_visitarea_typecd.pkl"
joblib.dump(ohe, visit_type_encoder_path)

['ohe_for_visitarea_typecd.pkl']

# 노드 데이터 저장 (npy)

In [12]:
# Write each node-type feature matrix to its own .npy file.
node_feature_arrays = {
    "../data/travel_features.npy": travel_features_np,
    "../data/user_features.npy": user_features_np,
    "../data/visit_features.npy": visit_features_np,
}
for npy_path, feature_matrix in node_feature_arrays.items():
    np.save(npy_path, feature_matrix)