# 데이터 로드

In [40]:
import pandas as pd

In [41]:
# Relative paths of the cleaned source CSV files.
move_path = "../data/VL_csv/tn_move_his_이동내역_Cleaned_E.csv"
user_path = "../data/VL_csv/tn_traveller_master_여행객 Master_E_preprocessed.csv"
travel_path = "../data/VL_csv/tn_travel_여행_E_COST_cleaned_gnn.csv"
visit_area_path = "../data/VL_csv/tn_visit_area_info_방문지정보_Cleaned_E.csv"

# Read all four files with pandas' default parser settings, in the order
# movement history, traveller master, travel, visit-area info.
move_df, user_df, travel_df, visit_area_df = map(
    pd.read_csv,
    (move_path, user_path, travel_path, visit_area_path),
)

In [42]:
# Preview: first two rows of every loaded table, keyed by its display label.
{
    label: frame.head(2)
    for label, frame in (
        ("이동 내역", move_df),
        ("유저 정보", user_df),
        ("여행 정보", travel_df),
        ("방문지 정보", visit_area_df),
    )
}

{'이동 내역':    TRAVEL_ID     TRIP_ID  START_VISIT_AREA_ID  END_VISIT_AREA_ID  \
 0  e_e000004  2304300001         2.304300e+09                NaN   
 1  e_e000004  2304300002                  NaN       2.304300e+09   
 
        START_DT_MIN        END_DT_MIN  MVMN_CD_1  MVMN_CD_2  
 0  2023-04-30 13:30               NaN        NaN        NaN  
 1               NaN  2023-04-30 14:00        1.0        NaN  ,
 '유저 정보':    GENDER  EDU_NM  EDU_FNSH_SE  MARR_STTS  JOB_NM  HOUSE_INCOME  TRAVEL_TERM  \
 0       2       4          1.0          3      11           9.0            2   
 1       2       7          1.0          2       2          12.0            2   
 
    TRAVEL_LIKE_SIDO_1  TRAVEL_LIKE_SIDO_2  TRAVEL_LIKE_SIDO_3  ...  \
 0                  11                  47                  41  ...   
 1                  42                  48                  41  ...   
 
    TRAVEL_STYL_3  TRAVEL_STYL_4  TRAVEL_STYL_5  TRAVEL_STYL_6  TRAVEL_STYL_7  \
 0              4              4          

# Edge 전처리

## 장소 - 장소 엣지

In [43]:
import torch
import numpy as np

In [44]:
# Keep only the movement rows that record an arrival place
# (END_VISIT_AREA_ID is present); copy so later column writes do not touch move_df.
move_valid = move_df.dropna(subset=["END_VISIT_AREA_ID"]).copy()

# The column is read as float because of the NaNs; cast it back to integers.
# The width is spelled out as int64 because the IDs (e.g. 2304300002) exceed
# the signed 32-bit range, and a bare `int` can resolve to a 32-bit dtype on
# some platforms (e.g. Windows with NumPy < 2).
move_valid["END_VISIT_AREA_ID"] = move_valid["END_VISIT_AREA_ID"].astype("int64")

# Order rows by trip within each travel so that consecutive rows are
# consecutive movements of the same TRAVEL_ID.
move_valid.sort_values(by=["TRAVEL_ID", "TRIP_ID"], inplace=True)

In [45]:
# 다음 방문지 ID 추출 (TRAVEL_ID별로 shift)
move_valid["NEXT_VISIT_AREA_ID"] = (
    move_valid.groupby("TRAVEL_ID")["END_VISIT_AREA_ID"].shift(-1)
)

# 유효한 쌍만 남김 (다음 장소가 있는 경우)
move_edges = move_valid.dropna(subset=["NEXT_VISIT_AREA_ID"]).copy()
move_edges["NEXT_VISIT_AREA_ID"] = move_edges["NEXT_VISIT_AREA_ID"].astype(int)

In [46]:
# 이동 쌍 빈도 계산
move_freq = (
    move_edges.groupby(["END_VISIT_AREA_ID", "NEXT_VISIT_AREA_ID"])
    .size()
    .reset_index(name="weight")
)
move_freq

Unnamed: 0,END_VISIT_AREA_ID,NEXT_VISIT_AREA_ID,weight
0,2304280002,2304280003,7
1,2304280002,2304290001,1
2,2304280003,2304280004,5
3,2304280003,2304280009,1
4,2304280003,2304290001,1
...,...,...,...
2034,2309170009,2309170010,1
2035,2309180001,2309180002,1
2036,2309180002,2309180003,1
2037,2309180003,2309180004,1


## 유저 - 여행 엣지

In [47]:
# User -> travel edges: take the TRAVELER_ID / TRAVEL_ID pairs from travel_df.
# dropna() with its defaults removes every row in which either ID is missing,
# so no additional notnull() filter is required afterwards.
user_travel_edges = travel_df[["TRAVELER_ID", "TRAVEL_ID"]].dropna().copy()

In [48]:
# edge로 표현
user_travel_edges = user_travel_edges.rename(columns={
    "TRAVELER_ID": "user_id",
    "TRAVEL_ID": "travel_id"
})

user_travel_edges

Unnamed: 0,user_id,travel_id
0,e000004,e_e000004
1,e000006,e_e000006
2,e000009,e_e000009
3,e000010,e_e000010
4,e000011,e_e000011
...,...,...
2555,g003294,g_g003294
2556,g005600,g_g005600
2557,g007343,g_g007343
2558,g010556,g_g010556


## 여행 - 장소 엣지

In [49]:
# Travel -> place edges: every (TRAVEL_ID, VISIT_AREA_ID) row of the
# visit-area table, without rows where either value is missing.
travel_visit_edges = visit_area_df[["TRAVEL_ID", "VISIT_AREA_ID"]].dropna().copy()

# Store place IDs as integers. int64 is explicit because the IDs
# (e.g. 2304300002) exceed the signed 32-bit range and a bare `int` is
# platform-dependent.
travel_visit_edges["VISIT_AREA_ID"] = travel_visit_edges["VISIT_AREA_ID"].astype("int64")

In [50]:
# rename for clarity
travel_visit_edges = travel_visit_edges.rename(columns={
    "TRAVEL_ID": "travel_id",
    "VISIT_AREA_ID": "visit_area_id"
})

travel_visit_edges

Unnamed: 0,travel_id,visit_area_id
0,e_e000004,2304300002
1,e_e000004,2304300003
2,e_e000004,2304300004
3,e_e000006,2304300002
4,e_e000006,2304300003
...,...,...
15936,g_g010556,2309020007
15937,g_g010556,2309020008
15938,h_h003275,2308270002
15939,h_h003275,2308270003


## 유저 - 장소 엣지

In [51]:
# 교집합이 존재하는 TRAVEL_ID 확인
travel_ids_in_user = set(travel_df["TRAVEL_ID"].unique())
travel_ids_in_visit = set(visit_area_df["TRAVEL_ID"].unique())

# 교집합 추출
common_travel_ids = travel_ids_in_user.intersection(travel_ids_in_visit)

In [52]:
# 공통 TRAVEL_ID 기반으로 필터링
travel_sub = travel_df[travel_df["TRAVEL_ID"].isin(common_travel_ids)]
visit_area_sub = visit_area_df[visit_area_df["TRAVEL_ID"].isin(common_travel_ids)]

# 조인하여 user - visit_area 생성
user_visit_edges = pd.merge(
    travel_sub[["TRAVEL_ID", "TRAVELER_ID"]],
    visit_area_sub[["TRAVEL_ID", "VISIT_AREA_ID"]],
    on="TRAVEL_ID",
    how="inner"
)

In [53]:
# Lowercase endpoint column names; TRAVEL_ID is kept as-is.
user_visit_edges = user_visit_edges.rename(columns={
    "TRAVELER_ID": "user_id",
    "VISIT_AREA_ID": "visit_area_id"
})

# A missing place ID cannot be cast to an integer, so drop such rows first.
# This matches how the travel -> place edges are built (dropna before the cast).
user_visit_edges = user_visit_edges.dropna(subset=["visit_area_id"]).copy()

# int64 is explicit because the IDs (e.g. 2304300002) exceed the signed
# 32-bit range and a bare `int` is platform-dependent.
user_visit_edges["visit_area_id"] = user_visit_edges["visit_area_id"].astype("int64")

user_visit_edges

Unnamed: 0,TRAVEL_ID,user_id,visit_area_id
0,e_e000004,e000004,2304300002
1,e_e000004,e000004,2304300003
2,e_e000004,e000004,2304300004
3,e_e000006,e000006,2304300002
4,e_e000006,e000006,2304300003
...,...,...,...
15936,g_g010556,g010556,2309020007
15937,g_g010556,g010556,2309020008
15938,h_h003275,h003275,2308270002
15939,h_h003275,h003275,2308270003


# Node 전처리

## 유저 노드


- 'USER_ID', 'PASSWORD', 'NAME', 'GENDER', 'BIRTHDATE',
- 'TRAVEL_TERM', 'TRAVEL_NUM',
- 'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SIDO_2', 'TRAVEL_LIKE_SIDO_3',
- 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
- 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
- 'TRAVEL_MOTIVE_1', 'TRAVEL_MOTIVE_2'

In [54]:
# 사용할 feature 컬럼 정의 (TRAVELER_ID 제외)
user_feature_cols = [
    'GENDER', 'TRAVEL_TERM', 'TRAVEL_NUM',
    'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SIDO_2', 'TRAVEL_LIKE_SIDO_3',
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
    'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1', 'TRAVEL_MOTIVE_2',
    'AGE_GRP'
]

# 결측값 0으로 채우고 float32로 변환
user_feature_tensor = user_df[user_feature_cols].fillna(0).astype(np.float32).to_numpy()

In [55]:
# ID 매핑: TRAVELER_ID → index
user_id_list = user_df["TRAVELER_ID"].tolist()
user_id_to_index = {uid: idx for idx, uid in enumerate(user_id_list)}

# 결과 요약
{
    "user_tensor_shape": user_feature_tensor.shape,
    "user_id_sample": user_id_list[:5],
    "user_index_sample": list(user_id_to_index.items())[:5]
}

{'user_tensor_shape': (1919, 17),
 'user_id_sample': ['e004720', 'e003564', 'e000396', 'e001890', 'e007797'],
 'user_index_sample': [('e004720', 0),
  ('e003564', 1),
  ('e000396', 2),
  ('e001890', 3),
  ('e007797', 4)]}

## 여행 노드
- 비용 범주화 (14만원 -> 10만원, 반올림)