In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict

## user → travel 엣지

* `edge_index_user_to_travel[0]`: 출발 노드 (user)
* `edge_index_user_to_travel[1]`: 도착 노드 (travel)

---
### 사용 방식


```python
data[('user', 'traveled', 'travel')].edge_index = torch.tensor(edge_index_user_to_travel, dtype=torch.long)
```

In [None]:
# travel 데이터 로드
travel_df = pd.read_csv("../data/VL_csv/tn_travel_여행_E_COST_cleaned.csv")

# user → travel 관계 추출
user_travel_edges = travel_df[['TRAVELER_ID', 'TRAVEL_ID']].copy()

# 고유 ID 추출
unique_user_ids = user_travel_edges['TRAVELER_ID'].unique()
unique_travel_ids = user_travel_edges['TRAVEL_ID'].unique()

# ID to index 매핑
user_id_map = {id_: i for i, id_ in enumerate(unique_user_ids)}
travel_id_map = {id_: i for i, id_ in enumerate(unique_travel_ids)}

# 인덱스로 매핑
user_indices = user_travel_edges['TRAVELER_ID'].map(user_id_map).values
travel_indices = user_travel_edges['TRAVEL_ID'].map(travel_id_map).values

# PyG용 edge_index (2, N) 형태
edge_index_user_to_travel = np.vstack([user_indices, travel_indices])

print("Edge shape:", edge_index_user_to_travel.shape)
print("Edge index 예시:\n", edge_index_user_to_travel[:, :5])


Edge shape: (2, 320)
Edge index 예시:
 [[0 1 2 3 4]
 [0 1 2 3 4]]


## 이동수단 엣지

| 이동수단 코드     | 엣지 수   |
| ----------- | ------ |
| 1 (자차 등)    | 787    |
| 15 (시내버스 등) | 803    |
| 5, 12, 13 등 | 다양     |
| 총 코드 수      | 13개 종류 |

### 사용방식

```python
from torch_geometric.data import HeteroData
import torch

data = HeteroData()

for move_type, edge_index_np in edge_index_move_by_type.items():
    edge_type = ('visit_area', f'move_{move_type}', 'visit_area')
    data[edge_type].edge_index = torch.tensor(edge_index_np, dtype=torch.long)
```

---

In [7]:
move_df = pd.read_csv("../data/VL_csv/tn_move_his_이동내역_E.csv")

In [8]:
move_df.head(3).T

Unnamed: 0,0,1,2
TRAVEL_ID,e_e000008,e_e000008,e_e000008
TRIP_ID,2304290001,2304290002,2304290003
START_VISIT_AREA_ID,2304290001.0,,
END_VISIT_AREA_ID,,2304290002.0,2304290003.0
START_DT_MIN,2023-04-29 15:00,,
END_DT_MIN,,2023-04-29 20:00,2023-04-29 15:30
MVMN_CD_1,,1.0,1.0
MVMN_CD_2,,1.0,1.0


In [None]:
# visit_area ID 목록 확보 및 매핑 재구성
visit_df = pd.read_csv("../data/VL_csv/tn_visit_area_info_방문지정보_E.csv")
unique_visit_ids = visit_df['VISIT_AREA_ID'].dropna().unique()
visit_id_map = {id_: i for i, id_ in enumerate(unique_visit_ids)}

# 이동내역 정제
move_df = move_df[['TRAVEL_ID', 'START_VISIT_AREA_ID', 'END_VISIT_AREA_ID', 'MVMN_CD_1']].copy()
move_df = move_df.dropna(subset=['TRAVEL_ID'])

# 엣지 저장 딕셔너리 초기화
move_edge_dict = defaultdict(list)

# travel_id 단위로 그룹핑
for travel_id, group in move_df.groupby('TRAVEL_ID'):
    group = group.sort_values(by='END_VISIT_AREA_ID')

    path = []

    # 첫 시작점
    start = group.iloc[0]['START_VISIT_AREA_ID'] if pd.notna(group.iloc[0]['START_VISIT_AREA_ID']) else None
    if start is not None:
        path.append((start, group.iloc[0]['MVMN_CD_1']))

    for _, row in group.iterrows():
        if pd.notna(row['END_VISIT_AREA_ID']):
            path.append((row['END_VISIT_AREA_ID'], row['MVMN_CD_1']))

    # 연속 방문지 간 엣지 생성
    for (src, src_mv), (dst, _) in zip(path[:-1], path[1:]):
        if src in visit_id_map and dst in visit_id_map and pd.notna(src_mv):
            s = visit_id_map[src]
            d = visit_id_map[dst]
            t = int(src_mv)
            move_edge_dict[t].append((s, d))

# 이동수단별 edge_index 구성
edge_index_move_by_type = {
    move_type: np.array(edge_list).T
    for move_type, edge_list in move_edge_dict.items() if len(edge_list) > 0
}

# 결과 확인
[(k, v.shape) for k, v in edge_index_move_by_type.items()]


[(1, (2, 722)),
 (6, (2, 3)),
 (4, (2, 37)),
 (15, (2, 816)),
 (13, (2, 129)),
 (5, (2, 230)),
 (12, (2, 48)),
 (2, (2, 28)),
 (14, (2, 6)),
 (8, (2, 15)),
 (7, (2, 27)),
 (10, (2, 4)),
 (9, (2, 2)),
 (16, (2, 11))]

## travel → visit_area 엣지 생성

In [19]:
# 고유 TRAVEL_ID 및 VISIT_AREA_ID 매핑
unique_travel_ids = travel_df['TRAVEL_ID'].dropna().unique()
unique_visit_ids = visit_df['VISIT_AREA_ID'].dropna().unique()

travel_id_map = {id_: i for i, id_ in enumerate(unique_travel_ids)}
visit_id_map = {id_: i for i, id_ in enumerate(unique_visit_ids)}

# travel → visit_area 엣지 구성
edges = visit_df[['TRAVEL_ID', 'VISIT_AREA_ID']].dropna()
travel_indices = edges['TRAVEL_ID'].map(travel_id_map)
visit_indices = edges['VISIT_AREA_ID'].map(visit_id_map)

# 유효한 인덱스만 필터링
mask = travel_indices.notna() & visit_indices.notna()
edge_index_travel_to_visit = np.vstack([
    travel_indices[mask].astype(int).values,
    visit_indices[mask].astype(int).values
])


# 데이터 저장

In [22]:
import numpy as np
import os

save_path = "../data/"  # 또는 로컬 경로로 변경

# 1. user → travel
np.save(os.path.join(save_path, "edge_user_to_travel.npy"), edge_index_user_to_travel)

# 2. travel → visit_area
np.save(os.path.join(save_path, "edge_travel_to_visit.npy"), edge_index_travel_to_visit)

# 3. visit_area → visit_area (이동수단별)
for move_type, edge_index in edge_index_move_by_type.items():
    fname = f"edge_visit_move_{move_type}.npy"
    np.save(os.path.join(save_path, fname), edge_index)
