# 데이터 로드

In [499]:
import pandas as pd

In [500]:
# 주요 파일 경로
move_path = "../data/VL_csv/tn_move_his_이동내역_Cleaned_E.csv"
user_path = "../data/VL_csv/tn_traveller_master_여행객 Master_E_preprocessed.csv"
travel_path = "tn_travel_processed.csv"
visit_area_path = "../data/VL_csv/tn_visit_area_info_방문지정보_Cleaned_E.csv"

# 데이터 로딩
move_df = pd.read_csv(move_path)
user_df = pd.read_csv(user_path)
travel_df = pd.read_csv(travel_path)
visit_area_df = pd.read_csv(visit_area_path)

In [501]:
{
    "이동 내역": move_df.head(2),
    "유저 정보": user_df.head(2),
    "여행 정보": travel_df.head(2),
    "방문지 정보": visit_area_df.head(2)
}

{'이동 내역':    TRAVEL_ID     TRIP_ID  START_VISIT_AREA_ID  END_VISIT_AREA_ID  \
 0  e_e000004  2304300001         2.304300e+09                NaN   
 1  e_e000004  2304300002                  NaN       2.304300e+09   
 
        START_DT_MIN        END_DT_MIN  MVMN_CD_1  MVMN_CD_2  
 0  2023-04-30 13:30               NaN        NaN        NaN  
 1               NaN  2023-04-30 14:00        1.0        NaN  ,
 '유저 정보':    GENDER  EDU_NM  EDU_FNSH_SE  MARR_STTS  JOB_NM  HOUSE_INCOME  TRAVEL_TERM  \
 0       2       4          1.0          3      11           9.0            2   
 1       2       7          1.0          2       2          12.0            2   
 
    TRAVEL_LIKE_SIDO_1  TRAVEL_LIKE_SIDO_2  TRAVEL_LIKE_SIDO_3  ...  \
 0                  11                  47                  41  ...   
 1                  42                  48                  41  ...   
 
    TRAVEL_STYL_3  TRAVEL_STYL_4  TRAVEL_STYL_5  TRAVEL_STYL_6  TRAVEL_STYL_7  \
 0              4              4          

# Edge 전처리

## 장소 - 장소 엣지
- 기존에는 횟수로 계산했는데, 동선을 최적으로 제공하기 위해서 이동시간에 기반하는 것으로 계산

In [502]:
move_df

Unnamed: 0,TRAVEL_ID,TRIP_ID,START_VISIT_AREA_ID,END_VISIT_AREA_ID,START_DT_MIN,END_DT_MIN,MVMN_CD_1,MVMN_CD_2
0,e_e000004,2304300001,2.304300e+09,,2023-04-30 13:30,,,
1,e_e000004,2304300002,,2.304300e+09,,2023-04-30 14:00,1.0,
2,e_e000004,2304300003,,2.304300e+09,,2023-04-30 15:00,5.0,
3,e_e000004,2304300004,,2.304300e+09,,2023-04-30 15:30,5.0,
4,e_e000004,2304300005,,2.304300e+09,,2023-04-30 17:30,1.0,
...,...,...,...,...,...,...,...,...
21379,h_h003275,2308270002,,2.308270e+09,,2023-08-27 11:00,1.0,
21380,h_h003275,2308270003,,2.308270e+09,,2023-08-27 12:00,1.0,
21381,h_h003275,2308270004,,2.308270e+09,,2023-08-27 12:30,1.0,
21382,h_h003275,2308270005,,2.308270e+09,,2023-08-27 13:30,1.0,


In [503]:
move_df["START_DT_MIN"] = pd.to_datetime(move_df["START_DT_MIN"], errors="coerce")
move_df["END_DT_MIN"] = pd.to_datetime(move_df["END_DT_MIN"], errors="coerce")

# 필요한 열만 사용
cols = [
    "TRAVEL_ID", "TRIP_ID", "START_VISIT_AREA_ID", "END_VISIT_AREA_ID",
    "START_DT_MIN", "END_DT_MIN", "MVMN_CD_1"
]
move_df = move_df[cols].copy()

# 결과 저장 리스트
edges = []

In [504]:
# 여행별로 반복
for travel_id, group in move_df.sort_values(["TRAVEL_ID", "TRIP_ID"]).groupby("TRAVEL_ID"):
    group = group.reset_index(drop=True)
    n = len(group)
    if n < 2:
        continue

    # 첫 이동: START → END
    start_time = group.loc[0, "START_DT_MIN"]
    end_time = group.loc[1, "END_DT_MIN"]
    if pd.notna(start_time) and pd.notna(end_time):
        duration = (end_time - start_time).total_seconds() / 60
        from_id = int(group.loc[0, "START_VISIT_AREA_ID"]) if pd.notna(group.loc[0, "START_VISIT_AREA_ID"]) else None
        to_id = int(group.loc[1, "END_VISIT_AREA_ID"]) if pd.notna(group.loc[1, "END_VISIT_AREA_ID"]) else None
        transport = group.loc[1, "MVMN_CD_1"]
        if from_id and to_id and pd.notna(transport):
            edges.append([from_id, to_id, duration, int(transport)])

    # 이후 이동: END[i] → END[i+1]
    for i in range(1, n - 1):
        t1 = group.loc[i, "END_DT_MIN"]
        t2 = group.loc[i + 1, "END_DT_MIN"]
        if pd.notna(t1) and pd.notna(t2):
            duration = (t2 - t1).total_seconds() / 60
            from_id = int(group.loc[i, "END_VISIT_AREA_ID"]) if pd.notna(group.loc[i, "END_VISIT_AREA_ID"]) else None
            to_id = int(group.loc[i + 1, "END_VISIT_AREA_ID"]) if pd.notna(group.loc[i + 1, "END_VISIT_AREA_ID"]) else None
            transport = group.loc[i + 1, "MVMN_CD_1"]
            if from_id and to_id and pd.notna(transport):
                edges.append([from_id, to_id, duration, int(transport)])

# 결과 DataFrame으로 정리
edges_df = pd.DataFrame(edges, columns=[
    "FROM_VISIT_AREA_ID", "TO_VISIT_AREA_ID", "DURATION_MINUTES", "MVMN_CD_1"
])

In [505]:
# 이동수단 통합
def classify_transport(code):
    if code in [1, 2, 3]:
        return "drive"
    elif code in [4,5,6,7,8,9,10,11,12,13,50]:
        return "public"
    else:
        return "other"

edges_df["MVMN_TYPE"] = edges_df["MVMN_CD_1"].apply(classify_transport)
edges_df["is_drive"] = (edges_df["MVMN_TYPE"] == "drive").astype(int)
edges_df["is_public"] = (edges_df["MVMN_TYPE"] == "public").astype(int)
edges_df["is_other"] = (edges_df["MVMN_TYPE"] == "other").astype(int)

In [506]:
visit_area_edges_final = edges_df[[
    "FROM_VISIT_AREA_ID", "TO_VISIT_AREA_ID",
    "DURATION_MINUTES", "is_drive", "is_public", "is_other"
]]
visit_area_edges_final

Unnamed: 0,FROM_VISIT_AREA_ID,TO_VISIT_AREA_ID,DURATION_MINUTES,is_drive,is_public,is_other
0,2304300001,2304300002,30.0,1,0,0
1,2304300002,2304300003,60.0,0,1,0
2,2304300003,2304300004,30.0,0,1,0
3,2304300004,2304300005,120.0,1,0,0
4,2304300001,2304300002,30.0,0,1,0
...,...,...,...,...,...,...
18737,2308270001,2308270002,90.0,1,0,0
18738,2308270002,2308270003,60.0,1,0,0
18739,2308270003,2308270004,30.0,1,0,0
18740,2308270004,2308270005,60.0,1,0,0


## 유저 - 여행 엣지

In [507]:
# travel_df에서 TRAVEL_ID, TRAVELER_ID 사용
user_travel_edges = travel_df[["TRAVELER_ID", "TRAVEL_ID"]].copy()
user_travel_edges = user_travel_edges.dropna()

# 유저 ID와 여행 ID가 모두 있는 경우만 추출
user_travel_edges = user_travel_edges[user_travel_edges["TRAVELER_ID"].notnull() & user_travel_edges["TRAVEL_ID"].notnull()]

In [508]:
# edge로 표현
user_travel_edges = user_travel_edges.rename(columns={
    "TRAVELER_ID": "user_id",
    "TRAVEL_ID": "travel_id"
})

user_travel_edges

Unnamed: 0,user_id,travel_id
0,e000004,e_e000004
1,e000006,e_e000006
2,e000009,e_e000009
3,e000010,e_e000010
4,e000011,e_e000011
...,...,...
2555,g003294,g_g003294
2556,g005600,g_g005600
2557,g007343,g_g007343
2558,g010556,g_g010556


## 여행 - 장소 엣지

In [509]:
travel_visit_edges = visit_area_df[["TRAVEL_ID", "VISIT_AREA_ID"]].copy()
travel_visit_edges = travel_visit_edges.dropna()

travel_visit_edges["VISIT_AREA_ID"] = travel_visit_edges["VISIT_AREA_ID"].astype(int)

In [510]:
# rename for clarity
travel_visit_edges = travel_visit_edges.rename(columns={
    "TRAVEL_ID": "travel_id",
    "VISIT_AREA_ID": "visit_area_id"
})

travel_visit_edges

Unnamed: 0,travel_id,visit_area_id
0,e_e000004,2304300002
1,e_e000004,2304300003
2,e_e000004,2304300004
3,e_e000006,2304300002
4,e_e000006,2304300003
...,...,...
15936,g_g010556,2309020007
15937,g_g010556,2309020008
15938,h_h003275,2308270002
15939,h_h003275,2308270003


## 유저 - 장소 엣지

In [511]:
# 교집합이 존재하는 TRAVEL_ID 확인
travel_ids_in_user = set(travel_df["TRAVEL_ID"].unique())
travel_ids_in_visit = set(visit_area_df["TRAVEL_ID"].unique())

# 교집합 추출
common_travel_ids = travel_ids_in_user.intersection(travel_ids_in_visit)

In [512]:
# 공통 TRAVEL_ID 기반으로 필터링
travel_sub = travel_df[travel_df["TRAVEL_ID"].isin(common_travel_ids)]
visit_area_sub = visit_area_df[visit_area_df["TRAVEL_ID"].isin(common_travel_ids)]

# 조인하여 user - visit_area 생성
user_visit_edges = pd.merge(
    travel_sub[["TRAVEL_ID", "TRAVELER_ID"]],
    visit_area_sub[["TRAVEL_ID", "VISIT_AREA_ID"]],
    on="TRAVEL_ID",
    how="inner"
)

In [513]:
user_visit_edges = user_visit_edges.rename(columns={
    "TRAVELER_ID": "user_id",
    "VISIT_AREA_ID": "visit_area_id"
})
user_visit_edges["visit_area_id"] = user_visit_edges["visit_area_id"].astype(int)

user_visit_edges

Unnamed: 0,TRAVEL_ID,user_id,visit_area_id
0,e_e000004,e000004,2304300002
1,e_e000004,e000004,2304300003
2,e_e000004,e000004,2304300004
3,e_e000006,e000006,2304300002
4,e_e000006,e000006,2304300003
...,...,...,...
15936,g_g010556,g010556,2309020007
15937,g_g010556,g010556,2309020008
15938,h_h003275,h003275,2308270002
15939,h_h003275,h003275,2308270003


# Node 전처리

## 유저 노드


- 'USER_ID', 'PASSWORD', 'NAME', 'GENDER', 'BIRTHDATE',
- 'TRAVEL_TERM', 'TRAVEL_NUM',
- 'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SIDO_2', 'TRAVEL_LIKE_SIDO_3',
- 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
- 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
- 'TRAVEL_MOTIVE_1', 'TRAVEL_MOTIVE_2'

In [514]:
# 사용할 feature 컬럼 정의 (TRAVELER_ID 제외)
user_feature_cols = [
    'GENDER', 'TRAVEL_TERM', 'TRAVEL_NUM',
    'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SIDO_2', 'TRAVEL_LIKE_SIDO_3',
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
    'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1', 'TRAVEL_MOTIVE_2',
    'AGE_GRP'
]

# 결측값 0으로 채우고 float32로 변환
user_tensor = user_df[user_feature_cols].fillna(0).astype(np.float32).to_numpy()

In [563]:
user_tensor[0]

array([ 2.,  2.,  2., 11., 47., 41.,  2.,  4.,  4.,  4.,  4.,  4.,  5.,
        5.,  2.,  6., 60.], dtype=float32)

In [515]:
# ID 매핑: TRAVELER_ID → index
user_id_list = user_df["TRAVELER_ID"].tolist()
user_id_to_index = {uid: idx for idx, uid in enumerate(user_id_list)}

# 결과 요약
{
    "user_tensor_shape": user_tensor.shape,
    "user_id_sample": user_id_list[:5],
    "user_index_sample": list(user_id_to_index.items())[:5]
}

{'user_tensor_shape': (1919, 17),
 'user_id_sample': ['e004720', 'e003564', 'e000396', 'e001890', 'e007797'],
 'user_index_sample': [('e004720', 0),
  ('e003564', 1),
  ('e000396', 2),
  ('e001890', 3),
  ('e007797', 4)]}

## 여행 노드

In [516]:
# 필요한 컬럼만 추출 (TRAVEL_ID, TRAVELER_ID 제외)
excluded_cols = ['Unnamed: 0', 'TRAVEL_ID', 'TRAVELER_ID']
travel_feature_cols = [col for col in travel_df.columns if col not in excluded_cols]

# feature tensor 생성
travel_tensor = travel_df[travel_feature_cols].fillna(0).astype(np.float32).to_numpy()

In [594]:
travel_tensor[0].shape

(21,)

In [576]:
travel_feature_cols

['TOTAL_COST_BINNED_ENCODED',
 'WITH_PET',
 'MONTH',
 'DURATION',
 'MVMN_기타',
 'MVMN_대중교통',
 'MVMN_자가용',
 'TRAVEL_PURPOSE_1',
 'TRAVEL_PURPOSE_2',
 'TRAVEL_PURPOSE_3',
 'TRAVEL_PURPOSE_4',
 'TRAVEL_PURPOSE_5',
 'TRAVEL_PURPOSE_6',
 'TRAVEL_PURPOSE_7',
 'TRAVEL_PURPOSE_8',
 'TRAVEL_PURPOSE_9',
 'WHOWITH_2인여행',
 'WHOWITH_가족여행',
 'WHOWITH_기타',
 'WHOWITH_단독여행',
 'WHOWITH_친구/지인 여행']

In [517]:
# ID 매핑
travel_id_list = travel_df["TRAVEL_ID"].tolist()
travel_id_to_index = {tid: idx for idx, tid in enumerate(travel_id_list)}

# 요약 결과
{
    "travel_tensor_shape": travel_tensor.shape,
    "travel_feature_columns": travel_feature_cols,
    "travel_index_sample": list(travel_id_to_index.items())[:5]
}

{'travel_tensor_shape': (2560, 21),
 'travel_feature_columns': ['TOTAL_COST_BINNED_ENCODED',
  'WITH_PET',
  'MONTH',
  'DURATION',
  'MVMN_기타',
  'MVMN_대중교통',
  'MVMN_자가용',
  'TRAVEL_PURPOSE_1',
  'TRAVEL_PURPOSE_2',
  'TRAVEL_PURPOSE_3',
  'TRAVEL_PURPOSE_4',
  'TRAVEL_PURPOSE_5',
  'TRAVEL_PURPOSE_6',
  'TRAVEL_PURPOSE_7',
  'TRAVEL_PURPOSE_8',
  'TRAVEL_PURPOSE_9',
  'WHOWITH_2인여행',
  'WHOWITH_가족여행',
  'WHOWITH_기타',
  'WHOWITH_단독여행',
  'WHOWITH_친구/지인 여행'],
 'travel_index_sample': [('e_e000004', 0),
  ('e_e000006', 1),
  ('e_e000009', 2),
  ('e_e000010', 3),
  ('e_e000011', 4)]}

# Hetero Dataset 설계

In [518]:
import numpy as np
import torch
from torch_geometric.data import HeteroData

## Edge Index 연결

In [519]:
area_ids_final = sorted(visit_area_df["VISIT_AREA_ID"].dropna().astype(int).unique())
visit_area_id_to_index = {vid: i for i, vid in enumerate(area_ids_final)}
# visit_area_id_to_index

In [520]:
# user → travel edge_index 변환
user_travel_edges = travel_df[["TRAVELER_ID", "TRAVEL_ID"]].dropna().copy()
user_travel_edges = user_travel_edges[
    user_travel_edges["TRAVELER_ID"].isin(user_id_to_index) &
    user_travel_edges["TRAVEL_ID"].isin(travel_id_to_index)
]

user_travel_edge_index = np.array([
    [user_id_to_index[uid] for uid in user_travel_edges["TRAVELER_ID"]],
    [travel_id_to_index[tid] for tid in user_travel_edges["TRAVEL_ID"]]
], dtype=np.int64)

In [521]:
# travel → visit_area edge_index 변환
travel_visit = visit_area_df[["TRAVEL_ID", "VISIT_AREA_ID"]].dropna().copy()
travel_visit["VISIT_AREA_ID"] = travel_visit["VISIT_AREA_ID"].astype(int)

travel_visit = travel_visit[
    travel_visit["TRAVEL_ID"].isin(travel_id_to_index) &
    travel_visit["VISIT_AREA_ID"].isin(visit_area_id_to_index)
]

travel_visit_edge_index = np.array([
    [travel_id_to_index[tid] for tid in travel_visit["TRAVEL_ID"]],
    [visit_area_id_to_index[vid] for vid in travel_visit["VISIT_AREA_ID"]]
], dtype=np.int64)

In [522]:
# 결과 요약
{
    "user_travel_edge_index_shape": user_travel_edge_index.shape,
    "travel_visit_edge_index_shape": travel_visit_edge_index.shape,
    "user_travel_edge_example": user_travel_edge_index[:, :5].tolist(),
    "travel_visit_edge_example": travel_visit_edge_index[:, :5].tolist()
}


{'user_travel_edge_index_shape': (2, 1919),
 'travel_visit_edge_index_shape': (2, 15941),
 'user_travel_edge_example': [[944, 915, 308, 160, 373], [0, 2, 3, 5, 6]],
 'travel_visit_edge_example': [[0, 0, 0, 1, 1], [20, 21, 22, 20, 21]]}

## Visit Area 데이터 전처리

In [523]:
edges_df_filtered = edges_df[
    edges_df["FROM_VISIT_AREA_ID"].isin(visit_area_id_to_index) &
    edges_df["TO_VISIT_AREA_ID"].isin(visit_area_id_to_index)
].copy()

In [524]:
# visit_area_edge_index 생성
visit_area_edge_index = torch.tensor([
    [visit_area_id_to_index[f] for f in edges_df_filtered["FROM_VISIT_AREA_ID"]],
    [visit_area_id_to_index[t] for t in edges_df_filtered["TO_VISIT_AREA_ID"]]
], dtype=torch.long)

In [525]:
# visit_area_edge_attr 생성
visit_area_edge_attr = torch.tensor(
    edges_df_filtered[["DURATION_MINUTES", "is_drive", "is_public", "is_other"]].to_numpy(),
    dtype=torch.float
)

In [526]:
# visit_area_tensor (dummy: all zeros)
visit_area_tensor = torch.zeros((len(visit_area_id_to_index), 1), dtype=torch.float)

# 요약
{
    "visit_area_edge_index_shape": visit_area_edge_index.shape,
    "visit_area_edge_attr_shape": visit_area_edge_attr.shape,
    "visit_area_tensor_shape": visit_area_tensor.shape
}

{'visit_area_edge_index_shape': torch.Size([2, 18461]),
 'visit_area_edge_attr_shape': torch.Size([18461, 4]),
 'visit_area_tensor_shape': torch.Size([1432, 1])}

## Hetero Data 병합

In [527]:
data = HeteroData()

# 노드 등록
data['user'].x = torch.tensor(user_tensor, dtype=torch.float)
data['travel'].x = torch.tensor(travel_tensor, dtype=torch.float)
data['visit_area'].x = torch.tensor(visit_area_tensor, dtype=torch.float)  # dummy

  data['visit_area'].x = torch.tensor(visit_area_tensor, dtype=torch.float)  # dummy


In [528]:
# 엣지 등록
data['user', 'traveled', 'travel'].edge_index = torch.tensor(user_travel_edge_index, dtype=torch.long)
data['travel', 'contains', 'visit_area'].edge_index = torch.tensor(travel_visit_edge_index, dtype=torch.long)
data['visit_area', 'moved_to', 'visit_area'].edge_index = torch.tensor(visit_area_edge_index, dtype=torch.long)
data['visit_area', 'moved_to', 'visit_area'].edge_attr = torch.tensor(visit_area_edge_attr, dtype=torch.float)

  data['visit_area', 'moved_to', 'visit_area'].edge_index = torch.tensor(visit_area_edge_index, dtype=torch.long)
  data['visit_area', 'moved_to', 'visit_area'].edge_attr = torch.tensor(visit_area_edge_attr, dtype=torch.float)


In [529]:
# reverse edges 추가시
data['travel', 'traveled_by', 'user'].edge_index = data['user', 'traveled', 'travel'].edge_index[[1, 0]]
data['visit_area', 'contained_in', 'travel'].edge_index = data['travel', 'contains', 'visit_area'].edge_index[[1, 0]]

# HeteroData 확인
print(data)
print(data.metadata())

HeteroData(
  user={ x=[1919, 17] },
  travel={ x=[2560, 21] },
  visit_area={ x=[1432, 1] },
  (user, traveled, travel)={ edge_index=[2, 1919] },
  (travel, contains, visit_area)={ edge_index=[2, 15941] },
  (visit_area, moved_to, visit_area)={
    edge_index=[2, 18461],
    edge_attr=[18461, 4],
  },
  (travel, traveled_by, user)={ edge_index=[2, 1919] },
  (visit_area, contained_in, travel)={ edge_index=[2, 15941] }
)
(['user', 'travel', 'visit_area'], [('user', 'traveled', 'travel'), ('travel', 'contains', 'visit_area'), ('visit_area', 'moved_to', 'visit_area'), ('travel', 'traveled_by', 'user'), ('visit_area', 'contained_in', 'travel')])


# GNN 모델 재설계

In [663]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear

class RouteGNN(nn.Module):
    def __init__(self, metadata, hidden_channels=128):
        super().__init__()
        self.metadata = metadata

        # 💡 실제 입력 차원에 맞게 조정 (고정된 입력 차원 사용)
        self.embeddings = nn.ModuleDict({
            'user': Linear(17, hidden_channels),
            'travel': Linear(21, hidden_channels),
            'visit_area': Linear(1, hidden_channels),  # dummy feature
        })

        # 💡 HeteroConv 레이어 2단
        self.gnn1 = HeteroConv({
            edge_type: SAGEConv((-1, -1), hidden_channels)
            for edge_type in metadata[1]
        }, aggr='sum')

        self.gnn2 = HeteroConv({
            edge_type: SAGEConv((hidden_channels, hidden_channels), hidden_channels)
            for edge_type in metadata[1]
        }, aggr='sum')

        # 💡 Link prediction head
        self.link_predictor = nn.Sequential(
            nn.Linear(2 * hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, 1)
        )

    def forward(self, x_dict, edge_index_dict):
        # 🔐 임베딩 처리 (None 방지)
        x_dict = {
            node_type: self.embeddings[node_type](x) if x is not None else None
            for node_type, x in x_dict.items()
        }
        
        # 💬 GNN 메시지 전달
        x_dict = self.gnn1(x_dict, edge_index_dict)
        x_dict = {k: F.relu(v) for k, v in x_dict.items() if v is not None}
        x_dict = self.gnn2(x_dict, edge_index_dict)

        return x_dict

    def predict_link(self, node_embed, edge_index):
        """
        visit_area 간 (i, j) edge 쌍을 받아 score 출력
        """
        src, dst = edge_index
        z_src = node_embed[src]
        z_dst = node_embed[dst]
        z = torch.cat([z_src, z_dst], dim=-1)
        return self.link_predictor(z).squeeze(-1)


# GNN 학습

In [661]:
import torch
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss

def train(model, data, optimizer, epochs=50, device='cpu'):
    model.to(device)
    data = data.to(device)
    loss_fn = BCEWithLogitsLoss()
    
    train_loss = []
    train_acc = []

    # ✅ positive edge (실제 이동 경로)
    edge_index = data['visit_area', 'moved_to', 'visit_area'].edge_index
    pos_edge_src = edge_index[0]
    pos_edge_dst = edge_index[1]
    pos_edge_pairs = torch.stack([pos_edge_src, pos_edge_dst], dim=1)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        # 1. Forward GNN
        x_dict = model(data.x_dict, data.edge_index_dict)

        # 2. Positive score
        pos_scores = model.predict_link(x_dict['visit_area'], edge_index)

        # 3. Negative sampling
        num_pos = pos_edge_pairs.size(0)
        neg_src = torch.randint(0, data['visit_area'].num_nodes, (num_pos,), device=device)
        neg_dst = torch.randint(0, data['visit_area'].num_nodes, (num_pos,), device=device)
        neg_edge_index = torch.stack([neg_src, neg_dst], dim=0)

        neg_scores = model.predict_link(x_dict['visit_area'], neg_edge_index)

        # 4. 라벨 결합
        scores = torch.cat([pos_scores, neg_scores], dim=0)
        labels = torch.cat([torch.ones_like(pos_scores), torch.zeros_like(neg_scores)], dim=0)

        # 5. 손실 계산
        loss = loss_fn(scores, labels)
        loss.backward()
        optimizer.step()

        # 6. 정확도 계산
        pred = (torch.sigmoid(scores) > 0.5).float()
        correct = (pred == labels).sum().item()
        acc = correct / labels.size(0)

        if (epoch+1) % 10 == 0:
            print(f"[Epoch {epoch+1}] Loss: {loss.item():.4f} | Acc: {acc*100:.2f}%")

        train_loss.append(loss.item())
        train_acc.append(acc)

    return train_loss, train_acc


In [None]:
model = RouteGNN(metadata=data.metadata(), hidden_channels=128)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

tran_loss, train_acc = train(model, data, optimizer, epochs=200, device='cuda' if torch.cuda.is_available() else 'cpu')

# 모델 추론

In [651]:
def recommend_route(node_embed, edge_index, edge_scores, start_node=None, max_steps=5):
    """
    visit_area 노드 임베딩, 엣지 index, score가 주어졌을 때
    가장 높은 score 기준으로 동선을 구성하는 greedy 경로 추천 함수
    """
    from collections import defaultdict

    # 엣지를 점수 기준으로 정렬
    scored_edges = list(zip(edge_index[0].tolist(), edge_index[1].tolist(), edge_scores.tolist()))
    scored_edges.sort(key=lambda x: -x[2])  # 높은 점수 순

    # 경로 생성
    visited = set()
    route = []

    current = start_node if start_node is not None else scored_edges[0][0]
    visited.add(current)
    route.append(current)

    for _ in range(max_steps - 1):
        # current에서 시작하는 후보 중 아직 방문하지 않은 곳
        candidates = [dst for src, dst, score in scored_edges if src == current and dst not in visited]
        if not candidates:
            break
        next_node = candidates[0]  # greedy하게 최고 점수 선택
        visited.add(next_node)
        route.append(next_node)
        current = next_node

    return route  # index 형태


In [None]:
def infer_route(model, data, user_input, travel_input, k=5, device='cpu'):
    model.eval()
    data = data.to(device)
    user_input = user_input.to(device)
    travel_input = travel_input.to(device)

    with torch.no_grad():
        # 1. user_input과 travel_input은 임베딩 전 상태임 (17차원, 21차원)

        # 2. 기존 raw feature와 합치기 (같은 차원 기준으로)
        x_dict_raw = {
            'user': torch.cat([data['user'].x, user_input], dim=0),       # [N+1, 17]
            'travel': torch.cat([data['travel'].x, travel_input], dim=0), # [M+1, 21]
            'visit_area': data['visit_area'].x                             # [V, 1]
        }

        # 3. forward: model이 내부에서 임베딩 처리함
        x_dict = model(x_dict_raw, data.edge_index_dict)

        visit_area_embed = x_dict['visit_area']

        # 4. 모든 visit_area 노드 쌍에 대해 link prediction
        n = visit_area_embed.size(0)
        all_edges = torch.combinations(torch.arange(n, device=device), r=2).t()
        edge_scores = model.predict_link(visit_area_embed, all_edges)

        # 5. 경로 구성
        route = recommend_route(visit_area_embed, all_edges, edge_scores, max_steps=k)

    return route


## 추론 입력 전처리

### 1) 유저 정보

In [653]:
def get_age_group(birthdate_str):
    """
    'YYYY-MM-DD' 형식의 생년월일 문자열을 받아
    20, 30, 40 등의 나이대로 변환하는 함수
    """
    from datetime import datetime
    
    birth_year = int(birthdate_str[:4])
    current_year = datetime.now().year
    age = current_year - birth_year + 1  # 한국식 나이
    age_group = (age // 10) * 10
    return age_group

def map_sido(sido:str):
    sido_code_map = {
        '서울특별시': '11',
        '부산광역시': '26',
        '대구광역시': '27',
        '인천광역시': '28',
        '광주광역시': '29',
        '대전광역시': '30',
        '울산광역시': '31',
        '세종특별자치시': '36',
        '경기도': '41',
        '강원도': '42',
        '충청북도': '43',
        '충청남도': '44',
        '전라북도': '45',
        '전라남도': '46',
        '경상북도': '47',
        '경상남도': '48',
        '제주특별자치도': '50'
    }

    return int(sido_code_map[sido])

In [654]:
def process_user_input(user_info:dict):
    user_feature_cols = [
    'GENDER', 'TRAVEL_TERM', 'TRAVEL_NUM',
    'TRAVEL_LIKE_SIDO_1', 'TRAVEL_LIKE_SIDO_2', 'TRAVEL_LIKE_SIDO_3',
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
    'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1', 'TRAVEL_MOTIVE_2',
    'AGE_GRP'
    ]
    
    # 1. 나잇대 계산
    user_info['AGE_GRP'] = get_age_group(user_info['BIRTHDATE'])
    
    # 2. 시도 변환
    for i in range(1, 4):
        user_info[f"TRAVEL_LIKE_SIDO_{i}"] = map_sido(user_info[f"TRAVEL_LIKE_SIDO_{i}"])
    
    # 3. 컬럼 필터링 (순서에 맞게)
    user_info = {k: int(user_info[k]) for k in user_feature_cols}
    
    return pd.DataFrame([user_info]).fillna(0).astype(np.float32).to_numpy()

In [655]:
temp_info = {'USER_ID': 'admin', 'PASSWORD': 'admin', 'CONFIRM_PASSWORD': 'admin', 'NAME': '유상범', 'BIRTHDATE': '1999-08-10', 'GENDER': '1', 'EDU_NM': '6', 'EDU_FNSH_SE': '2', 'MARR_STTS': '1', 'JOB_NM': '1', 'INCOME': '100', 'HOUSE_INCOME': '10000', 'TRAVEL_TERM': '1', 'TRAVEL_LIKE_SIDO_1': '부산광역시', 'TRAVEL_LIKE_SIDO_2': '전라남도', 'TRAVEL_LIKE_SIDO_3': '충청남도', 'TRAVEL_STYL_1': 4, 'TRAVEL_STYL_2': 4, 'TRAVEL_STYL_3': 1, 'TRAVEL_STYL_4': 5, 'TRAVEL_STYL_5': 2, 'TRAVEL_STYL_6': 4, 'TRAVEL_STYL_7': 3, 'TRAVEL_STYL_8': 2, 'TRAVEL_MOTIVE_1': '7', 'TRAVEL_MOTIVE_2': '7', 'FAMILY_MEMB': '1', 'TRAVEL_NUM': '1', 'TRAVEL_COMPANIONS_NUM': '1'}


test_user_tensor = process_user_input(temp_info)

print(test_user_tensor.shape)
test_user_tensor

(1, 17)


array([[ 1.,  1.,  1., 26., 46., 44.,  4.,  4.,  1.,  5.,  2.,  4.,  3.,
         2.,  7.,  7., 20.]], dtype=float32)

### 2) 여행 정보

{'mission_ENC': '0,5,7,9', 'date_range': '2025-05-28 - 2025-05-30', 'start_date': '', 'end_date': '', 'TOTAL_COST': '50', 'MVMN_NM_ENC': '2', 'whowith_ENC': '3', 'mission_type': 'normal'}

In [656]:
def process_travel_input(travel_info:dict):
    from datetime import datetime
    travel_feature_cols = [
        'TOTAL_COST_BINNED_ENCODED',
        'WITH_PET',
        'MONTH',
        'DURATION',
        'MVMN_기타',
        'MVMN_대중교통',
        'MVMN_자가용',
        'TRAVEL_PURPOSE_1',
        'TRAVEL_PURPOSE_2',
        'TRAVEL_PURPOSE_3',
        'TRAVEL_PURPOSE_4',
        'TRAVEL_PURPOSE_5',
        'TRAVEL_PURPOSE_6',
        'TRAVEL_PURPOSE_7',
        'TRAVEL_PURPOSE_8',
        'TRAVEL_PURPOSE_9',
        'WHOWITH_2인여행',
        'WHOWITH_가족여행',
        'WHOWITH_기타',
        'WHOWITH_단독여행',
        'WHOWITH_친구/지인 여행']
    
    
    # mission_ENC에 0 = 반려동물 동반 (WITH_PET)
    travel_info['mission_ENC'] = travel_info['mission_ENC'].strip().split(',')
    if '0' in travel_info['mission_ENC']:
        travel_info['WITH_PET'] = 1
    else:
        travel_info['WITH_PET'] = 0
        
    # TRAVEL_PURPOSE_1 ~~ TRAVEL_PURPOSE_9 (0으로 들어온 입력은 제거해줘야됨) 
    for i in range(1,10):
        if str(i) in travel_info['mission_ENC']:
            travel_info[f'TRAVEL_PURPOSE_{i}'] = 1
        else:
            travel_info[f'TRAVEL_PURPOSE_{i}'] = 0
        
    # MONTH
    dates = travel_info['date_range'].split(' - ')
    travel_info['start_date'] = datetime.strptime(dates[0].strip(), "%Y-%m-%d")
    travel_info['end_date'] = datetime.strptime(dates[1].strip(), "%Y-%m-%d")
    
    travel_info['MONTH'] = travel_info['end_date'].month
    
    # DURATION
    travel_info['DURATION'] = (travel_info['end_date'] - travel_info['start_date']).days
    
    # MNVM_기타, MVMN_대중교통, MVMN_자가용
    for m in ['자가용', '대중교통', '기타']:
        travel_info[f"MVMN_{m}"] = False
    
    if travel_info['MVMN_NM_ENC'] == '1':
        travel_info['MVMN_자가용'] = True
    elif travel_info['MVMN_NM_ENC'] == '2':
        travel_info['MVMN_대중교통'] = True
    else:
        travel_info['MVMN_기타'] = True
    
    # WHOWITH는 1부터 5까지 숫자로 들어옴 -> 원핫 인코딩으로 수정할 것
    # dict에 들어오는 숫자 의미: WHOWITH_단독여행, WHOWITH_2인여행, WHOWITH_가족여행, WHOWITH_친구/지인여행, WHOWITH_기타
    whowith_onehot = [0] * 5
    idx = int(travel_info['whowith_ENC']) - 1
    if 0 <= idx < 5:
        whowith_onehot[idx] = 1
    
    travel_info.update({
    'WHOWITH_단독여행': whowith_onehot[0],
    'WHOWITH_2인여행': whowith_onehot[1],
    'WHOWITH_가족여행': whowith_onehot[2],
    'WHOWITH_친구/지인 여행': whowith_onehot[3],
    'WHOWITH_기타': whowith_onehot[4],
    })
    
    # TOTAL_COST_BINNED_ENCODED
    travel_info['TOTAL_COST_BINNED_ENCODED'] = travel_info['TOTAL_COST'][-1]
    
    # 컬럼 필터링 (순서에 맞게)
    travel_info = {k: int(travel_info[k]) for k in travel_feature_cols}
    
    return pd.DataFrame([travel_info]).fillna(0).astype(np.float32).to_numpy()

In [657]:
test_travel = {'mission_ENC': '0,5,7,9', 'date_range': '2025-05-28 - 2025-05-30', 'start_date': '', 'end_date': '', 'TOTAL_COST': '1', 'MVMN_NM_ENC': '2', 'whowith_ENC': '3', 'mission_type': 'normal'}


test_travel_tensor = process_travel_input(test_travel)

print(test_travel_tensor.shape)
test_travel_tensor

(1, 21)


array([[1., 1., 5., 2., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 0., 0.]], dtype=float32)

In [658]:
user_input = torch.tensor(test_user_tensor, dtype=torch.float)  # 17차원
travel_input = torch.tensor(test_travel_tensor, dtype=torch.float)  # 21차원

user_input.shape, travel_input.shape

(torch.Size([1, 17]), torch.Size([1, 21]))

In [691]:
route_indices = infer_route(model, data, user_input, travel_input, k=6)

{'user': tensor([[ -6.1258, -11.1092,   7.3248,  ..., -13.1022,  -6.4677,   1.7539],
        [ -1.4838,  -8.8236,   2.4118,  ...,  -2.0858,  -5.1666,  -3.3756],
        [ -1.8175,  -9.2558,  -0.6166,  ...,  -1.3545,  -2.8330,  -2.7553],
        ...,
        [ -9.4388,  -6.5148,   1.9036,  ...,   2.7162,  -5.2777,   7.0602],
        [  2.2000, -10.1274,   1.2445,  ...,  -2.2761,  -3.9236,  -4.9991],
        [  3.1350,  -9.2342,   1.4030,  ...,  -5.6976,  -4.2754,  -4.5423]]), 'travel': tensor([[-0.2774, -0.9591,  0.5795,  ...,  0.9120,  0.2105,  0.3297],
        [ 0.0944, -0.8981,  0.4785,  ...,  0.7563,  0.2961,  0.2052],
        [-0.6656, -0.5887,  0.2132,  ...,  1.0141,  0.3621,  0.2081],
        ...,
        [-0.7150, -1.1076,  0.6699,  ...,  1.1899, -0.1484,  0.4139],
        [-0.5545, -1.0201,  0.8530,  ...,  1.2654, -0.0839,  0.6337],
        [-0.4121, -0.6833,  0.5774,  ...,  0.4632,  0.8349,  0.3017]]), 'visit_area': tensor([[ 0.4129, -0.8827,  0.3039,  ..., -0.6787,  0.6370, -

In [692]:
# visit_area_id로 역매핑
index_to_id = {v: k for k, v in visit_area_id_to_index.items()}
route_ids = [index_to_id[idx] for idx in route_indices]
print("추천 동선:", route_ids)

추천 동선: [2307300011, 2307300012, 2308040013, 2308160007, 2308290010, 2309110002, 2309110005, 2309110006, 2309110007, 2309110008]


In [697]:
for id in route_ids:
    print(id, " : ", visit_area_df[visit_area_df['VISIT_AREA_ID'] == id]['VISIT_AREA_NM'].values)

2307300011  :  ['서울월드컵경기장']
2307300012  :  ['나인 트리 프리미어 로카우스 호텔 서울 용산']
2308040013  :  ['청파 책가도']
2308160007  :  ['무지개 농원 펜션' '비틀스']
2308290010  :  ['상계주공 1단지 아파트' '울산역 주차장']
2309110002  :  ['AK플라자 홍대']
2309110005  :  ['AK플라자 홍대']
2309110006  :  ['수나 코 3호점']
2309110007  :  ['훠궈 나라 홍대점']
2309110008  :  ['뉴 코인 싱어 노래연습장']


In [698]:
visit_area_df[visit_area_df['VISIT_AREA_ID'] == 2308160007]

Unnamed: 0,VISIT_AREA_ID,TRAVEL_ID,VISIT_ORDER,VISIT_AREA_NM,VISIT_START_YMD,VISIT_END_YMD,ROAD_NM_ADDR,LOTNO_ADDR,X_COORD,Y_COORD,...,POI_NM,RESIDENCE_TIME_MIN,VISIT_AREA_TYPE_CD,REVISIT_YN,VISIT_CHC_REASON_CD,LODGING_TYPE_CD,DGSTFN,REVISIT_INTENTION,RCMDTN_INTENTION,SGG_CD
11054,2308160007,e_e005876,6,무지개 농원 펜션,2023-08-16,2023-08-17,경기 가평군 북면 꽃넘이길 115-47,경기 가평군 북면 제령리 557-7,127.518702,37.898967,...,무지개농원펜션,540.0,24,N,2.0,3.0,5.0,5.0,5.0,
12557,2308160007,e_e007399,7,비틀스,2023-08-16,2023-08-16,서울 서대문구 연세로7길 34-6,서울 서대문구 창천동 52-77,126.935079,37.557961,...,비틀즈,60.0,11,Y,4.0,,5.0,4.0,5.0,


In [700]:
visit_area_df[visit_area_df['VISIT_AREA_ID'] == 2307300012]

Unnamed: 0,VISIT_AREA_ID,TRAVEL_ID,VISIT_ORDER,VISIT_AREA_NM,VISIT_START_YMD,VISIT_END_YMD,ROAD_NM_ADDR,LOTNO_ADDR,X_COORD,Y_COORD,...,POI_NM,RESIDENCE_TIME_MIN,VISIT_AREA_TYPE_CD,REVISIT_YN,VISIT_CHC_REASON_CD,LODGING_TYPE_CD,DGSTFN,REVISIT_INTENTION,RCMDTN_INTENTION,SGG_CD
9250,2307300012,e_e004784,12,나인 트리 프리미어 로카우스 호텔 서울 용산,2023-07-30,2023-07-31,서울 용산구 한강대로23길 25,서울 용산구 한강로3가 40-1050,126.964386,37.527474,...,나인트리 프리미어 로카우스 호텔 서울 용산,30.0,24,N,7.0,1.0,3.0,3.0,3.0,


## VISIT_AREA 중복 문제
- 이전 장소와 다음 장소의 좌표를 갖고 동선상의 무리가 없는지 확인
- 거리를 바탕으로 필터링 수행

In [701]:
import numpy as np

def select_best_location_by_distance(route_ids, visit_area_df):
    selected_names = []

    for idx, vid in enumerate(route_ids):
        candidates = visit_area_df[visit_area_df['VISIT_AREA_ID'] == vid]

        # 후보가 하나일 경우 바로 선택
        if len(candidates) == 1:
            selected_names.append(candidates.iloc[0]['VISIT_AREA_NM'])
            continue

        # 이전/다음 위치 좌표 확보
        prev_coord = None
        next_coord = None

        if idx > 0:
            prev_id = route_ids[idx - 1]
            prev_row = visit_area_df[visit_area_df['VISIT_AREA_ID'] == prev_id]
            if not prev_row.empty:
                prev_coord = (prev_row.iloc[0]['X_COORD'], prev_row.iloc[0]['Y_COORD'])

        if idx < len(route_ids) - 1:
            next_id = route_ids[idx + 1]
            next_row = visit_area_df[visit_area_df['VISIT_AREA_ID'] == next_id]
            if not next_row.empty:
                next_coord = (next_row.iloc[0]['X_COORD'], next_row.iloc[0]['Y_COORD'])

        # 거리 계산 함수
        def total_distance(row):
            x, y = row['X_COORD'], row['Y_COORD']
            dist = 0
            if prev_coord:
                dist += np.linalg.norm(np.array([x, y]) - np.array(prev_coord))
            if next_coord:
                dist += np.linalg.norm(np.array([x, y]) - np.array(next_coord))
            return dist

        # 최단 거리 후보 선택
        best_row = candidates.loc[candidates.apply(total_distance, axis=1).idxmin()]
        selected_names.append(best_row['VISIT_AREA_NM'])

    return selected_names

In [702]:
names = select_best_location_by_distance(route_ids, visit_area_df)

for vid, name in zip(route_ids, names):
    print(vid, ":", name)

2307300011 : 서울월드컵경기장
2307300012 : 나인 트리 프리미어 로카우스 호텔 서울 용산
2308040013 : 청파 책가도
2308160007 : 비틀스
2308290010 : 상계주공 1단지 아파트
2309110002 : AK플라자 홍대
2309110005 : AK플라자 홍대
2309110006 : 수나 코 3호점
2309110007 : 훠궈 나라 홍대점
2309110008 : 뉴 코인 싱어 노래연습장


# 모델 및 pkl 파일 저장

In [708]:
import pickle

# 모델 저장
torch.save(model.state_dict(), './pickle/routegnn_model.pt')

# ID 매핑 저장
with open('./pickle/user_id_to_index.pkl', 'wb') as f:
    pickle.dump(user_id_to_index, f)

with open('./pickle/travel_id_to_index.pkl', 'wb') as f:
    pickle.dump(travel_id_to_index, f)

with open('./pickle/visit_area_id_to_index.pkl', 'wb') as f:
    pickle.dump(visit_area_id_to_index, f)
    
with open('./pickle/dataset.pkl', 'wb') as f:
    pickle.dump(data, f)

# 장소 정보 저장
visit_area_df.to_pickle('./pickle/visit_area_df.pkl')