In [45]:
import os
import pandas as pd
import numpy as np


import torch
import torch.nn as nn
from torch_geometric.nn import HANConv
from torch_geometric.data import HeteroData

In [46]:
class HAN_TravelRecommender(nn.Module):
    def __init__(self, metadata, hidden_dim=64, out_dim=1):
        super().__init__()
        self.han_conv = HANConv(
            in_channels=-1,
            out_channels=hidden_dim,
            metadata=metadata,
            heads=2
        )
        self.lin = nn.Linear(hidden_dim, out_dim)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.han_conv(x_dict, edge_index_dict)
        return self.lin(x_dict['visit_area'])

# Label값 설계

- 만족도(DGSTFN)와 추천의향(RCMDTN_INTENTION)**을 기반으로 라벨 벡터를 생성

In [47]:
# 1. visit_df 로드 (방문지 정보)
visit_df = pd.read_csv("../data/VL_csv/tn_visit_area_info_방문지정보_Cleaned_E.csv")

# 2. visit_id_map 생성
unique_visit_ids = visit_df['VISIT_AREA_ID'].dropna().unique()
visit_id_map = {id_: i for i, id_ in enumerate(unique_visit_ids)}

# 3. VISIT_AREA_ID별로 DGSTFN, RCMDTN_INTENTION 평균 계산
label_agg = visit_df.groupby('VISIT_AREA_ID')[['DGSTFN', 'RCMDTN_INTENTION']].mean()


In [48]:
# 4. visit_id_map 순서에 맞게 라벨 정렬
visit_id_list = list(visit_id_map.keys())
label_df = label_agg.reindex(visit_id_list).fillna(0)

# 5. (torch 불가 환경에서는 NumPy로 유지)
label_array = label_df.values.astype(np.float32)  # shape = (num_visit_area, 2)

# 6. 예시 출력
print("Label shape:", label_array.shape)
print("Label preview:\n", label_array[:5])

Label shape: (1432, 2)
Label preview:
 [[4.7272725 4.4545455]
 [4.3333335 4.       ]
 [4.5454545 4.3636365]
 [4.142857  4.142857 ]
 [4.25      4.25     ]]


# 데이터 생성

In [49]:
# 경로 설정 (필요 시 로컬 경로로 변경)
data_path = "../data/"

# 1. 노드 feature 로드
user_x = torch.tensor(np.load(os.path.join(data_path, "user_features.npy")), dtype=torch.float)
travel_x = torch.tensor(np.load(os.path.join(data_path, "travel_features.npy")), dtype=torch.float)
visit_x = torch.tensor(np.load(os.path.join(data_path, "visit_features.npy")), dtype=torch.float)

# 2. 엣지 로드
edge_user_to_travel = torch.tensor(np.load(os.path.join(data_path, "edge_user_to_travel.npy")), dtype=torch.long)
edge_travel_to_visit = torch.tensor(np.load(os.path.join(data_path, "edge_travel_to_visit.npy")), dtype=torch.long)

# 3. HeteroData 객체 구성
data = HeteroData()
data['user'].x = user_x
data['travel'].x = travel_x
data['visit_area'].x = visit_x

data[('user', 'traveled', 'travel')].edge_index = edge_user_to_travel
data[('travel', 'contains', 'visit_area')].edge_index = edge_travel_to_visit

# 4. 이동수단별 엣지 자동 로드 및 추가
for fname in os.listdir(data_path):
    if fname.startswith("edge_visit_move_") and fname.endswith(".npy"):
        move_type = fname.replace("edge_visit_move_", "").replace(".npy", "")
        edge_index = torch.tensor(np.load(os.path.join(data_path, fname)), dtype=torch.long)
        data[('visit_area', f'move_{move_type}', 'visit_area')].edge_index = edge_index

print("Metadata:", data.metadata())

Metadata: (['user', 'travel', 'visit_area'], [('user', 'traveled', 'travel'), ('travel', 'contains', 'visit_area'), ('visit_area', 'move_16', 'visit_area'), ('visit_area', 'move_15', 'visit_area'), ('visit_area', 'move_14', 'visit_area'), ('visit_area', 'move_10', 'visit_area'), ('visit_area', 'move_11', 'visit_area'), ('visit_area', 'move_13', 'visit_area'), ('visit_area', 'move_12', 'visit_area'), ('visit_area', 'move_8', 'visit_area'), ('visit_area', 'move_9', 'visit_area'), ('visit_area', 'move_1', 'visit_area'), ('visit_area', 'move_2', 'visit_area'), ('visit_area', 'move_3', 'visit_area'), ('visit_area', 'move_7', 'visit_area'), ('visit_area', 'move_6', 'visit_area'), ('visit_area', 'move_4', 'visit_area'), ('visit_area', 'move_5', 'visit_area')])


# 학습 파이프라인

In [50]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split

# 모델 정의
model = HAN_TravelRecommender(data.metadata(), hidden_dim=64, out_dim=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

# 라벨 텐서와 학습 가능 인덱스
label_tensor = torch.tensor(label_array, dtype=torch.float)
valid_mask = (label_tensor.sum(dim=1) > 0)
valid_idx = torch.arange(label_tensor.shape[0])[valid_mask]

# train/val 분할 (stratify 없음 - 연속값이므로)
train_idx, val_idx = train_test_split(valid_idx.numpy(), test_size=0.2, random_state=42)
train_idx = torch.tensor(train_idx)
val_idx = torch.tensor(val_idx)

# 라벨도 선택
y_train = label_tensor[train_idx]
y_val = label_tensor[val_idx]

# 학습 루프
for epoch in range(1, 101):
    model.train()
    optimizer.zero_grad()

    out = model(data.x_dict, data.edge_index_dict)
    pred_train = out[train_idx]
    loss = criterion(pred_train, y_train)
    loss.backward()
    optimizer.step()

    # 검증
    model.eval()
    with torch.no_grad():
        pred_val = model(data.x_dict, data.edge_index_dict)[val_idx]
        val_loss = criterion(pred_val, y_val)

        # RMSE 계산 (평균)
        rmse = torch.sqrt(F.mse_loss(pred_val, y_val))
        mae = F.l1_loss(pred_val, y_val)

    if epoch % 10 == 0:
        print(f"[Epoch {epoch}] Train Loss: {loss.item():.4f} | Val Loss: {val_loss.item():.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")

[Epoch 10] Train Loss: 16.0863 | Val Loss: 15.3936 | RMSE: 3.9235 | MAE: 3.8295
[Epoch 20] Train Loss: 14.6298 | Val Loss: 13.7058 | RMSE: 3.7021 | MAE: 3.5430
[Epoch 30] Train Loss: 14.2043 | Val Loss: 13.0750 | RMSE: 3.6159 | MAE: 3.4400
[Epoch 40] Train Loss: 13.2057 | Val Loss: 12.4913 | RMSE: 3.5343 | MAE: 3.3601
[Epoch 50] Train Loss: 12.0282 | Val Loss: 11.6067 | RMSE: 3.4069 | MAE: 3.2286
[Epoch 60] Train Loss: 10.6923 | Val Loss: 10.3548 | RMSE: 3.2179 | MAE: 3.0197
[Epoch 70] Train Loss: 9.1769 | Val Loss: 8.9095 | RMSE: 2.9849 | MAE: 2.7697
[Epoch 80] Train Loss: 7.4391 | Val Loss: 7.2754 | RMSE: 2.6973 | MAE: 2.4573
[Epoch 90] Train Loss: 5.7376 | Val Loss: 5.6569 | RMSE: 2.3784 | MAE: 2.1173
[Epoch 100] Train Loss: 4.3578 | Val Loss: 4.3222 | RMSE: 2.0790 | MAE: 1.8162


# 모델 결과값 검증

In [51]:
# 1. 모델 출력
model.eval()
with torch.no_grad():
    out = model(data.x_dict, data.edge_index_dict)  # [num_visit_area, 2]

In [52]:
# 라벨이 있는 index (train + val의 union)
valid_mask = (label_tensor.sum(dim=1) > 0)
valid_idx = torch.arange(label_tensor.shape[0])[valid_mask]

# 모델 예측값 중 해당 인덱스만 추출
pred = out[valid_idx]
true = label_tensor[valid_idx]

# 결과 정리
import pandas as pd
result_df = pd.DataFrame({
    'visit_area_index': valid_idx.cpu().numpy(),
    'pred_dgstfn': pred[:, 0].cpu().numpy(),
    'pred_rcmdtn': pred[:, 1].cpu().numpy(),
    'true_dgstfn': true[:, 0].cpu().numpy(),
    'true_rcmdtn': true[:, 1].cpu().numpy(),
})
result_df.head(10)


Unnamed: 0,visit_area_index,pred_dgstfn,pred_rcmdtn,true_dgstfn,true_rcmdtn
0,0,4.166882,4.908982,4.727273,4.454545
1,1,2.971125,1.966112,4.333333,4.0
2,2,3.02937,1.870136,4.545455,4.363636
3,3,2.77943,1.863747,4.142857,4.142857
4,4,4.136183,2.407371,4.25,4.25
5,5,2.762706,1.969324,3.5,3.0
6,7,2.414602,1.598702,5.0,5.0
7,8,3.185455,2.215573,4.75,4.5
8,9,1.808123,1.253081,4.214286,4.071429
9,10,0.898622,1.040635,4.1875,3.9375


In [53]:
# visit_df 로드
visit_df = pd.read_csv("../data/VL_csv/tn_visit_area_info_방문지정보_cleaned_E.csv")

# index → VISIT_AREA_ID 매핑 생성
index_to_visit_id = {v: k for k, v in visit_id_map.items()}

# visit_area_index → 실제 ID
result_df['VISIT_AREA_ID'] = result_df['visit_area_index'].map(index_to_visit_id)

# ID → 장소명 매핑 (visit_df에 있는 이름 붙이기)
id_to_name = visit_df.set_index('VISIT_AREA_ID')['VISIT_AREA_NM'].to_dict()
result_df['VISIT_AREA_NM'] = result_df['VISIT_AREA_ID'].map(id_to_name)

# 결과 미리보기
result_df[['VISIT_AREA_ID', 'VISIT_AREA_NM', 'pred_rcmdtn', 'true_rcmdtn']].head(10)

Unnamed: 0,VISIT_AREA_ID,VISIT_AREA_NM,pred_rcmdtn,true_rcmdtn
0,2304300002,맥도날드,4.908982,4.454545
1,2304300003,롯데백화점 동탄점,1.966112,4.0
2,2304300004,동탄 호수 공원,1.870136,4.363636
3,2304300005,명륜 진사갈비 화성 남동탄점,1.863747,4.142857
4,2304300006,브라보 노래연습장,2.407371,4.25
5,2304300007,에코 코인 노래연습장,1.969324,3.0
6,2305010001,마석역 경춘선,1.598702,5.0
7,2305010002,신세계백화점 경기점,2.215573,4.5
8,2304290002,카페아쁠뤼스,1.253081,4.071429
9,2304290003,영등포역,1.040635,3.9375


- 이비인후과 같은 병원 데이터 값
- 00역과 같은 데이터

## 과대평가 or 과소평가된 장소 확인

In [54]:
# 추천의향 기준 예측 과대 평가
over_pred = result_df.copy()
over_pred['rcmdtn_error'] = result_df['pred_rcmdtn'] - result_df['true_rcmdtn']

# 예측이 1점 이상 높았던 방문지
over_pred = over_pred[over_pred['rcmdtn_error'] >= 1.0]

# 장소명 포함 정렬
over_pred = over_pred.sort_values(by='rcmdtn_error', ascending=False)
over_pred[['VISIT_AREA_NM', 'pred_rcmdtn', 'true_rcmdtn', 'rcmdtn_error']].head(10)

Unnamed: 0,VISIT_AREA_NM,pred_rcmdtn,true_rcmdtn,rcmdtn_error
500,광화문,15.980067,3.5,12.480067
1220,수원 화성 팔달문,13.173707,4.05,9.123707
514,커피 인더스트리 본점,12.508801,3.9,8.608801
518,(주) 신세계 니코르 AK 홍대점,12.613537,4.5,8.113537
1232,광야 서울,11.467629,4.428571,7.039058
495,운정 호수 공원,8.360746,4.0,4.360746
646,운정 호수 공원,7.937459,3.75,4.187459
164,사무실,7.700973,4.423077,3.277895
517,연교,6.771996,3.5,3.271996
345,한화리조트 용인 베잔송,7.254701,4.0,3.254701


In [55]:
# 실제 4점 이상인데, 예측은 2.5 이하로 한 경우
under_pred = result_df.copy()
under_pred = under_pred[(under_pred['true_rcmdtn'] >= 4.0) & (under_pred['pred_rcmdtn'] <= 2.5)]

# 오차 계산 및 정렬
under_pred['rcmdtn_error'] = under_pred['pred_rcmdtn'] - under_pred['true_rcmdtn']
under_pred = under_pred.sort_values(by='rcmdtn_error')

under_pred[['VISIT_AREA_NM', 'pred_rcmdtn', 'true_rcmdtn', 'rcmdtn_error']].head(10)

Unnamed: 0,VISIT_AREA_NM,pred_rcmdtn,true_rcmdtn,rcmdtn_error
19,숙소,0.660192,5.0,-4.339808
1009,아멕스 랜드 주변 카트 관광 및 전망대,0.822958,5.0,-4.177042
1298,수서역,0.8468,5.0,-4.1532
1299,동대구역,0.8468,5.0,-4.1532
715,양재 꽃 시장,0.848738,5.0,-4.151262
1226,Ofr 서울,0.910915,5.0,-4.089085
965,카몬 행궁 본점,0.911319,5.0,-4.088681
958,서울 영등포 영등포동 618-501,0.914273,5.0,-4.085727
1317,에버랜드 스카이 크루즈,0.929368,5.0,-4.070632
1316,에버랜드 범퍼카,0.929368,5.0,-4.070632


In [59]:
# 0. visit_df에서 좌표 정보 추출
visit_coords = visit_df.groupby("VISIT_AREA_ID")[["X_COORD", "Y_COORD"]].mean()

# 1. X/Y 좌표를 result_df에 병합
result_df['X'] = result_df['VISIT_AREA_ID'].map(visit_coords['X_COORD'])
result_df['Y'] = result_df['VISIT_AREA_ID'].map(visit_coords['Y_COORD'])

In [63]:
import folium

# 1. 과소 평가된 방문지 필터링
under_pred = result_df[
    (result_df['true_rcmdtn'] >= 4.0) & (result_df['pred_rcmdtn'] >= 4)
].copy()

# 2. NaN 좌표 제거
under_pred = under_pred.dropna(subset=['X', 'Y'])

# 3. 지도 중심
map_center = [under_pred['Y'].mean(), under_pred['X'].mean()]
fmap = folium.Map(location=map_center, zoom_start=12)

# 4. 마커 추가
for _, row in under_pred.iterrows():
    folium.Marker(
        location=[row['Y'], row['X']],
        popup=f"{row['VISIT_AREA_NM']}<br>예측:{row['pred_rcmdtn']:.2f} / 실제:{row['true_rcmdtn']:.2f}",
        icon=folium.Icon(color='red', icon='exclamation-sign')
    ).add_to(fmap)

# 5. 출력
fmap


# 회고

- 모델 성능이 좋지 못함 (샘플데이터)
    - 거리 정보가 제대로 들어가지 못하는 문제가 발생
    - 거리 반영 동선 제공 알고리즘 작성 필요
- 연결관계 및 점수 Score의 상관성의 설명력이 조금 부족함

- 데이터에 X, Y좌표가 잘못된 것들이 존재함