In [28]:
import os
import pandas as pd
import numpy as np


import torch
import torch.nn as nn
from torch_geometric.nn import HANConv
from torch_geometric.data import HeteroData

In [29]:
class HAN_TravelRecommender(nn.Module):
    def __init__(self, metadata, hidden_dim=64, out_dim=1):
        super().__init__()
        self.han_conv = HANConv(
            in_channels=-1,
            out_channels=hidden_dim,
            metadata=metadata,
            heads=2
        )
        self.lin = nn.Linear(hidden_dim, out_dim)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.han_conv(x_dict, edge_index_dict)
        return self.lin(x_dict['visit_area'])

# Label값 설계

- 만족도(DGSTFN)와 추천의향(RCMDTN_INTENTION)**을 기반으로 라벨 벡터를 생성

In [30]:
# 1. visit_df 로드 (방문지 정보)
visit_df = pd.read_csv("../data/VL_csv/tn_visit_area_info_방문지정보_Cleaned_E.csv")

# 2. visit_id_map 생성
unique_visit_ids = visit_df['VISIT_AREA_ID'].dropna().unique()
visit_id_map = {id_: i for i, id_ in enumerate(unique_visit_ids)}

# 3. VISIT_AREA_ID별로 DGSTFN, RCMDTN_INTENTION 평균 계산
label_agg = visit_df.groupby('VISIT_AREA_ID')[['DGSTFN', 'RCMDTN_INTENTION']].mean()


In [31]:
# 4. visit_id_map 순서에 맞게 라벨 정렬
visit_id_list = list(visit_id_map.keys())
label_df = label_agg.reindex(visit_id_list).fillna(0)

# 5. (torch 불가 환경에서는 NumPy로 유지)
label_array = label_df.values.astype(np.float32)  # shape = (num_visit_area, 2)

# 6. 예시 출력
print("Label shape:", label_array.shape)
print("Label preview:\n", label_array[:5])

Label shape: (794, 2)
Label preview:
 [[4.6666665 4.6666665]
 [3.6666667 2.6666667]
 [4.6666665 3.3333333]
 [4.        4.       ]
 [5.        4.       ]]


# 데이터 생성

In [32]:
# 경로 설정 (필요 시 로컬 경로로 변경)
data_path = "../data/"

# 1. 노드 feature 로드
user_x = torch.tensor(np.load(os.path.join(data_path, "user_features.npy")), dtype=torch.float)
travel_x = torch.tensor(np.load(os.path.join(data_path, "travel_features.npy")), dtype=torch.float)
visit_x = torch.tensor(np.load(os.path.join(data_path, "visit_features.npy")), dtype=torch.float)

# 2. 엣지 로드
edge_user_to_travel = torch.tensor(np.load(os.path.join(data_path, "edge_user_to_travel.npy")), dtype=torch.long)
edge_travel_to_visit = torch.tensor(np.load(os.path.join(data_path, "edge_travel_to_visit.npy")), dtype=torch.long)

# 3. HeteroData 객체 구성
data = HeteroData()
data['user'].x = user_x
data['travel'].x = travel_x
data['visit_area'].x = visit_x

data[('user', 'traveled', 'travel')].edge_index = edge_user_to_travel
data[('travel', 'contains', 'visit_area')].edge_index = edge_travel_to_visit

# 4. 이동수단별 엣지 자동 로드 및 추가
for fname in os.listdir(data_path):
    if fname.startswith("edge_visit_move_") and fname.endswith(".npy"):
        move_type = fname.replace("edge_visit_move_", "").replace(".npy", "")
        edge_index = torch.tensor(np.load(os.path.join(data_path, fname)), dtype=torch.long)
        data[('visit_area', f'move_{move_type}', 'visit_area')].edge_index = edge_index

print("Metadata:", data.metadata())

Metadata: (['user', 'travel', 'visit_area'], [('user', 'traveled', 'travel'), ('travel', 'contains', 'visit_area'), ('visit_area', 'move_16', 'visit_area'), ('visit_area', 'move_15', 'visit_area'), ('visit_area', 'move_14', 'visit_area'), ('visit_area', 'move_10', 'visit_area'), ('visit_area', 'move_13', 'visit_area'), ('visit_area', 'move_12', 'visit_area'), ('visit_area', 'move_8', 'visit_area'), ('visit_area', 'move_9', 'visit_area'), ('visit_area', 'move_1', 'visit_area'), ('visit_area', 'move_2', 'visit_area'), ('visit_area', 'move_7', 'visit_area'), ('visit_area', 'move_6', 'visit_area'), ('visit_area', 'move_4', 'visit_area'), ('visit_area', 'move_5', 'visit_area')])


# 학습 파이프라인

In [33]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split

# 모델 정의
model = HAN_TravelRecommender(data.metadata(), hidden_dim=64, out_dim=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

# 라벨 텐서와 학습 가능 인덱스
label_tensor = torch.tensor(label_array, dtype=torch.float)
valid_mask = (label_tensor.sum(dim=1) > 0)
valid_idx = torch.arange(label_tensor.shape[0])[valid_mask]

# train/val 분할 (stratify 없음 - 연속값이므로)
train_idx, val_idx = train_test_split(valid_idx.numpy(), test_size=0.2, random_state=42)
train_idx = torch.tensor(train_idx)
val_idx = torch.tensor(val_idx)

# 라벨도 선택
y_train = label_tensor[train_idx]
y_val = label_tensor[val_idx]

# 학습 루프
for epoch in range(1, 101):
    model.train()
    optimizer.zero_grad()

    out = model(data.x_dict, data.edge_index_dict)
    pred_train = out[train_idx]
    loss = criterion(pred_train, y_train)
    loss.backward()
    optimizer.step()

    # 검증
    model.eval()
    with torch.no_grad():
        pred_val = model(data.x_dict, data.edge_index_dict)[val_idx]
        val_loss = criterion(pred_val, y_val)

        # RMSE 계산 (평균)
        rmse = torch.sqrt(F.mse_loss(pred_val, y_val))
        mae = F.l1_loss(pred_val, y_val)

    if epoch % 10 == 0:
        print(f"[Epoch {epoch}] Train Loss: {loss.item():.4f} | Val Loss: {val_loss.item():.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")

[Epoch 10] Train Loss: 17.0917 | Val Loss: 17.3984 | RMSE: 4.1711 | MAE: 4.0457
[Epoch 20] Train Loss: 15.5730 | Val Loss: 15.9609 | RMSE: 3.9951 | MAE: 3.8290
[Epoch 30] Train Loss: 14.4954 | Val Loss: 15.0366 | RMSE: 3.8777 | MAE: 3.5911
[Epoch 40] Train Loss: 13.3033 | Val Loss: 13.6334 | RMSE: 3.6923 | MAE: 3.4508
[Epoch 50] Train Loss: 10.7197 | Val Loss: 10.7859 | RMSE: 3.2842 | MAE: 3.0840
[Epoch 60] Train Loss: 7.0247 | Val Loss: 6.9772 | RMSE: 2.6414 | MAE: 2.4673
[Epoch 70] Train Loss: 3.7267 | Val Loss: 3.6740 | RMSE: 1.9168 | MAE: 1.7318
[Epoch 80] Train Loss: 1.9516 | Val Loss: 1.8773 | RMSE: 1.3701 | MAE: 1.1543
[Epoch 90] Train Loss: 1.5415 | Val Loss: 1.3997 | RMSE: 1.1831 | MAE: 0.9398
[Epoch 100] Train Loss: 1.4171 | Val Loss: 1.2526 | RMSE: 1.1192 | MAE: 0.8730


# 모델 결과값 검증

In [34]:
# 1. 모델 출력
model.eval()
with torch.no_grad():
    out = model(data.x_dict, data.edge_index_dict)  # [num_visit_area, 2]

In [35]:
# 라벨이 있는 index (train + val의 union)
valid_mask = (label_tensor.sum(dim=1) > 0)
valid_idx = torch.arange(label_tensor.shape[0])[valid_mask]

# 모델 예측값 중 해당 인덱스만 추출
pred = out[valid_idx]
true = label_tensor[valid_idx]

# 결과 정리
import pandas as pd
result_df = pd.DataFrame({
    'visit_area_index': valid_idx.cpu().numpy(),
    'pred_dgstfn': pred[:, 0].cpu().numpy(),
    'pred_rcmdtn': pred[:, 1].cpu().numpy(),
    'true_dgstfn': true[:, 0].cpu().numpy(),
    'true_rcmdtn': true[:, 1].cpu().numpy(),
})
result_df.head(10)


Unnamed: 0,visit_area_index,pred_dgstfn,pred_rcmdtn,true_dgstfn,true_rcmdtn
0,0,4.752633,4.674073,4.666667,4.666667
1,1,2.333206,2.140389,3.666667,2.666667
2,2,2.391112,2.222309,4.666667,3.333333
3,3,2.422743,2.265429,4.0,4.0
4,4,2.389711,2.221599,5.0,4.0
5,5,4.728202,4.686278,4.5,4.0
6,6,4.708914,4.665773,3.0,2.0
7,7,3.077544,2.831126,4.0,3.0
8,8,3.343313,3.186547,4.0,4.0
9,9,3.279676,3.101538,2.0,1.0


In [36]:
# visit_df 로드
visit_df = pd.read_csv("../data/VL_csv/tn_visit_area_info_방문지정보_cleaned_E.csv")

# index → VISIT_AREA_ID 매핑 생성
index_to_visit_id = {v: k for k, v in visit_id_map.items()}

# visit_area_index → 실제 ID
result_df['VISIT_AREA_ID'] = result_df['visit_area_index'].map(index_to_visit_id)

# ID → 장소명 매핑 (visit_df에 있는 이름 붙이기)
id_to_name = visit_df.set_index('VISIT_AREA_ID')['VISIT_AREA_NM'].to_dict()
result_df['VISIT_AREA_NM'] = result_df['VISIT_AREA_ID'].map(id_to_name)

# 결과 미리보기
result_df[['VISIT_AREA_ID', 'VISIT_AREA_NM', 'pred_rcmdtn', 'true_rcmdtn']].head(10)

Unnamed: 0,VISIT_AREA_ID,VISIT_AREA_NM,pred_rcmdtn,true_rcmdtn
0,2304290003,이형 무 이비인후과,4.674073,4.666667
1,2304290004,대한 성공회 서울주교좌성당,2.140389,2.666667
2,2304290005,오양 식관,2.222309,3.333333
3,2304300002,연평도동 춘게장,2.265429,4.0
4,2304300003,역전 분식,2.221599,4.0
5,2304300004,가평 브리지 짚라인,4.686278,4.0
6,2304300005,두메 막국수,4.665773,2.0
7,2304300006,가평 잣 고을시장,2.831126,3.0
8,2304300007,자라섬,3.186547,4.0
9,2304300008,남이섬 안 반지 닭갈비 1호점,3.101538,1.0


- 이비인후과 같은 병원 데이터 값
- 00역과 같은 데이터

## 과대평가 or 과소평가된 장소 확인

In [None]:
# 추천의향 기준 예측 과대 평가
over_pred = result_df.copy()
over_pred['rcmdtn_error'] = result_df['pred_rcmdtn'] - result_df['true_rcmdtn']

# 예측이 1점 이상 높았던 방문지
over_pred = over_pred[over_pred['rcmdtn_error'] >= 1.0]

# 장소명 포함 정렬
over_pred = over_pred.sort_values(by='rcmdtn_error', ascending=False)
over_pred[['VISIT_AREA_NM', 'pred_rcmdtn', 'true_rcmdtn', 'rcmdtn_error']].head(10)

Unnamed: 0,VISIT_AREA_NM,pred_rcmdtn,true_rcmdtn,rcmdtn_error
538,펌킨 이태원 클럽,6.623677,2.0,4.623677
296,판타스틱 코인 노래연습장,6.769447,3.0,3.769447
365,초심 삼겹살,5.692024,2.0,3.692024
539,큐브 게스트하우스,7.475237,4.0,3.475237
302,명동역 4호선,6.94966,3.5,3.44966
572,파리에 소,5.284885,2.0,3.284885
594,돌체 빠오라,6.280638,3.0,3.280638
172,분당 정자동 카페골목,6.257611,3.0,3.257611
289,여의도공원,6.12454,3.0,3.12454
301,남산공원 N서울타워,7.093152,4.0,3.093152


In [38]:
# 실제 4점 이상인데, 예측은 2.5 이하로 한 경우
under_pred = result_df.copy()
under_pred = under_pred[(under_pred['true_rcmdtn'] >= 4.0) & (under_pred['pred_rcmdtn'] <= 2.5)]

# 오차 계산 및 정렬
under_pred['rcmdtn_error'] = under_pred['pred_rcmdtn'] - under_pred['true_rcmdtn']
under_pred = under_pred.sort_values(by='rcmdtn_error')

under_pred[['VISIT_AREA_NM', 'pred_rcmdtn', 'true_rcmdtn', 'rcmdtn_error']].head(10)

Unnamed: 0,VISIT_AREA_NM,pred_rcmdtn,true_rcmdtn,rcmdtn_error
25,인천나비공원,2.060138,5.0,-2.939862
17,덕수궁,2.061808,5.0,-2.938192
21,광장시장,2.062459,5.0,-2.937541
180,남산골공원 서울 천년 타임캡슐 광장,2.278071,5.0,-2.721929
187,구읍뱃터,2.346991,5.0,-2.653009
209,서울역,2.368039,5.0,-2.631961
184,그랜마캐비넷,2.393229,5.0,-2.606771
582,JS 아트홀,2.414817,5.0,-2.585183
310,동대구역,2.436176,5.0,-2.563824
482,다이소 스타필드 하남점,1.962434,4.5,-2.537566


In [44]:
import folium

# 1. 과소 평가된 방문지 필터링
under_pred = result_df[
    (result_df['true_rcmdtn'] >= 4.0) & (result_df['pred_rcmdtn'] <= 2.5)
].copy()

# 2. NaN 좌표 제거
under_pred = under_pred.dropna(subset=['X', 'Y'])

# 3. 지도 중심
map_center = [under_pred['Y'].mean(), under_pred['X'].mean()]
fmap = folium.Map(location=map_center, zoom_start=12)

# 4. 마커 추가
for _, row in under_pred.iterrows():
    folium.Marker(
        location=[row['Y'], row['X']],
        popup=f"{row['VISIT_AREA_NM']}<br>예측:{row['pred_rcmdtn']:.2f} / 실제:{row['true_rcmdtn']:.2f}",
        icon=folium.Icon(color='red', icon='exclamation-sign')
    ).add_to(fmap)

# 5. 출력
fmap


# 회고

- 모델 성능이 좋지 못함 (샘플데이터)
    - 거리 정보가 제대로 들어가지 못하는 문제가 발생
    - 거리 반영 동선 제공 알고리즘 작성 필요
- 연결관계 및 점수 Score의 상관성의 설명력이 조금 부족함