In [132]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import StandardScaler


In [133]:
# Locations of the three input CSV files
move_path = "../data/VL_csv/move_with_new_id_final.csv"
travel_path = "tn_travel_processed.csv"
visit_area_path = "../data/VL_csv/visit_area_with_new_id_final.csv"

# Read the tables in the same order as the paths above
move_df, travel_df, visit_area_df = (
    pd.read_csv(csv_path)
    for csv_path in (move_path, travel_path, visit_area_path)
)

In [134]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

# 2️⃣ Build the visit_area node-feature matrix
# Missing coordinates fall back to the column mean (visit_area_df is updated in place,
# later cells read the filled coordinates).
for coord_col in ('X_COORD', 'Y_COORD'):
    visit_area_df[coord_col] = visit_area_df[coord_col].fillna(visit_area_df[coord_col].mean())

# A missing visit reason becomes its own category (code 0).
visit_area_df['VISIT_CHC_REASON_CD'] = visit_area_df['VISIT_CHC_REASON_CD'].fillna(0)

# Survey scores: missing answers default to 3, then (v - 1) / 4 rescales them.
# NOTE(review): presumably a 1-5 scale mapped onto [0, 1] — confirm against the survey spec.
survey_norm_cols = {
    'DGSTFN': 'DGSTFN_norm',
    'REVISIT_INTENTION': 'REVISIT_norm',
    'RCMDTN_INTENTION': 'RCMDTN_norm',
}
for raw_col in survey_norm_cols:
    visit_area_df[raw_col] = visit_area_df[raw_col].fillna(3)
for raw_col, norm_col in survey_norm_cols.items():
    visit_area_df[norm_col] = (visit_area_df[raw_col] - 1) / 4.0

# One-hot encode the categorical codes
type_onehot = pd.get_dummies(visit_area_df['VISIT_AREA_TYPE_CD'], prefix='type')
reason_onehot = pd.get_dummies(visit_area_df['VISIT_CHC_REASON_CD'], prefix='reason')

# Column order: coordinates, type one-hots, reason one-hots, normalised survey scores
features = pd.concat(
    [
        visit_area_df[['X_COORD', 'Y_COORD']].copy(),
        type_onehot,
        reason_onehot,
        visit_area_df[['DGSTFN_norm', 'REVISIT_norm', 'RCMDTN_norm']],
    ],
    axis=1,
)

# Standardise every feature column (one-hot columns included)
scaler = StandardScaler()
feature_matrix = features.to_numpy(dtype=np.float32)
visit_area_tensor = scaler.fit_transform(feature_matrix)

# 3️⃣ Build edge_index / edge_attr
# Within each trip (TRAVEL_ID), consecutive moves ordered by TRIP_ID are linked:
# destination of the previous move -> destination of the current move.
# NOTE(review): END_NEW_ID values are used directly as node indices below —
# confirm they coincide with row positions in visit_area_df.
has_duration = "DURATION_MINUTES" in move_df.columns
edges = []
for _, trips in move_df.groupby("TRAVEL_ID"):
    trips = trips.sort_values("TRIP_ID")
    dest_ids = trips["END_NEW_ID"].to_numpy()
    transports = trips["MVMN_CD_1"].to_numpy()
    durations = trips["DURATION_MINUTES"].to_numpy() if has_duration else [0] * len(trips)
    # The duration and transport code of an edge come from the *current* (second) move
    for src, dst, minutes, code in zip(dest_ids[:-1], dest_ids[1:], durations[1:], transports[1:]):
        if pd.notna(src) and pd.notna(dst):
            edges.append([int(src), int(dst), minutes, code])

edges_df = pd.DataFrame(edges, columns=["FROM_ID", "TO_ID", "DURATION_MINUTES", "MVMN_CD_1"])
endpoint_pairs = edges_df[["FROM_ID", "TO_ID"]].to_numpy()
edge_index = torch.tensor(endpoint_pairs.T, dtype=torch.long)

# Edge attribute 0: travel time in minutes (missing -> 0)
duration_attr = edges_df[["DURATION_MINUTES"]].fillna(0).astype(np.float32).to_numpy()

# Collapse the transport code into three coarse groups; anything unlisted is "other"
drive_codes = [1, 2, 3]
public_codes = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 50]
edges_df["MVMN_TYPE"] = edges_df["MVMN_CD_1"].apply(
    lambda code: "drive" if code in drive_codes else "public" if code in public_codes else "other"
)
edges_df = edges_df.assign(
    is_drive=(edges_df["MVMN_TYPE"] == "drive").astype(int),
    is_public=(edges_df["MVMN_TYPE"] == "public").astype(int),
    is_other=(edges_df["MVMN_TYPE"] == "other").astype(int),
)

# Final edge attributes: [duration, is_drive, is_public, is_other]
transport_flags = edges_df[["is_drive", "is_public", "is_other"]].to_numpy()
edge_attr = torch.tensor(np.hstack([duration_attr, transport_flags]), dtype=torch.float32)

# 4️⃣ Build travel_tensor: every travel_df column except identifiers, missing -> 0
excluded_cols = ['Unnamed: 0', 'TRAVEL_ID', 'TRAVELER_ID']
travel_feature_cols = [name for name in travel_df.columns if name not in excluded_cols]
travel_tensor = travel_df[travel_feature_cols].fillna(0).astype(np.float32).to_numpy()

# 5️⃣ Persist everything as .npy files
for out_path, array in (
    ("../data/VL_csv/visit_area_tensor.npy", visit_area_tensor),
    ("../data/VL_csv/edge_index.npy", edge_index.numpy()),
    ("../data/VL_csv/edge_attr.npy", edge_attr.numpy()),
    ("../data/VL_csv/travel_tensor.npy", travel_tensor),
):
    np.save(out_path, array)

print("✅ GNN 학습용 전처리 데이터 저장 완료!")


✅ GNN 학습용 전처리 데이터 저장 완료!


In [135]:
import numpy as np
import torch
from torch_geometric.data import HeteroData

# 1️⃣ Reload the arrays saved by the preprocessing cell
visit_area_tensor, edge_index, edge_attr, travel_tensor = map(np.load, (
    "../data/VL_csv/visit_area_tensor.npy",
    "../data/VL_csv/edge_index.npy",
    "../data/VL_csv/edge_attr.npy",
    "../data/VL_csv/travel_tensor.npy",
))

# 2️⃣ Assemble the graph: one node type ('visit_area') and one edge type ('moved_to')
data = HeteroData()
data['visit_area'].x = torch.tensor(visit_area_tensor, dtype=torch.float32)
data[('visit_area', 'moved_to', 'visit_area')].edge_index = torch.tensor(
    edge_index, dtype=torch.long
)
data[('visit_area', 'moved_to', 'visit_area')].edge_attr = torch.tensor(
    edge_attr, dtype=torch.float32
)


In [136]:
# Display the assembled HeteroData object to check the node/edge tensor shapes
data

HeteroData(
  visit_area={ x=[21384, 34] },
  (visit_area, moved_to, visit_area)={
    edge_index=[2, 16232],
    edge_attr=[16232, 4],
  }
)

In [137]:
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.nn import SAGEConv
import random

class TravelGNN(nn.Module):
    """Three-layer GraphSAGE encoder over the visit-area graph, conditioned on a travel context.

    Node features are passed through three ``SAGEConv`` layers, the single
    travel-context row is appended to every node embedding, the result goes
    through a two-layer MLP and is finally projected to ``target_dim`` values
    per node.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of the hidden GraphSAGE / MLP layers.
        out_channels: Width of the node embedding before the final projection.
        travel_context_dim: Width of the travel-context row.
        target_dim: Width of the per-node output. Defaults to 34, the value
            that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, travel_context_dim,
                 target_dim=34):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, out_channels)

        # MLP that mixes each node embedding with the travel context.
        # (Despite the attribute name there is no skip connection here.)
        self.residual = nn.Sequential(
            nn.Linear(out_channels + travel_context_dim, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, out_channels)
        )

        # 🚨 The output width must match the training target's width
        self.lin_out = nn.Linear(out_channels, target_dim)

    def forward(self, data, travel_context):
        """Return one ``target_dim``-wide vector per visit-area node.

        Args:
            data: Graph object exposing ``data['visit_area'].x`` and
                ``data['visit_area', 'moved_to', 'visit_area'].edge_index``.
            travel_context: Tensor of shape ``(1, travel_context_dim)``. It is
                broadcast to every node with ``expand``, which only works on
                a leading dimension of size 1.

        Returns:
            Tensor of shape ``(num_nodes, target_dim)``.
        """
        x, edge_index = data['visit_area'].x, data['visit_area', 'moved_to', 'visit_area'].edge_index
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        x = self.conv3(x, edge_index)
        # Repeat the single context row for every node (a view, no copy)
        travel_context_expanded = travel_context.expand(x.size(0), -1)
        x = torch.cat([x, travel_context_expanded], dim=1)
        x = self.residual(x)
        return self.lin_out(x)


In [None]:
# Travel request -> model context features
def process_travel_input(travel_info:dict):
    """Convert a raw travel-request dict into a ``(1, 21)`` float32 feature row.

    The caller's dict is NOT modified, so the same request can be processed
    repeatedly.

    Args:
        travel_info: Request dict. Keys read:
            ``'mission_ENC'``  comma-separated codes; ``'0'`` means travelling
                               with a pet, ``'1'``..``'9'`` select travel purposes.
            ``'date_range'``   ``'YYYY-MM-DD - YYYY-MM-DD'``.
            ``'MVMN_NM_ENC'``  ``'1'`` own car, ``'2'`` public transport,
                               anything else counts as "other".
            ``'whowith_ENC'``  ``1``..``5``: solo, couple, family,
                               friends/acquaintances, other. Values outside
                               that range leave all five flags at 0.
            ``'TOTAL_COST'``   string whose last character is the cost bin.

    Returns:
        ``numpy.ndarray`` of shape ``(1, 21)`` and dtype float32, with columns
        in the order of ``travel_feature_cols`` below.

    Raises:
        KeyError: If a required key is missing.
        ValueError: If a date or numeric field cannot be parsed.
    """
    from datetime import datetime

    # Output column order
    travel_feature_cols = [
        'TOTAL_COST_BINNED_ENCODED',
        'WITH_PET',
        'MONTH',
        'DURATION',
        'MVMN_기타',
        'MVMN_대중교통',
        'MVMN_자가용',
        'TRAVEL_PURPOSE_1',
        'TRAVEL_PURPOSE_2',
        'TRAVEL_PURPOSE_3',
        'TRAVEL_PURPOSE_4',
        'TRAVEL_PURPOSE_5',
        'TRAVEL_PURPOSE_6',
        'TRAVEL_PURPOSE_7',
        'TRAVEL_PURPOSE_8',
        'TRAVEL_PURPOSE_9',
        'WHOWITH_2인여행',
        'WHOWITH_가족여행',
        'WHOWITH_기타',
        'WHOWITH_단독여행',
        'WHOWITH_친구/지인 여행']

    # Derived values go into a separate dict; writing them back into
    # travel_info (as before) broke any second call on the same request.
    row = {}

    # Strip every token so inputs such as '0, 1' are parsed like '0,1'.
    mission_codes = {code.strip() for code in travel_info['mission_ENC'].split(',')}

    # Code '0' in mission_ENC = travelling with a pet (WITH_PET)
    row['WITH_PET'] = 1 if '0' in mission_codes else 0

    # Codes '1'..'9' map to TRAVEL_PURPOSE_1 .. TRAVEL_PURPOSE_9
    for i in range(1, 10):
        row[f'TRAVEL_PURPOSE_{i}'] = 1 if str(i) in mission_codes else 0

    # MONTH (taken from the end date) and DURATION (whole days between the dates)
    dates = travel_info['date_range'].split(' - ')
    start_date = datetime.strptime(dates[0].strip(), "%Y-%m-%d")
    end_date = datetime.strptime(dates[1].strip(), "%Y-%m-%d")
    row['MONTH'] = end_date.month
    row['DURATION'] = (end_date - start_date).days

    # Transport one-hot: MVMN_자가용 / MVMN_대중교통 / MVMN_기타
    transport_code = travel_info['MVMN_NM_ENC']
    row['MVMN_자가용'] = transport_code == '1'
    row['MVMN_대중교통'] = transport_code == '2'
    row['MVMN_기타'] = transport_code not in ('1', '2')

    # Companion one-hot; the incoming number follows this order
    whowith_cols = [
        'WHOWITH_단독여행',
        'WHOWITH_2인여행',
        'WHOWITH_가족여행',
        'WHOWITH_친구/지인 여행',
        'WHOWITH_기타',
    ]
    idx = int(travel_info['whowith_ENC']) - 1
    for position, col in enumerate(whowith_cols):
        row[col] = 1 if position == idx else 0

    # TOTAL_COST_BINNED_ENCODED = last character of TOTAL_COST
    # NOTE(review): presumably the bin index is a trailing digit — confirm with the caller.
    row['TOTAL_COST_BINNED_ENCODED'] = travel_info['TOTAL_COST'][-1]

    # Emit the columns in the fixed order
    return np.array([[int(row[col]) for col in travel_feature_cols]], dtype=np.float32)

In [None]:
# Sample travel request, converted to a context tensor for the model
test_travel = dict(
    mission_ENC='0,1',
    date_range='2025-09-28 - 2025-09-29',
    start_date='',
    end_date='',
    TOTAL_COST='1',
    MVMN_NM_ENC='2',
    whowith_ENC='1',
    mission_type='normal',
)

# Use the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

test_travel_tensor = torch.tensor(
    process_travel_input(test_travel), dtype=torch.float32
).to(device)

In [None]:
# Training data
data = data.to(device)

# Example: train with the first travel record only as the context
travel_context_tensor = torch.tensor(travel_tensor[0:1], dtype=torch.float32).to(device)
target = data['visit_area'].x  # e.g. reconstruct the visit-area features themselves

# Model declaration.
# Sizes are read from the tensors that are actually fed to the model during
# training, instead of a hard-coded 34 and the inference-only sample request.
# The inference context must have the same width as travel_context_tensor.
model = TravelGNN(
    in_channels=target.size(1),
    hidden_channels=64,
    out_channels=64,
    travel_context_dim=travel_context_tensor.size(1),
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()  # e.g. feature reconstruction

model.train()  # training mode never changes inside the loop
for epoch in range(1, 301):
    optimizer.zero_grad()
    out = model(data, travel_context_tensor)
    loss = criterion(out, target)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 50, Loss: 0.6782
Epoch 100, Loss: 0.3395
Epoch 150, Loss: 0.1440
Epoch 200, Loss: 0.0609
Epoch 250, Loss: 0.0397
Epoch 300, Loss: 0.0284


In [140]:
import numpy as np
import torch
import random

# Preprocess the travel request
test_travel = {
    'mission_ENC': '0,1',
    'date_range': '2025-09-28 - 2025-09-29',
    'start_date': '',
    'end_date': '',
    'TOTAL_COST': '1',
    'MVMN_NM_ENC': '2',
    'whowith_ENC': '1',
    'mission_type': 'normal'
}
test_travel_tensor = process_travel_input(test_travel)
test_travel_tensor = torch.tensor(test_travel_tensor, dtype=torch.float32).to(device)

# GNN inference
model.eval()
with torch.no_grad():
    predicted_visit_area_embeddings = model(data, test_travel_tensor)

# ✅ Top-10 recommendation, deduplicated by NEW_VISIT_AREA_ID.
# Nodes are ranked by the L2 norm of their predicted vector (largest first).
scores = predicted_visit_area_embeddings.norm(dim=1)
top_indices_ranked = torch.topk(scores, k=len(scores)).indices.tolist()
used_ids = set()
unique_topk_list = []
for idx in top_indices_ranked:
    candidate = visit_area_df.iloc[idx]
    cand_id = candidate["NEW_VISIT_AREA_ID"]
    if cand_id not in used_ids:
        unique_topk_list.append([
            cand_id, candidate["VISIT_AREA_NM"],
            candidate["X_COORD"], candidate["Y_COORD"]
        ])
        used_ids.add(cand_id)
    if len(unique_topk_list) == 10:
        break

print("✅ 초기 추천 top-10 (중복 제거됨):")
for i, (area_id, area_nm, _, _) in enumerate(unique_topk_list, 1):
    print(f"{i}. {area_id} | {area_nm}")

# Randomly mark some places as disliked.
# random.sample raises ValueError when k exceeds the population, so cap k.
num_dislike = min(3, len(unique_topk_list))
disliked_area_ids = random.sample([area_id for area_id, _, _, _ in unique_topk_list], k=num_dislike)
print("\n🚫 랜덤으로 선택된 싫어요 장소 ID:", disliked_area_ids)

# Replacement recommendation: swap each disliked place for the unused place
# closest to its neighbours in the list.
final_topk_list = unique_topk_list.copy()
# Disliked ids are in this set too, so they can never be picked as a replacement.
used_area_ids = set(area_id for area_id, _, _, _ in final_topk_list)

for disliked_id in disliked_area_ids:
    dislike_loc = [area_id for area_id, _, _, _ in final_topk_list].index(disliked_id)
    last_loc = len(final_topk_list) - 1
    mode = "start" if dislike_loc == 0 else "end" if dislike_loc == last_loc else "middle"

    # Coordinates of whichever neighbours exist: previous first, then next.
    # A one-item list has no neighbour at all; indexing dislike_loc+1 would raise.
    neighbour_coords = []
    if dislike_loc > 0:
        neighbour_coords.append(np.array(final_topk_list[dislike_loc-1][2:4]))
    if dislike_loc < last_loc:
        neighbour_coords.append(np.array(final_topk_list[dislike_loc+1][2:4]))

    best_replacement_id, best_replacement_nm, best_coords = None, None, None
    min_total_dist = float('inf')

    # Candidates are scanned in rank order, so ties go to the higher-ranked place.
    for idx in top_indices_ranked:
        candidate = visit_area_df.iloc[idx]
        cand_coords = [candidate["X_COORD"], candidate["Y_COORD"]]
        cand_id = candidate["NEW_VISIT_AREA_ID"]
        if cand_id in used_area_ids:
            continue

        # Sum of the distances to the existing neighbours.
        # With no neighbour this is 0, so the best-ranked unused place is taken.
        cand_xy = np.array(cand_coords)
        total_dist = sum(np.linalg.norm(cand_xy - nb_xy) for nb_xy in neighbour_coords)

        if total_dist < min_total_dist:
            min_total_dist = total_dist
            best_replacement_id = cand_id
            best_replacement_nm = candidate["VISIT_AREA_NM"]
            best_coords = cand_coords

        # Nothing can beat a distance of zero
        if min_total_dist == 0:
            break

    if best_replacement_id is None:
        # Every place is already used: drop the disliked place instead of crashing.
        print(f"⚠️ 싫어요 장소 {disliked_id} ({mode}) → 대체 후보 없음, 목록에서 제외")
        final_topk_list.pop(dislike_loc)
        continue

    print(f"🌀 싫어요 장소 {disliked_id} ({mode}) → 대체 추천 {best_replacement_id} | {best_replacement_nm}")
    final_topk_list[dislike_loc] = [best_replacement_id, best_replacement_nm, best_coords[0], best_coords[1]]
    used_area_ids.add(best_replacement_id)

print("\n✨ 최종 대체 + 중복 제거된 top-10 추천 (완성):")
for idx, (area_id, area_nm, _, _) in enumerate(final_topk_list, 1):
    print(f"{idx}. {area_id} | {area_nm}")


✅ 초기 추천 top-10 (중복 제거됨):
1. 184 | 제주 국제공항
2. 9262 | 개항장 문화마당
3. 869 | 현대 프리미엄 아울렛 송도점
4. 3619 | 잠실종합운동장 올림픽 주경기장
5. 182 | 서울 장미축제
6. 4769 | 부천국제판타스틱영화제
7. 3189 | 서가 앤 쿡 KTX 서울 역사점
8. 3063 | 정릉동 교통광장
9. 8701 | 용인 어린이 상상의 숲
10. 7570 | 성암아트센터

🚫 랜덤으로 선택된 싫어요 장소 ID: [3063, 3619, 184]
🌀 싫어요 장소 3063 (middle) → 대체 추천 3196 | 배다리 생태공원
🌀 싫어요 장소 3619 (middle) → 대체 추천 1785 | 크리스 월드
🌀 싫어요 장소 184 (start) → 대체 추천 8896 | 차이나타운점

✨ 최종 대체 + 중복 제거된 top-10 추천 (완성):
1. 8896 | 차이나타운점
2. 9262 | 개항장 문화마당
3. 869 | 현대 프리미엄 아울렛 송도점
4. 1785 | 크리스 월드
5. 182 | 서울 장미축제
6. 4769 | 부천국제판타스틱영화제
7. 3189 | 서가 앤 쿡 KTX 서울 역사점
8. 3196 | 배다리 생태공원
9. 8701 | 용인 어린이 상상의 숲
10. 7570 | 성암아트센터


In [144]:
import numpy as np

# ✅ Cluster the recommended places into days (e.g. a 2-day trip)
# Column 3 of the travel-context row is 'DURATION' in process_travel_input's column order.
travel_duration = int(test_travel_tensor[0, 3].item())
k = min((travel_duration + 1), len(final_topk_list))
from sklearn.cluster import KMeans

# Collect the predicted vector of every place in the final list.
# The index label of the first matching visit_area_df row selects the embedding row.
topk_rows = []
for area_id, _, _, _ in final_topk_list:
    same_area = visit_area_df[visit_area_df["NEW_VISIT_AREA_ID"] == area_id]
    topk_rows.append(same_area.index[0])
embeddings_topk = predicted_visit_area_embeddings[topk_rows].cpu().numpy()

kmeans = KMeans(n_clusters=k, random_state=42)
day_labels = kmeans.fit_predict(embeddings_topk)

# ✅ Group the places by cluster label; cluster i is shown as Day i+1
day_groups = {
    day: [place for place, label in zip(final_topk_list, day_labels) if label == day]
    for day in range(k)
}

# Cluster centres in embedding space
centroids = kmeans.cluster_centers_

# Link Day 1 to Day 2: start Day 2 at the place nearest to Day 1's last place
if k >= 2:
    day1_last_coords = day_groups[0][-1][2:4]
    day1_end_xy = np.array(day1_last_coords)
    best_idx, min_dist = -1, float('inf')
    for pos, place in enumerate(day_groups[1]):
        dist = np.linalg.norm(day1_end_xy - np.array(place[2:4]))
        if dist < min_dist:
            min_dist, best_idx = dist, pos
    day2_first = day_groups[1][best_idx]
    day2_rest = [place for pos, place in enumerate(day_groups[1]) if pos != best_idx]
    day_groups[1] = [day2_first] + day2_rest

# ✅ Final output, flagging places that lie far from their day's centre
print(f"\n🌟 최종 여행 일정 (총 {k}일, Day1→Day2 연결 최적화 + 거리기반 필터링):")
for day, places in day_groups.items():
    # X/Y coordinates of this day's places and their mean point
    cluster_points_coords = np.array([place[2:4] for place in places])
    centroid_coords = cluster_points_coords.mean(axis=0)

    # Distance of each place to the day's mean point; threshold = mean + one std
    dists = np.linalg.norm(cluster_points_coords - centroid_coords, axis=1)
    avg_dist = np.mean(dists)
    dist_threshold = avg_dist+np.std(dists)

    print(f"\n📅 Day {day+1} (임계값 {dist_threshold:.2f}):")
    for (area_id, area_nm, x, y), dist in zip(places, dists):
        if dist <= dist_threshold:
            line = f" - {area_id} | {area_nm} (거리 {dist:.2f})"
        else:
            line = f" ❌ {area_id} | {area_nm} (거리 {dist:.2f} > {dist_threshold:.2f} → 제외됨)"
        print(line)



🌟 최종 여행 일정 (총 2일, Day1→Day2 연결 최적화 + 거리기반 필터링):

📅 Day 1 (임계값 2.18):
 - 8896 | 차이나타운점 (거리 0.92)
 - 869 | 현대 프리미엄 아울렛 송도점 (거리 0.82)
 - 1785 | 크리스 월드 (거리 0.94)
 ❌ 3189 | 서가 앤 쿡 KTX 서울 역사점 (거리 3.07 > 2.18 → 제외됨)
 - 3196 | 배다리 생태공원 (거리 0.55)

📅 Day 2 (임계값 0.29):
 - 8701 | 용인 어린이 상상의 숲 (거리 0.27)
 ❌ 9262 | 개항장 문화마당 (거리 0.36 > 0.29 → 제외됨)
 - 182 | 서울 장미축제 (거리 0.06)
 - 4769 | 부천국제판타스틱영화제 (거리 0.06)
 - 7570 | 성암아트센터 (거리 0.10)
