In [32]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import StandardScaler


In [33]:
# 주요 파일 경로
move_path = "../data/VL_csv/move_with_new_id_final.csv"
travel_path = "tn_travel_processed.csv"
visit_area_path = "../data/VL_csv/visit_area_with_new_id_final.csv"

# 데이터 로딩
move_df = pd.read_csv(move_path)
travel_df = pd.read_csv(travel_path)
visit_area_df = pd.read_csv(visit_area_path)

In [34]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

# 2️⃣ visit_area feature 생성
visit_area_df['X_COORD'] = visit_area_df['X_COORD'].fillna(visit_area_df['X_COORD'].mean())
visit_area_df['Y_COORD'] = visit_area_df['Y_COORD'].fillna(visit_area_df['Y_COORD'].mean())
visit_area_df['VISIT_CHC_REASON_CD'] = visit_area_df['VISIT_CHC_REASON_CD'].fillna(0)
for col in ['DGSTFN', 'REVISIT_INTENTION', 'RCMDTN_INTENTION']:
    visit_area_df[col] = visit_area_df[col].fillna(3)

features = visit_area_df[['X_COORD', 'Y_COORD']].copy()
type_onehot = pd.get_dummies(visit_area_df['VISIT_AREA_TYPE_CD'], prefix='type')
reason_onehot = pd.get_dummies(visit_area_df['VISIT_CHC_REASON_CD'], prefix='reason')
visit_area_df['DGSTFN_norm'] = (visit_area_df['DGSTFN'] - 1) / 4.0
visit_area_df['REVISIT_norm'] = (visit_area_df['REVISIT_INTENTION'] - 1) / 4.0
visit_area_df['RCMDTN_norm'] = (visit_area_df['RCMDTN_INTENTION'] - 1) / 4.0
features = pd.concat([features, type_onehot, reason_onehot,
                      visit_area_df[['DGSTFN_norm', 'REVISIT_norm', 'RCMDTN_norm']]], axis=1)
scaler = StandardScaler()
visit_area_tensor = scaler.fit_transform(features.to_numpy(dtype=np.float32))

# 3️⃣ edge_index, edge_attr 생성
edges = []
for travel_id, group in move_df.groupby("TRAVEL_ID"):
    group = group.sort_values("TRIP_ID").reset_index(drop=True)
    for i in range(1, len(group)):
        from_id = group.loc[i-1, "END_NEW_ID"]
        to_id = group.loc[i, "END_NEW_ID"]
        duration = group.loc[i, "DURATION_MINUTES"] if "DURATION_MINUTES" in group.columns else 0
        transport = group.loc[i, "MVMN_CD_1"]
        if pd.notna(from_id) and pd.notna(to_id):
            edges.append([int(from_id), int(to_id), duration, transport])

edges_df = pd.DataFrame(edges, columns=["FROM_ID", "TO_ID", "DURATION_MINUTES", "MVMN_CD_1"])
edge_index = torch.tensor(edges_df[["FROM_ID", "TO_ID"]].to_numpy().T, dtype=torch.long)
edge_attr = edges_df[["DURATION_MINUTES"]].fillna(0).astype(np.float32).to_numpy()
edges_df["MVMN_TYPE"] = edges_df["MVMN_CD_1"].apply(lambda code: "drive" if code in [1,2,3] else "public" if code in [4,5,6,7,8,9,10,11,12,13,50] else "other")
edges_df["is_drive"] = (edges_df["MVMN_TYPE"] == "drive").astype(int)
edges_df["is_public"] = (edges_df["MVMN_TYPE"] == "public").astype(int)
edges_df["is_other"] = (edges_df["MVMN_TYPE"] == "other").astype(int)
edge_attr = torch.tensor(np.hstack([edge_attr, edges_df[["is_drive", "is_public", "is_other"]].to_numpy()]), dtype=torch.float32)

# 4️⃣ travel_tensor 생성
excluded_cols = ['Unnamed: 0', 'TRAVEL_ID', 'TRAVELER_ID']
travel_feature_cols = [col for col in travel_df.columns if col not in excluded_cols]
travel_tensor = travel_df[travel_feature_cols].fillna(0).astype(np.float32).to_numpy()

# 5️⃣ 최종 numpy 저장
np.save("../data/VL_csv/visit_area_tensor.npy", visit_area_tensor)
np.save("../data/VL_csv/edge_index.npy", edge_index.numpy())
np.save("../data/VL_csv/edge_attr.npy", edge_attr.numpy())
np.save("../data/VL_csv/travel_tensor.npy", travel_tensor)

print("✅ GNN 학습용 전처리 데이터 저장 완료!")


✅ GNN 학습용 전처리 데이터 저장 완료!


In [35]:
import numpy as np
import torch
from torch_geometric.data import HeteroData

# 1️⃣ numpy로부터 데이터 로드
visit_area_tensor = np.load("../data/VL_csv/visit_area_tensor.npy")
edge_index = np.load("../data/VL_csv/edge_index.npy")
edge_attr = np.load("../data/VL_csv/edge_attr.npy")
travel_tensor = np.load("../data/VL_csv/travel_tensor.npy")

# 2️⃣ HeteroData 생성
data = HeteroData()
data['visit_area'].x = torch.tensor(visit_area_tensor, dtype=torch.float32)
data['visit_area', 'moved_to', 'visit_area'].edge_index = torch.tensor(edge_index, dtype=torch.long)
data['visit_area', 'moved_to', 'visit_area'].edge_attr = torch.tensor(edge_attr, dtype=torch.float32)


In [36]:
data

HeteroData(
  visit_area={ x=[21384, 34] },
  (visit_area, moved_to, visit_area)={
    edge_index=[2, 16232],
    edge_attr=[16232, 4],
  }
)

In [37]:
import torch.nn as nn
from torch_geometric.nn import SAGEConv

class TravelGNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        self.lin_out = nn.Linear(out_channels + travel_tensor.shape[1], out_channels)  # travel context concat

    def forward(self, data, travel_context):
        x, edge_index, edge_attr = data['visit_area'].x, data['visit_area', 'moved_to', 'visit_area'].edge_index, data['visit_area', 'moved_to', 'visit_area'].edge_attr
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        # travel context를 모든 visit_area에 concat (broadcast)
        travel_context_expanded = travel_context.expand(x.size(0), -1)
        x = torch.cat([x, travel_context_expanded], dim=1)
        return self.lin_out(x)

In [38]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# 모델 선언 부분
model = TravelGNN(in_channels=visit_area_tensor.shape[1], hidden_channels=64, out_channels=34).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()  # 예: feature reconstruction

data = data.to(device)

# 예시: travel_tensor 중 첫 여행정보로 학습
travel_context_tensor = torch.tensor(travel_tensor[0:1], dtype=torch.float32).to(device)
target = data['visit_area'].x  # 예: 방문지 feature 자체 복원

for epoch in range(1, 301):
    model.train()
    optimizer.zero_grad()
    out = model(data, travel_context_tensor)
    loss = criterion(out, target)
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 50, Loss: 0.6666
Epoch 100, Loss: 0.3558
Epoch 150, Loss: 0.1726
Epoch 200, Loss: 0.0781
Epoch 250, Loss: 0.0393
Epoch 300, Loss: 0.0267


In [39]:
# 여행 정보
def process_travel_input(travel_info:dict):
    from datetime import datetime
    travel_feature_cols = [
        'TOTAL_COST_BINNED_ENCODED',
        'WITH_PET',
        'MONTH',
        'DURATION',
        'MVMN_기타',
        'MVMN_대중교통',
        'MVMN_자가용',
        'TRAVEL_PURPOSE_1',
        'TRAVEL_PURPOSE_2',
        'TRAVEL_PURPOSE_3',
        'TRAVEL_PURPOSE_4',
        'TRAVEL_PURPOSE_5',
        'TRAVEL_PURPOSE_6',
        'TRAVEL_PURPOSE_7',
        'TRAVEL_PURPOSE_8',
        'TRAVEL_PURPOSE_9',
        'WHOWITH_2인여행',
        'WHOWITH_가족여행',
        'WHOWITH_기타',
        'WHOWITH_단독여행',
        'WHOWITH_친구/지인 여행']
    
    
    # mission_ENC에 0 = 반려동물 동반 (WITH_PET)
    travel_info['mission_ENC'] = travel_info['mission_ENC'].strip().split(',')
    if '0' in travel_info['mission_ENC']:
        travel_info['WITH_PET'] = 1
    else:
        travel_info['WITH_PET'] = 0
        
    # TRAVEL_PURPOSE_1 ~~ TRAVEL_PURPOSE_9 (0으로 들어온 입력은 제거해줘야됨) 
    for i in range(1,10):
        if str(i) in travel_info['mission_ENC']:
            travel_info[f'TRAVEL_PURPOSE_{i}'] = 1
        else:
            travel_info[f'TRAVEL_PURPOSE_{i}'] = 0
        
    # MONTH
    dates = travel_info['date_range'].split(' - ')
    travel_info['start_date'] = datetime.strptime(dates[0].strip(), "%Y-%m-%d")
    travel_info['end_date'] = datetime.strptime(dates[1].strip(), "%Y-%m-%d")
    
    travel_info['MONTH'] = travel_info['end_date'].month
    
    # DURATION
    travel_info['DURATION'] = (travel_info['end_date'] - travel_info['start_date']).days
    
    # MNVM_기타, MVMN_대중교통, MVMN_자가용
    for m in ['자가용', '대중교통', '기타']:
        travel_info[f"MVMN_{m}"] = False
    
    if travel_info['MVMN_NM_ENC'] == '1':
        travel_info['MVMN_자가용'] = True
    elif travel_info['MVMN_NM_ENC'] == '2':
        travel_info['MVMN_대중교통'] = True
    else:
        travel_info['MVMN_기타'] = True
    
    # WHOWITH는 1부터 5까지 숫자로 들어옴 -> 원핫 인코딩으로 수정할 것
    # dict에 들어오는 숫자 의미: WHOWITH_단독여행, WHOWITH_2인여행, WHOWITH_가족여행, WHOWITH_친구/지인여행, WHOWITH_기타
    whowith_onehot = [0] * 5
    idx = int(travel_info['whowith_ENC']) - 1
    if 0 <= idx < 5:
        whowith_onehot[idx] = 1
    
    travel_info.update({
    'WHOWITH_단독여행': whowith_onehot[0],
    'WHOWITH_2인여행': whowith_onehot[1],
    'WHOWITH_가족여행': whowith_onehot[2],
    'WHOWITH_친구/지인 여행': whowith_onehot[3],
    'WHOWITH_기타': whowith_onehot[4],
    })
    
    # TOTAL_COST_BINNED_ENCODED
    travel_info['TOTAL_COST_BINNED_ENCODED'] = travel_info['TOTAL_COST'][-1]
    
    # 컬럼 필터링 (순서에 맞게)
    travel_info = {k: int(travel_info[k]) for k in travel_feature_cols}
    
    return pd.DataFrame([travel_info]).fillna(0).astype(np.float32).to_numpy()

In [40]:
import numpy as np
import torch
import random

# 여행 정보 전처리
test_travel = {
    'mission_ENC': '0,1',
    'date_range': '2025-09-28 - 2025-10-31',
    'start_date': '',
    'end_date': '',
    'TOTAL_COST': '1',
    'MVMN_NM_ENC': '2',
    'whowith_ENC': '1',
    'mission_type': 'normal'
}
test_travel_tensor = process_travel_input(test_travel)
test_travel_tensor = torch.tensor(test_travel_tensor, dtype=torch.float32).to(device)

# GNN 추론
model.eval()
with torch.no_grad():
    predicted_visit_area_embeddings = model(data, test_travel_tensor)  # (N, 34)

# 추천 top-10
scores = predicted_visit_area_embeddings.norm(dim=1)
topk_indices = torch.topk(scores, k=10).indices
topk_recommend = visit_area_df.loc[topk_indices.tolist(), ["NEW_VISIT_AREA_ID", "VISIT_AREA_NM"]]
topk_recommend_list = topk_recommend.values.tolist()

print("✅ 초기 추천 top-10 (ID | 명칭):")
for idx, (area_id, area_nm) in enumerate(topk_recommend_list, 1):
    print(f"{idx}. {area_id} | {area_nm}")

# 랜덤으로 싫어요 표시
num_dislike = 3
disliked_area_ids = random.sample([area_id for area_id, _ in topk_recommend_list], k=num_dislike)
print("\n🚫 랜덤으로 선택된 싫어요 장소 ID:", disliked_area_ids)

# start/middle/end 자동 결정 & 좌표 기반 best index 찾기
def find_best_replacement(disliked_rows, prev_coords, next_coords, mode):
    best_idx = None
    min_distance = float('inf')
    
    for idx, row in disliked_rows.iterrows():
        x = row["X_COORD"]
        y = row["Y_COORD"]
        
        if mode == "start":
            dist_next = np.sqrt((x - next_coords[0])**2 + (y - next_coords[1])**2)
            total_dist = dist_next
        elif mode == "end":
            dist_prev = np.sqrt((x - prev_coords[0])**2 + (y - prev_coords[1])**2)
            total_dist = dist_prev
        else:  # middle
            dist_prev = np.sqrt((x - prev_coords[0])**2 + (y - prev_coords[1])**2)
            dist_next = np.sqrt((x - next_coords[0])**2 + (y - next_coords[1])**2)
            total_dist = dist_prev + dist_next
        
        if total_dist < min_distance:
            min_distance = total_dist
            best_idx = idx
    return best_idx

for disliked_id in disliked_area_ids:
    disliked_rows = visit_area_df[visit_area_df["NEW_VISIT_AREA_ID"] == disliked_id]
    dislike_loc = [area_id for area_id, _ in topk_recommend_list].index(disliked_id)
    
    mode = "start" if dislike_loc == 0 else "end" if dislike_loc == len(topk_indices) - 1 else "middle"
    
    if mode == 'start':        
        prev_coords = disliked_rows[['X_COORD', 'Y_COORD']].values.tolist()[0]
        next_id = topk_recommend_list[dislike_loc+1][0]
        next_coords = visit_area_df[visit_area_df["NEW_VISIT_AREA_ID"] == next_id][['X_COORD', 'Y_COORD']].values.tolist()[0]
    elif mode == 'end':
        prev_id = topk_recommend_list[dislike_loc-1][0]
        prev_coords = visit_area_df[visit_area_df["NEW_VISIT_AREA_ID"] == prev_id][['X_COORD', 'Y_COORD']].values.tolist()[0]
        next_coords = disliked_rows[['X_COORD', 'Y_COORD']].values.tolist()[0]
    else:
        prev_id = topk_recommend_list[dislike_loc-1][0]
        next_id = topk_recommend_list[dislike_loc+1][0]
        prev_coords = visit_area_df[visit_area_df["NEW_VISIT_AREA_ID"] == prev_id][['X_COORD', 'Y_COORD']].values.tolist()[0]
        next_coords = visit_area_df[visit_area_df["NEW_VISIT_AREA_ID"] == next_id][['X_COORD', 'Y_COORD']].values.tolist()[0]

    best_idx = find_best_replacement(disliked_rows, prev_coords, next_coords, mode)
    
    disliked_emb = predicted_visit_area_embeddings[best_idx]
    distances = torch.norm(predicted_visit_area_embeddings - disliked_emb, dim=1)
    distances[best_idx] = 1e9  # 자기 자신 제외
    
    replacement_idx = torch.argmin(distances).item()
    replacement_id = visit_area_df.iloc[replacement_idx]["NEW_VISIT_AREA_ID"]
    replacement_nm = visit_area_df.iloc[replacement_idx]["VISIT_AREA_NM"]
    
    print(f"🌀 싫어요 장소 {disliked_id} ({mode}) → 대체 추천 {replacement_id} | {replacement_nm}")

# 싫어요 반영 후 top-10 추천
disliked_indices = visit_area_df[visit_area_df["NEW_VISIT_AREA_ID"].isin(disliked_area_ids)].index.tolist()
scores_post = scores.clone()
scores_post[disliked_indices] = -1e9  # 제외

topk_indices_post = torch.topk(scores_post, k=10).indices
topk_recommend_post = visit_area_df.loc[topk_indices_post.tolist(), ["NEW_VISIT_AREA_ID", "VISIT_AREA_NM"]]
topk_recommend_post_list = topk_recommend_post.values.tolist()

print("\n✨ 싫어요 반영 후 추천 top-10 (ID | 명칭):")
for idx, (area_id, area_nm) in enumerate(topk_recommend_post_list, 1):
    print(f"{idx}. {area_id} | {area_nm}")


✅ 초기 추천 top-10 (ID | 명칭):
1. 6212 | 로컬 스티치 홍대 2호
2. 4958 | 가톨릭대학교 성심교정
3. 1755 | 젠트리
4. 4270 | 서울 가락 초등학교
5. 1755 | 젠트리
6. 9177 | KDB산업은행 대전지점
7. 344 | 송도센트럴파크
8. 7531 | 킨텍스 제1전시장
9. 9181 | 인천 중구청 본관
10. 9181 | 인천 중구청 본관

🚫 랜덤으로 선택된 싫어요 장소 ID: [9181, 6212, 9177]
🌀 싫어요 장소 9181 (middle) → 대체 추천 9181 | 인천 중구청 본관
🌀 싫어요 장소 6212 (start) → 대체 추천 1755 | 젠트리
🌀 싫어요 장소 9177 (middle) → 대체 추천 1755 | 젠트리

✨ 싫어요 반영 후 추천 top-10 (ID | 명칭):
1. 4958 | 가톨릭대학교 성심교정
2. 1755 | 젠트리
3. 4270 | 서울 가락 초등학교
4. 1755 | 젠트리
5. 344 | 송도센트럴파크
6. 7531 | 킨텍스 제1전시장
7. 2432 | 난지한강공원
8. 777 | 캐리비안베이
9. 2176 | 여의도 물빛무대
10. 1515 | 동국대학교 서울캠퍼스
