In [19]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import StandardScaler


In [20]:
# 주요 파일 경로
move_path = "../data/VL_csv/tn_move_his_이동내역_Cleaned_E.csv"
user_path = "../data/VL_csv/tn_traveller_master_여행객 Master_E_preprocessed.csv"
travel_path = "tn_travel_processed.csv"
visit_area_path = "../data/VL_csv/tn_visit_area_info_방문지정보_Cleaned_E.csv"

# 데이터 로딩
move_df = pd.read_csv(move_path)
user_df = pd.read_csv(user_path)
travel_df = pd.read_csv(travel_path)
visit_area_df = pd.read_csv(visit_area_path)

In [None]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import HeteroData
from torch import nn, optim
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# visit_area feature 전처리
visit_area_df['X_COORD'] = visit_area_df['X_COORD'].fillna(visit_area_df['X_COORD'].mean())
visit_area_df['Y_COORD'] = visit_area_df['Y_COORD'].fillna(visit_area_df['Y_COORD'].mean())
visit_area_df['VISIT_CHC_REASON_CD'] = visit_area_df['VISIT_CHC_REASON_CD'].fillna(0)
for col in ['DGSTFN', 'REVISIT_INTENTION', 'RCMDTN_INTENTION']:
    visit_area_df[col] = visit_area_df[col].fillna(3)

features = visit_area_df[['X_COORD', 'Y_COORD']].copy()
type_onehot = pd.get_dummies(visit_area_df['VISIT_AREA_TYPE_CD'], prefix='type')
reason_onehot = pd.get_dummies(visit_area_df['VISIT_CHC_REASON_CD'], prefix='reason')
visit_area_df['DGSTFN_norm'] = (visit_area_df['DGSTFN'] - 1) / 4.0
visit_area_df['REVISIT_norm'] = (visit_area_df['REVISIT_INTENTION'] - 1) / 4.0
visit_area_df['RCMDTN_norm'] = (visit_area_df['RCMDTN_INTENTION'] - 1) / 4.0
features = pd.concat([features, type_onehot, reason_onehot,
                      visit_area_df[['DGSTFN_norm', 'REVISIT_norm', 'RCMDTN_norm']]], axis=1)

scaler = StandardScaler()
visit_area_tensor = scaler.fit_transform(features.to_numpy(dtype=np.float32))

# travel_context (batch)
excluded_cols = ['Unnamed: 0', 'TRAVEL_ID', 'TRAVELER_ID']
travel_feature_cols = [col for col in travel_df.columns if col not in excluded_cols]
travel_tensor = travel_df[travel_feature_cols].fillna(0).astype(np.float32).to_numpy()  # (2560, 21)
travel_context_tensor = torch.tensor(travel_tensor, dtype=torch.float32)

# edge_index 생성
move_df["START_DT_MIN"] = pd.to_datetime(move_df["START_DT_MIN"], errors="coerce")
move_df["END_DT_MIN"] = pd.to_datetime(move_df["END_DT_MIN"], errors="coerce")
cols = ["TRAVEL_ID", "TRIP_ID", "START_VISIT_AREA_ID", "END_VISIT_AREA_ID",
        "START_DT_MIN", "END_DT_MIN", "MVMN_CD_1"]
move_df = move_df[cols].copy()

edges = []
for travel_id, group in move_df.sort_values(["TRAVEL_ID", "TRIP_ID"]).groupby("TRAVEL_ID"):
    group = group.reset_index(drop=True)
    n = len(group)
    if n < 2:
        continue
    start_time = group.loc[0, "START_DT_MIN"]
    end_time = group.loc[1, "END_DT_MIN"]
    if pd.notna(start_time) and pd.notna(end_time):
        duration = (end_time - start_time).total_seconds() / 60
        from_id = group.loc[0, "START_VISIT_AREA_ID"]
        to_id = group.loc[1, "END_VISIT_AREA_ID"]
        transport = group.loc[1, "MVMN_CD_1"]
        if pd.notna(from_id) and pd.notna(to_id) and pd.notna(transport):
            edges.append([int(from_id), int(to_id), duration, int(transport)])
    for i in range(1, n - 1):
        t1 = group.loc[i, "END_DT_MIN"]
        t2 = group.loc[i + 1, "END_DT_MIN"]
        if pd.notna(t1) and pd.notna(t2):
            duration = (t2 - t1).total_seconds() / 60
            from_id = group.loc[i, "END_VISIT_AREA_ID"]
            to_id = group.loc[i + 1, "END_VISIT_AREA_ID"]
            transport = group.loc[i + 1, "MVMN_CD_1"]
            if pd.notna(from_id) and pd.notna(to_id) and pd.notna(transport):
                edges.append([int(from_id), int(to_id), duration, int(transport)])

edges_df = pd.DataFrame(edges, columns=[
    "FROM_VISIT_AREA_ID", "TO_VISIT_AREA_ID", "DURATION_MINUTES", "MVMN_CD_1"
])

# 방문지 ID → index 매핑
visit_area_id_list = visit_area_df["VISIT_AREA_ID"].tolist()
visit_area_id_to_index = {vid: idx for idx, vid in enumerate(visit_area_id_list)}

edges_df["FROM_IDX"] = edges_df["FROM_VISIT_AREA_ID"].map(visit_area_id_to_index)
edges_df["TO_IDX"] = edges_df["TO_VISIT_AREA_ID"].map(visit_area_id_to_index)

# edge_index / edge_attr
edge_index = torch.tensor([
    edges_df["FROM_IDX"].tolist(),
    edges_df["TO_IDX"].tolist()
], dtype=torch.long)
edge_attr = torch.tensor(
    edges_df[["DURATION_MINUTES"]].fillna(0).astype(np.float32).to_numpy(), dtype=torch.float32
)
# 이동수단 one-hot
edges_df["MVMN_TYPE"] = edges_df["MVMN_CD_1"].apply(lambda code: "drive" if code in [1,2,3] else "public" if code in [4,5,6,7,8,9,10,11,12,13,50] else "other")
edges_df["is_drive"] = (edges_df["MVMN_TYPE"] == "drive").astype(int)
edges_df["is_public"] = (edges_df["MVMN_TYPE"] == "public").astype(int)
edges_df["is_other"] = (edges_df["MVMN_TYPE"] == "other").astype(int)
edge_attr = torch.cat([
    edge_attr,
    torch.tensor(edges_df[["is_drive", "is_public", "is_other"]].to_numpy(), dtype=torch.float32)
], dim=1)

# 5️⃣ HeteroData
data = HeteroData()
data['visit_area'].x = torch.tensor(visit_area_tensor, dtype=torch.float32)
data['visit_area', 'moved_to', 'visit_area'].edge_index = edge_index
data['visit_area', 'moved_to', 'visit_area'].edge_attr = edge_attr

print("✅ HeteroData created successfully!")
print(data)

# 6️GNN 모델
class TravelGNN(nn.Module):
    def __init__(self, visit_area_input_dim=34, travel_context_dim=21, hidden_dim=64):
        super(TravelGNN, self).__init__()
        self.lin_in = nn.Linear(visit_area_input_dim + travel_context_dim, hidden_dim)
        self.conv1 = SAGEConv(hidden_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.lin_out = nn.Linear(hidden_dim, visit_area_input_dim)  # 최종 출력 34차원으로 수정!

    def forward(self, data, travel_context):
        x = data['visit_area'].x  # (N, 34)
        context_expand = travel_context.repeat_interleave(
            repeats=x.size(0) // travel_context.size(0), dim=0
        )
        x = torch.cat([x, context_expand], dim=1)
        x = self.lin_in(x)

        edge_index = data['visit_area', 'moved_to', 'visit_area'].edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)

        out = self.lin_out(x)  # (N, 34)
        return out

# 7️학습 루프
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TravelGNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

data = data.to(device)
travel_context_tensor = travel_context_tensor.to(device)
target = data['visit_area'].x.to(device)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for i in tqdm(range(travel_context_tensor.size(0))):
        travel_context = travel_context_tensor[i:i+1, :]  # (1, 21)

        optimizer.zero_grad()
        out = model(data, travel_context)
        loss = criterion(out, target)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / travel_context_tensor.size(0)
    print(f"[Epoch {epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}")

print("🎉 학습 완료!")


✅ HeteroData created successfully!
HeteroData(
  visit_area={ x=[21384, 34] },
  (visit_area, moved_to, visit_area)={
    edge_index=[2, 18742],
    edge_attr=[18742, 4],
  }
)


100%|██████████| 2560/2560 [01:18<00:00, 32.76it/s]


[Epoch 1/10] Loss: 0.0330


100%|██████████| 2560/2560 [01:24<00:00, 30.44it/s]


[Epoch 2/10] Loss: 0.0014


100%|██████████| 2560/2560 [01:34<00:00, 27.11it/s]


[Epoch 3/10] Loss: 0.0010


100%|██████████| 2560/2560 [01:30<00:00, 28.17it/s]


[Epoch 4/10] Loss: 0.0009


100%|██████████| 2560/2560 [01:16<00:00, 33.51it/s]


[Epoch 5/10] Loss: 0.0007


100%|██████████| 2560/2560 [01:15<00:00, 33.90it/s]


[Epoch 6/10] Loss: 0.0007


100%|██████████| 2560/2560 [01:15<00:00, 33.92it/s]


[Epoch 7/10] Loss: 0.0006


100%|██████████| 2560/2560 [01:21<00:00, 31.45it/s]


[Epoch 8/10] Loss: 0.0005


100%|██████████| 2560/2560 [01:22<00:00, 31.07it/s]


[Epoch 9/10] Loss: 0.0005


100%|██████████| 2560/2560 [01:21<00:00, 31.58it/s]

[Epoch 10/10] Loss: 0.0004
🎉 학습 완료!





In [32]:
# 여행 정보
def process_travel_input(travel_info:dict):
    from datetime import datetime
    travel_feature_cols = [
        'TOTAL_COST_BINNED_ENCODED',
        'WITH_PET',
        'MONTH',
        'DURATION',
        'MVMN_기타',
        'MVMN_대중교통',
        'MVMN_자가용',
        'TRAVEL_PURPOSE_1',
        'TRAVEL_PURPOSE_2',
        'TRAVEL_PURPOSE_3',
        'TRAVEL_PURPOSE_4',
        'TRAVEL_PURPOSE_5',
        'TRAVEL_PURPOSE_6',
        'TRAVEL_PURPOSE_7',
        'TRAVEL_PURPOSE_8',
        'TRAVEL_PURPOSE_9',
        'WHOWITH_2인여행',
        'WHOWITH_가족여행',
        'WHOWITH_기타',
        'WHOWITH_단독여행',
        'WHOWITH_친구/지인 여행']
    
    
    # mission_ENC에 0 = 반려동물 동반 (WITH_PET)
    travel_info['mission_ENC'] = travel_info['mission_ENC'].strip().split(',')
    if '0' in travel_info['mission_ENC']:
        travel_info['WITH_PET'] = 1
    else:
        travel_info['WITH_PET'] = 0
        
    # TRAVEL_PURPOSE_1 ~~ TRAVEL_PURPOSE_9 (0으로 들어온 입력은 제거해줘야됨) 
    for i in range(1,10):
        if str(i) in travel_info['mission_ENC']:
            travel_info[f'TRAVEL_PURPOSE_{i}'] = 1
        else:
            travel_info[f'TRAVEL_PURPOSE_{i}'] = 0
        
    # MONTH
    dates = travel_info['date_range'].split(' - ')
    travel_info['start_date'] = datetime.strptime(dates[0].strip(), "%Y-%m-%d")
    travel_info['end_date'] = datetime.strptime(dates[1].strip(), "%Y-%m-%d")
    
    travel_info['MONTH'] = travel_info['end_date'].month
    
    # DURATION
    travel_info['DURATION'] = (travel_info['end_date'] - travel_info['start_date']).days
    
    # MNVM_기타, MVMN_대중교통, MVMN_자가용
    for m in ['자가용', '대중교통', '기타']:
        travel_info[f"MVMN_{m}"] = False
    
    if travel_info['MVMN_NM_ENC'] == '1':
        travel_info['MVMN_자가용'] = True
    elif travel_info['MVMN_NM_ENC'] == '2':
        travel_info['MVMN_대중교통'] = True
    else:
        travel_info['MVMN_기타'] = True
    
    # WHOWITH는 1부터 5까지 숫자로 들어옴 -> 원핫 인코딩으로 수정할 것
    # dict에 들어오는 숫자 의미: WHOWITH_단독여행, WHOWITH_2인여행, WHOWITH_가족여행, WHOWITH_친구/지인여행, WHOWITH_기타
    whowith_onehot = [0] * 5
    idx = int(travel_info['whowith_ENC']) - 1
    if 0 <= idx < 5:
        whowith_onehot[idx] = 1
    
    travel_info.update({
    'WHOWITH_단독여행': whowith_onehot[0],
    'WHOWITH_2인여행': whowith_onehot[1],
    'WHOWITH_가족여행': whowith_onehot[2],
    'WHOWITH_친구/지인 여행': whowith_onehot[3],
    'WHOWITH_기타': whowith_onehot[4],
    })
    
    # TOTAL_COST_BINNED_ENCODED
    travel_info['TOTAL_COST_BINNED_ENCODED'] = travel_info['TOTAL_COST'][-1]
    
    # 컬럼 필터링 (순서에 맞게)
    travel_info = {k: int(travel_info[k]) for k in travel_feature_cols}
    
    return pd.DataFrame([travel_info]).fillna(0).astype(np.float32).to_numpy()

In [59]:
import numpy as np
import torch

# 여행 정보 전처리
test_travel = {
    'mission_ENC': '0,1',
    'date_range': '2025-09-28 - 2025-10-31',
    'start_date': '',
    'end_date': '',
    'TOTAL_COST': '1',
    'MVMN_NM_ENC': '2',
    'whowith_ENC': '1',
    'mission_type': 'normal'
}
test_travel_tensor = process_travel_input(test_travel)
test_travel_tensor = torch.tensor(test_travel_tensor, dtype=torch.float32).to(device)

# GNN 추론
model.eval()
with torch.no_grad():
    predicted_visit_area_embeddings = model(data, test_travel_tensor)  # (21384, 34)

# 추천 top-10
scores = predicted_visit_area_embeddings.norm(dim=1)
topk_indices = torch.topk(scores, k=10).indices
topk_recommend_area_ids = [visit_area_df.iloc[idx]["VISIT_AREA_ID"] for idx in topk_indices.tolist()]
print("✅ 초기 추천 top-10:", topk_recommend_area_ids)

# 랜덤으로 싫어요 표시
import random
num_dislike = 3
disliked_area_ids = random.sample(topk_recommend_area_ids, k=num_dislike)
print("🚫 랜덤으로 선택된 싫어요 장소 ID:", disliked_area_ids)

# start/middle/end 자동 결정 & 좌표 기반 best index 찾기
def find_best_replacement(disliked_rows, prev_coords, next_coords, mode):
    best_idx = None
    min_distance = float('inf')
    
    for idx, row in disliked_rows.iterrows():
        x = row["X_COORD"]
        y = row["Y_COORD"]
        
        if mode == "start":
            dist_next = np.sqrt((x - next_coords[0])**2 + (y - next_coords[1])**2)
            total_dist = dist_next
        elif mode == "end":
            dist_prev = np.sqrt((x - prev_coords[0])**2 + (y - prev_coords[1])**2)
            total_dist = dist_prev
        else:  # middle
            dist_prev = np.sqrt((x - prev_coords[0])**2 + (y - prev_coords[1])**2)
            dist_next = np.sqrt((x - next_coords[0])**2 + (y - next_coords[1])**2)
            total_dist = dist_prev + dist_next
        
        if total_dist < min_distance:
            min_distance = total_dist
            best_idx = idx
    return best_idx


for disliked_id in disliked_area_ids:
    disliked_rows = visit_area_df[visit_area_df["VISIT_AREA_ID"] == disliked_id]
    
    dislike_loc = topk_recommend_area_ids.index(disliked_id)
    
    mode = "start" if dislike_loc == 0 else "end" if dislike_loc == len(topk_indices) - 1 else "middle"
    
    if mode == 'start':        
        prev_coords = visit_area_df[visit_area_df["VISIT_AREA_ID"] == disliked_id][['X_COORD', 'Y_COORD']].values.tolist()[0]
        next_coords = visit_area_df[visit_area_df["VISIT_AREA_ID"] == topk_recommend_area_ids[dislike_loc+1]][['X_COORD', 'Y_COORD']].values.tolist()[0]
    elif mode == 'end':
        prev_coords = visit_area_df[visit_area_df["VISIT_AREA_ID"] == topk_recommend_area_ids[dislike_loc-1]][['X_COORD', 'Y_COORD']].values.tolist()[0]
        next_coords = visit_area_df[visit_area_df["VISIT_AREA_ID"] == disliked_id][['X_COORD', 'Y_COORD']].values.tolist()[0]
    else:
        prev_coords = visit_area_df[visit_area_df["VISIT_AREA_ID"] == topk_recommend_area_ids[dislike_loc-1]][['X_COORD', 'Y_COORD']].values.tolist()[0]
        next_coords = visit_area_df[visit_area_df["VISIT_AREA_ID"] == topk_recommend_area_ids[dislike_loc+1]][['X_COORD', 'Y_COORD']].values.tolist()[0]
    print(prev_coords, next_coords)
    best_idx = find_best_replacement(disliked_rows, prev_coords, next_coords, mode)
    
    disliked_emb = predicted_visit_area_embeddings[best_idx]
    distances = torch.norm(predicted_visit_area_embeddings - disliked_emb, dim=1)
    distances[best_idx] = 1e9  # 자기 자신 제외
    
    replacement_idx = torch.argmin(distances).item()
    replacement_id = visit_area_df.iloc[replacement_idx]["VISIT_AREA_ID"]
    
    print(f"🌀 싫어요 장소 {disliked_id} ({mode}) → 대체 추천 {replacement_id}")

# 싫어요 반영 후 top-10 추천
disliked_indices = visit_area_df[visit_area_df["VISIT_AREA_ID"].isin(disliked_area_ids)].index.tolist()
scores_post = scores.clone()
scores_post[disliked_indices] = -1e9  # 제외

topk_indices_post = torch.topk(scores_post, k=10).indices
topk_recommend_area_ids_post = [visit_area_df.iloc[idx]["VISIT_AREA_ID"] for idx in topk_indices_post.tolist()]
print("✨ 싫어요 반영 후 추천 top-10:", topk_recommend_area_ids_post)


✅ 초기 추천 top-10: [2305270002, 2308140010, 2305290005, 2305250003, 2308260005, 2307010002, 2309010006, 2306030006, 2307020002, 2305240004]
🚫 랜덤으로 선택된 싫어요 장소 ID: [2308140010, 2306030006, 2307020002]
[126.492769, 33.507079] [126.907292, 37.5160148]
🌀 싫어요 장소 2308140010 (middle) → 대체 추천 2305270002
[126.9780638, 37.6587628] [127.02502436907358, 37.476058309968245]
🌀 싫어요 장소 2306030006 (middle) → 대체 추천 2308250004
[127.5308669, 37.5129607] [127.02502436907358, 37.476058309968245]
🌀 싫어요 장소 2307020002 (middle) → 대체 추천 2305280005
✨ 싫어요 반영 후 추천 top-10: [2305270002, 2305290005, 2305250003, 2308260005, 2307010002, 2309010006, 2305240004, 2305270002, 2307080004, 2308270002]
