In [165]:
import os
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder

# 경로 설정
data_dir = "./data/VL_csv/"

# CSV 파일 경로
t_path = lambda name: os.path.join(data_dir, name)

# CSV 로딩 함수
def load_data():
    traveler_df = pd.read_csv(t_path("tn_traveller_master_여행객 Master_E.csv"))
    place_df = pd.read_csv(t_path("tn_visit_area_info_방문지정보_E.csv"))
    activity_df = pd.read_csv(t_path("tn_activity_his_활동내역_E.csv"))
    lodge_df = pd.read_csv(t_path("tn_lodge_consume_his_숙박소비내역_E.csv"))
    move_df = pd.read_csv(t_path("tn_move_his_이동내역_E.csv"))
    mvmn_consume_df = pd.read_csv(t_path("tn_mvmn_consume_his_이동수단소비내역_E.csv"))
    travel_df = pd.read_csv(t_path("tn_travel_여행_E.csv"))
    return traveler_df, place_df, activity_df, lodge_df, move_df, mvmn_consume_df, travel_df

In [166]:
# 지금 위치 확인
current_dir = os.getcwd()
print(f"Current Directory: {current_dir}")
print(f"Data Directory: {data_dir}")
print(f"Data Exist: {os.path.exists(t_path('tn_visit_area_info_방문지정보_E.csv'))}")

Current Directory: c:\Users\daeho\OneDrive\문서\GitHub\GNN_Recommend
Data Directory: ./data/VL_csv/
Data Exist: True


In [167]:
# 범주형 문자 → 숫자 매핑
def encode_column(series):
    le = LabelEncoder()
    return le.fit_transform(series.astype(str)), le

def preprocess_traveler(df):
    gender_map = {'남': 0, '여': 1}
    df['GENDER'] = df['GENDER'].map(gender_map).fillna(-1)
    return df

In [168]:
def build_hetero_graph():
    traveler_df, place_df, activity_df, lodge_df, move_df, mvmn_consume_df, travel_df = load_data()

    data = HeteroData()

    # 여행자 노드
    traveler_df = traveler_df.drop_duplicates(subset="TRAVELER_ID")
    traveler_df = preprocess_traveler(traveler_df)
    traveler_ids, traveler_encoder = encode_column(travel_df["TRAVELER_ID"])
    traveler_feats = traveler_df[["AGE_GRP", "GENDER", "MARR_STTS"]].fillna(0).astype(float)
    data['traveler'].x = torch.tensor(traveler_feats.values, dtype=torch.float)

    # 장소 노드 (VISIT_AREA_ID)
    place_ids, place_encoder = encode_column(place_df["VISIT_AREA_ID"])
    place_feats = place_df[["VISIT_ORDER"]].fillna(0).astype(float)
    data['place'].x = torch.tensor(place_feats.values, dtype=torch.float)

    # 활동 노드 (ACTIVITY_TYPE_CD 기준)
    activity_df = activity_df.drop_duplicates(subset=["TRAVEL_ID", "ACTIVITY_TYPE_CD"])
    activity_ids, activity_encoder = encode_column(activity_df["ACTIVITY_TYPE_CD"])
    activity_feats = activity_df[["ACTIVITY_TYPE_SEQ"]].fillna(0).astype(float)
    data['activity'].x = torch.tensor(activity_feats.values, dtype=torch.float)

    # 숙소 노드 (LODGING_TYPE_CD 기준)
    lodge_df = lodge_df.drop_duplicates(subset=["TRAVEL_ID", "LODGING_TYPE_CD"])
    lodge_ids, lodge_encoder = encode_column(lodge_df["LODGING_TYPE_CD"])
    lodge_feats = lodge_df[["LODGING_PAYMENT_SEQ"]].fillna(0).astype(float)
    data['lodge'].x = torch.tensor(lodge_feats.values, dtype=torch.float)

    # 여행 ID - 여행자 매핑 테이블
    travel_mapping = travel_df[['TRAVEL_ID', 'TRAVELER_ID']].dropna()

    # 엣지: 여행자 → 장소
    edge_df = place_df.merge(travel_mapping, on='TRAVEL_ID')
    edge_df = edge_df[edge_df['TRAVELER_ID'].astype(str).isin(traveler_encoder.classes_)]
    src = traveler_encoder.transform(edge_df['TRAVELER_ID'].astype(str))
    dst = place_encoder.transform(edge_df['VISIT_AREA_ID'].astype(str))
    data['traveler', 'visited', 'place'].edge_index = torch.tensor([src, dst], dtype=torch.long)

    # 엣지: 여행자 → 활동
    edge_df = activity_df.merge(travel_mapping, on='TRAVEL_ID')
    src = traveler_encoder.transform(edge_df['TRAVELER_ID'].astype(str))
    dst = activity_encoder.transform(edge_df['ACTIVITY_TYPE_CD'].astype(str))
    data['traveler', 'did', 'activity'].edge_index = torch.tensor([src, dst], dtype=torch.long)

    # 엣지: 여행자 → 숙소
    edge_df = lodge_df.merge(travel_mapping, on='TRAVEL_ID')
    src = traveler_encoder.transform(edge_df['TRAVELER_ID'])
    dst = lodge_encoder.transform(edge_df['LODGING_TYPE_CD'].astype(str))
    data['traveler', 'stayed_in', 'lodge'].edge_index = torch.tensor([src, dst], dtype=torch.long)

    # 엣지: 장소 → 장소 (이동 정보 기반 동선 연결)
    move_df = move_df.dropna(subset=["START_VISIT_AREA_ID", "END_VISIT_AREA_ID"])
    src = place_encoder.transform(move_df["START_VISIT_AREA_ID"].astype(str))
    dst = place_encoder.transform(move_df["END_VISIT_AREA_ID"].astype(str))
    data['place', 'move_to', 'place'].edge_index = torch.tensor([src, dst], dtype=torch.long)

     # traveler 노드 self-loop 추가
    num_travelers = data['traveler'].x.size(0)
    self_loop_src = torch.arange(num_travelers, dtype=torch.long)
    self_loop_dst = torch.arange(num_travelers, dtype=torch.long)
    data['traveler', 'self_loop', 'traveler'].edge_index = torch.stack([self_loop_src, self_loop_dst], dim=0)


    # 엣지 feature: 이동 수단 거리 or 비용
    if 'DSTNC' in move_df.columns:
        edge_attr = torch.tensor(move_df['DSTNC'].fillna(0).values, dtype=torch.float).unsqueeze(1)
        data['place', 'move_to', 'place'].edge_attr = edge_attr

    return data, place_encoder, traveler_encoder, activity_encoder, lodge_encoder

In [169]:
hetero_data, place_encoder, traveler_encoder, activity_encoder, lodge_encoder = build_hetero_graph()
print(hetero_data)

HeteroData(
  traveler={ x=[320, 3] },
  place={ x=[2770, 1] },
  activity={ x=[1348, 1] },
  lodge={ x=[90, 1] },
  (traveler, visited, place)={ edge_index=[2, 2770] },
  (traveler, did, activity)={ edge_index=[2, 1348] },
  (traveler, stayed_in, lodge)={ edge_index=[2, 90] },
  (place, move_to, place)={ edge_index=[2, 0] },
  (traveler, self_loop, traveler)={ edge_index=[2, 320] }
)


# GNN 모델 설계

In [170]:
import torch
import torch.nn as nn
from torch_geometric.nn import HeteroConv, GATConv, SAGEConv, Linear

In [171]:
class GNNRecommender(torch.nn.Module):
    def __init__(self, metadata, hidden_channels=32, out_channels=16):
        super(GNNRecommender, self).__init__()
        self.metadata = metadata

        # HeteroConv 1층
        self.conv1 = HeteroConv({
            ('traveler', 'self_loop', 'traveler'): GATConv((-1, -1), hidden_channels, add_self_loops=True),
            ('traveler', 'visited', 'place'): GATConv((-1, -1), hidden_channels, add_self_loops=False),
            ('traveler', 'did', 'activity'): GATConv((-1, -1), hidden_channels, add_self_loops=False),
            ('traveler', 'stayed_in', 'lodge'): GATConv((-1, -1), hidden_channels, add_self_loops=False),
            ('place', 'move_to', 'place'): GATConv((-1, -1), hidden_channels, add_self_loops=False),
        }, aggr='sum')

        # 노드별 선형 변환
        self.lin_dict = nn.ModuleDict({
            node_type: Linear(hidden_channels, out_channels)
            for node_type in metadata[0]
        })

    def forward(self, x_dict, edge_index_dict):
        # HeteroConv 적용
        x_dict_updated = self.conv1(x_dict, edge_index_dict)

        # 선형 변환 및 활성화 함수 적용
        out_dict = {}
        for node_type, x in x_dict_updated.items():
            out_dict[node_type] = self.lin_dict[node_type](x.relu())
        return out_dict

In [172]:
metadata = hetero_data.metadata()
model = GNNRecommender(metadata)
out = model(hetero_data.x_dict, hetero_data.edge_index_dict)
print(out)

{'traveler': tensor([[ 1.7106, -1.9376,  1.3860,  ...,  0.8402, -0.5126,  0.1970],
        [ 2.5951, -2.9942,  2.1778,  ...,  1.4861, -0.8253,  0.3602],
        [ 2.5951, -2.9942,  2.1778,  ...,  1.4861, -0.8253,  0.3602],
        ...,
        [ 2.7170, -2.9505,  2.2162,  ...,  1.3829, -0.9025,  0.4142],
        [ 2.5951, -2.9942,  2.1778,  ...,  1.4861, -0.8253,  0.3602],
        [ 1.8155, -2.0093,  1.4891,  ...,  1.0361, -0.6694,  0.2886]],
       grad_fn=<AddmmBackward0>), 'place': tensor([[-1.0511,  3.2328, -0.6404,  ..., -7.5083,  2.6533,  0.3972],
        [-1.0511,  3.2328, -0.6404,  ..., -7.5083,  2.6533,  0.3972],
        [-1.0511,  3.2328, -0.6404,  ..., -7.5083,  2.6533,  0.3972],
        ...,
        [ 0.0623, -0.1268, -0.0867,  ...,  0.1749, -0.1217, -0.0128],
        [ 0.0623, -0.1268, -0.0867,  ...,  0.1749, -0.1217, -0.0128],
        [ 0.0623, -0.1268, -0.0867,  ...,  0.1749, -0.1217, -0.0128]],
       grad_fn=<AddmmBackward0>), 'activity': tensor([[ 2.9149,  0.8843,  6.

In [173]:
# gnn_train.py - GNN 학습 루프
import torch
from torch.nn import BCEWithLogitsLoss
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.utils import negative_sampling

# 학습 함수 정의
def train_gnn(data, epochs=10, lr=0.005):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 모델 생성 및 이동
    model = GNNRecommender(data.metadata()).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = BCEWithLogitsLoss()

    # 훈련용 edge (링크 예측 대상): traveler → place
    edge_index = data['traveler', 'visited', 'place'].edge_index
    num_nodes_traveler = data['traveler'].num_nodes
    num_nodes_place = data['place'].num_nodes

    for epoch in range(epochs):
        model.train()

        # 양성 샘플 (실제 방문)
        pos_edge = edge_index.t()

        # 음성 샘플 (방문 안한 곳 랜덤 샘플링)
        neg_edge = negative_sampling(
            edge_index=edge_index,
            num_nodes=(num_nodes_traveler, num_nodes_place),
            num_neg_samples=pos_edge.size(0),
            method='sparse').t()

        # 병합
        edge_label_index = torch.cat([pos_edge, neg_edge], dim=0)
        edge_label = torch.cat([
            torch.ones(pos_edge.size(0)),
            torch.zeros(neg_edge.size(0))
        ], dim=0).to(device)

        # forward
        x_dict = {k: v.to(device) for k, v in data.x_dict.items()}
        edge_index_dict = {k: v.to(device) for k, v in data.edge_index_dict.items()}
        out_dict = model(x_dict, edge_index_dict)

        src_emb = out_dict.get('traveler', x_dict['traveler'])[edge_label_index[:, 0]]
        dst_emb = out_dict['place'][edge_label_index[:, 1]]
        print(src_emb.shape, dst_emb.shape)
        pred = (src_emb * dst_emb).sum(dim=-1)
        loss = criterion(pred, edge_label)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"[Epoch {epoch+1}] Loss: {loss.item():.4f}")

    return model


In [174]:
def train_gnn(data, epochs=10, lr=0.005):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 모델 생성 및 이동
    model = GNNRecommender(data.metadata(), hidden_channels=32, out_channels=16).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = BCEWithLogitsLoss()

    # 훈련용 edge (링크 예측 대상): traveler → place
    edge_index = data['traveler', 'visited', 'place'].edge_index.to(device)
    num_nodes_traveler = data['traveler'].x.size(0)
    num_nodes_place = data['place'].x.size(0)

    for epoch in range(epochs):
        model.train()

        # 양성 샘플 (실제 방문)
        pos_edge = edge_index.t()

        # 음성 샘플 (방문 안한 곳 랜덤 샘플링)
        neg_edge = negative_sampling(
            edge_index=edge_index,
            num_nodes=(num_nodes_traveler, num_nodes_place),
            num_neg_samples=pos_edge.size(0),
            method='sparse').t()

        # 병합
        edge_label_index = torch.cat([pos_edge, neg_edge], dim=0)
        edge_label = torch.cat([
            torch.ones(pos_edge.size(0), device=device),
            torch.zeros(neg_edge.size(0), device=device)
        ], dim=0)

        # forward
        x_dict = {k: v.to(device) for k, v in data.x_dict.items()}
        edge_index_dict = {k: v.to(device) for k, v in data.edge_index_dict.items()}
        out_dict = model(x_dict, edge_index_dict)
        
        # traveler와 place 노드의 임베딩 추출
        src_emb = out_dict['traveler'][edge_label_index[:, 0]]
        dst_emb = out_dict['place'][edge_label_index[:, 1]]

        # 점수 계산 및 손실 함수 적용
        pred = (src_emb * dst_emb).sum(dim=-1)
        loss = criterion(pred, edge_label)

        # 역전파 및 최적화
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"[Epoch {epoch+1}] Loss: {loss.item():.4f}")

    return model

In [175]:
model = train_gnn(hetero_data, epochs=20, lr=0.01)

[Epoch 1] Loss: 17.5068
[Epoch 2] Loss: 7.1169
[Epoch 3] Loss: 9.7600
[Epoch 4] Loss: 9.5770
[Epoch 5] Loss: 7.8286
[Epoch 6] Loss: 5.3115
[Epoch 7] Loss: 2.5463
[Epoch 8] Loss: 1.1627
[Epoch 9] Loss: 2.5066
[Epoch 10] Loss: 0.5967
[Epoch 11] Loss: 1.0457
[Epoch 12] Loss: 1.4552
[Epoch 13] Loss: 1.5375
[Epoch 14] Loss: 1.2705
[Epoch 15] Loss: 1.0298
[Epoch 16] Loss: 0.6895
[Epoch 17] Loss: 0.6087
[Epoch 18] Loss: 1.0459
[Epoch 19] Loss: 0.8062
[Epoch 20] Loss: 0.5370


In [180]:
def recommend_topk_places(model, data, traveler_id, place_encoder, top_k=5):
    device = next(model.parameters()).device
    model.eval()

    # traveler ID 인덱스 변환
    traveler_idx = traveler_encoder.transform([traveler_id])[0]

    # 데이터 준비
    x_dict = {k: v.to(device) for k, v in data.x_dict.items()}
    edge_index_dict = {k: v.to(device) for k, v in data.edge_index_dict.items()}

    with torch.no_grad():
        out_dict = model(x_dict, edge_index_dict)

        # traveler 임베딩
        traveler_emb = out_dict['traveler'][traveler_idx].unsqueeze(0)  # shape [1, d]
        place_emb = out_dict['place']  # shape [N, d]

        print("Traveler embedding sample:", traveler_emb[0][:5])
        print("Place embedding variance:", place_emb.var(dim=0).mean())

        # 내적 기반 유사도 계산
        scores = (traveler_emb @ place_emb.T).squeeze(0)  # shape [N]
        topk = torch.topk(scores, k=top_k)

        top_indices = topk.indices.cpu().tolist()
        top_scores = topk.values.cpu().tolist()

    return list(zip(top_indices, top_scores))

In [181]:
result = recommend_topk_places(model, hetero_data, 'e000297', place_encoder, top_k=5)
for place_idx, score in result:
    print(f"추천 장소 ID: {place_idx}, 점수: {score:.4f}")

Traveler embedding sample: tensor([-0.9857, -0.1818,  0.1061,  0.0718,  0.2792], device='cuda:0')
Place embedding variance: tensor(0.3773, device='cuda:0')
추천 장소 ID: 779, 점수: 5.5825
추천 장소 ID: 98, 점수: 5.5825
추천 장소 ID: 780, 점수: 5.5825
추천 장소 ID: 782, 점수: 5.5825
추천 장소 ID: 781, 점수: 5.5825


In [182]:
import pandas as pd

place_df = pd.read_csv("./data/VL_CSV/tn_visit_area_info_방문지정보_E.csv")
id_to_place = dict(zip(place_encoder.transform(place_df['VISIT_AREA_ID'].astype(str)), place_df['VISIT_AREA_NM']))

for idx, score in result:
    place_name = id_to_place.get(idx, "Unknown")
    print(f"추천 장소: {place_name} (인덱스 {idx}), 점수: {score:.4f}")

추천 장소: 아우어 베이커리 하남 스타필드점 (인덱스 779), 점수: 5.5825
추천 장소: 허 서방 불고기 냉면 (인덱스 98), 점수: 5.5825
추천 장소: 집 (인덱스 780), 점수: 5.5825
추천 장소: 이제 제 하남 스타필드점 (인덱스 782), 점수: 5.5825
추천 장소: 안스 베이커리 스타필드 하남점 (인덱스 781), 점수: 5.5825
