## 데이터 다운로드

In [5]:
import kagglehub
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Download latest version
path = kagglehub.dataset_download("andrezaza/clapper-massive-rotten-tomatoes-movies-and-reviews")
path = path.replace('\\', '/')

movie_df = pd.read_csv(os.path.join(path, os.listdir(path)[0]))
review_df = pd.read_csv(os.path.join(path, os.listdir(path)[1]))

print("Path to dataset files:", path)
print(movie_df.shape)
print(review_df.shape)

Path to dataset files: /Users/visuworks/.cache/kagglehub/datasets/andrezaza/clapper-massive-rotten-tomatoes-movies-and-reviews/versions/4
(143258, 16)
(1444963, 11)


In [8]:
# 영화는 리뷰 개수가 상위 10%인 영화만 사용, 56개 이상
tmp = review_df['id'].value_counts() >= 56
filtered_movie_ids = tmp[tmp].index.tolist()

# 사용자는 작성한 리뷰가 4개 이상인 사용자만 사용
tmp = review_df['criticName'].value_counts() >= 4
filtered_review_ids = tmp[tmp].index.tolist()

print(f"# of filtered_movie_ids: {len(filtered_movie_ids)}")
print(f"# of filtered_review_ids: {len(filtered_review_ids)}")

# of filtered_movie_ids: 7002
# of filtered_review_ids: 7933


In [9]:
# filtered_movie_ids를 set으로 변환하여 검색 속도 최적화
filtered_movie_ids = set(filtered_movie_ids)
filtered_review_ids = set(filtered_review_ids)

filtered_review_df = review_df[review_df['id'].apply(lambda x: x in filtered_movie_ids)].reset_index(drop=True)
filtered_review_df = filtered_review_df[filtered_review_df['criticName'].apply(lambda x: x in filtered_review_ids)].reset_index(drop=True)

In [10]:
import kagglehub
import os
import pandas as pd

def get_filtered_df():
    # Download latest version
    path = kagglehub.dataset_download("andrezaza/clapper-massive-rotten-tomatoes-movies-and-reviews")
    path = path.replace('\\', '/')

    movie_df = pd.read_csv(os.path.join(path, os.listdir(path)[0]))
    review_df = pd.read_csv(os.path.join(path, os.listdir(path)[1]))
    
    # 영화는 리뷰 개수가 상위 10%인 영화만 사용, 56개 이상
    tmp = review_df['id'].value_counts() >= 56
    filtered_movie_ids = tmp[tmp].index.tolist()

    # 사용자는 작성한 리뷰가 4개 이상인 사용자만 사용
    tmp = review_df['criticName'].value_counts() >= 4
    filtered_review_ids = tmp[tmp].index.tolist()
    
    # filtered_movie_ids를 set으로 변환하여 검색 속도 최적화
    filtered_movie_ids = set(filtered_movie_ids)
    filtered_review_ids = set(filtered_review_ids)

    filtered_review_df = review_df[review_df['id'].apply(lambda x: x in filtered_movie_ids)].reset_index(drop=True)
    filtered_review_df = filtered_review_df[filtered_review_df['criticName'].apply(lambda x: x in filtered_review_ids)].reset_index(drop=True)
    
    scoreSentiment_map = {"POSITIVE": 1, "NEGATIVE": 0}

    # map 메서드 사용
    filtered_review_df['scoreSentiment'] = filtered_review_df['scoreSentiment'].map(scoreSentiment_map)

    return filtered_review_df

## 데이터 나누기

In [11]:
import pandas as pd
import numpy as np

def split_data(df, user_col, item_col, train_ratio=0.6, val_ratio=0.3, test_ratio=0.1, random_state=42):
    """
    Splits a DataFrame into train, validation, and test sets by a 6:3:1 ratio.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing user and item interactions.
    - user_col (str): The column name representing user IDs.
    - item_col (str): The column name representing item (movie) IDs.
    - train_ratio (float): Proportion of the dataset to include in the train split.
    - val_ratio (float): Proportion of the dataset to include in the validation split.
    - test_ratio (float): Proportion of the dataset to include in the test split.
    - random_state (int): Random seed for reproducibility.

    Returns:
    - train_df (pd.DataFrame): Training data.
    - val_df (pd.DataFrame): Validation data.
    - test_df (pd.DataFrame): Testing data.
    """
    # Ensure the ratios add up to 1
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum up to 1."

    train_list, val_list, test_list = [], [], []

    # Group by user to split data per user
    for user, group in df.groupby(user_col):
        # Shuffle the group
        group = group.sample(frac=1, random_state=random_state)
        n = len(group)
        
        # Calculate split indices
        train_end = int(n * train_ratio)
        val_end = train_end + int(n * val_ratio)
        
        # Split data
        train_data = group.iloc[:train_end]
        val_data = group.iloc[train_end:val_end]
        test_data = group.iloc[val_end:]

        # Append to respective lists
        train_list.append(train_data)
        val_list.append(val_data)
        test_list.append(test_data)

    # Concatenate all splits
    train_df = pd.concat(train_list).reset_index(drop=True)
    val_df = pd.concat(val_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True)

    return train_df, val_df, test_df

filtered_review_df = get_filtered_df()
train_df, val_df, test_df = split_data(filtered_review_df, user_col='criticName', item_col='id')

print("Train DataFrame:")
print(train_df.shape)
print("\nValidation DataFrame:")
print(val_df.shape)
print("\nTest DataFrame:")
print(test_df.shape)


Train DataFrame:
(555578, 11)

Validation DataFrame:
(275831, 11)

Test DataFrame:
(99686, 11)


# 모델 빌드하기

## GraphSAGE

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, HeteroConv
from torch_geometric.loader import DataLoader
from torch_geometric.data import HeteroData

In [16]:
user_col = "criticName"
item_col = "id"
label_col = "scoreSentiment"
df = filtered_review_df

num_movie_features = 64
num_review_features = 64

data = HeteroData()

# Create two node types "paper" and "author" holding a feature matrix:
data['movie'].x = torch.randn(df[item_col].nunique(), num_movie_features)
data['reviewer'].x = torch.randn(df[user_col].nunique(), num_review_features)

# Create an edge type "(author, writes, paper)" and building the
# graph connectivity:
# Map user and item IDs to unique indices for graph processing
unique_users = df[user_col].unique()
unique_items = df[item_col].unique()

user_map = {user: idx for idx, user in enumerate(unique_users)}
item_map = {item: idx + len(unique_users) for idx, item in enumerate(unique_items)}

# Convert edges to indices
edge_index = []
user_indices = df[user_col].map(user_map).values
item_indices = df[item_col].map(item_map).values
edge_index.append(torch.tensor([user_indices, item_indices], dtype=torch.long))
    
edge_index = torch.cat(edge_index, dim=1)  # Combine edges from train, val, test
data['movie', 'rates', 'reviewer'].edge_index = edge_index  # [2, num_edges]
data['movie', 'rates', 'reviewer'].rating = torch.tensor(df[label_col])  # float 타입

data['reviewer', 'rates', 'movie'].edge_index = data['movie', 'rates', 'reviewer'].edge_index[[1, 0]]
data['reviewer', 'rates', 'movie'].rating = torch.tensor(df[label_col])  # float 타입

In [17]:
data

HeteroData(
  movie={ x=[7002, 64] },
  reviewer={ x=[7494, 64] },
  (movie, rates, reviewer)={
    edge_index=[2, 931095],
    rating=[931095],
  },
  (reviewer, rates, movie)={
    edge_index=[2, 931095],
    rating=[931095],
  }
)

In [18]:
edge_index = []
tr_user_indices = train_df[user_col].map(user_map).values
tr_item_indices = train_df[item_col].map(item_map).values
edge_index.append(torch.tensor([tr_user_indices, tr_item_indices], dtype=torch.long))
tr_edge_index = torch.cat(edge_index, dim=1)  # Combine edges from train, val, test

In [19]:
data['movie', 'rates', 'reviewer'].edge_index

tensor([[    0,     1,     2,  ...,  5513,  1273,   115],
        [ 7494,  7494,  7494,  ..., 14495, 14495, 14495]])

In [20]:
tr_edge_index[0][0], tr_edge_index[1][0]

(tensor(5870), tensor(9546))

In [21]:
import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.nn import HeteroConv, SAGEConv

# HeteroGraphSAGE 정의
class HeteroGraphSAGE(torch.nn.Module):
    def __init__(self, metadata, hidden_channels, out_channels):
        super(HeteroGraphSAGE, self).__init__()
        # 첫 번째 HeteroConv 레이어 정의
        self.conv1 = HeteroConv({
            ('movie', 'rates', 'reviewer'): SAGEConv((-1, -1), hidden_channels),
            ('reviewer', 'rates', 'movie'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='mean')

        # 두 번째 HeteroConv 레이어 정의
        self.conv2 = HeteroConv({
            ('movie', 'rates', 'reviewer'): SAGEConv((hidden_channels, hidden_channels), out_channels),
            ('reviewer', 'rates', 'movie'): SAGEConv((hidden_channels, hidden_channels), out_channels),
        }, aggr='mean')

        # 최종 분류 레이어 (이진 분류)
        self.fc = Linear(out_channels, 1)

    def forward(self, x_dict, edge_index_dict):
        # 첫 번째 GraphSAGE 레이어
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # 두 번째 GraphSAGE 레이어
        x_dict = self.conv2(x_dict, edge_index_dict)

        # 엣지 표현 계산 (user -> item)
        user_item_edge_index = edge_index_dict[('movie', 'rates', 'reviewer')]
        user_repr = x_dict['movie'][user_item_edge_index[0]]
        item_repr = x_dict['reviewer'][user_item_edge_index[1]]

        # 엣지별 예측값 계산
        edge_repr = user_repr + item_repr
        edge_prediction = torch.sigmoid(self.fc(edge_repr).squeeze(-1))  # Sigmoid for binary classification

        return edge_prediction
    
    
model = HeteroGraphSAGE(
        metadata=data.metadata(),  # HeteroData의 메타데이터 사용
        hidden_channels=128,
        out_channels=64
    )

In [22]:
data

HeteroData(
  movie={ x=[7002, 64] },
  reviewer={ x=[7494, 64] },
  (movie, rates, reviewer)={
    edge_index=[2, 931095],
    rating=[931095],
  },
  (reviewer, rates, movie)={
    edge_index=[2, 931095],
    rating=[931095],
  }
)

In [23]:
model

HeteroGraphSAGE(
  (conv1): HeteroConv(num_relations=2)
  (conv2): HeteroConv(num_relations=2)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

In [24]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

model = model.to(device)
data = data.to(device)

In [25]:
# 데이터 추출
x_dict = data.x_dict
edge_index_dict = data.edge_index_dict
rating = data['movie', 'rates', 'reviewer'].rating.float().to(device)
pred = model(x_dict, edge_index_dict)

In [26]:
pred.shape

torch.Size([931095])

In [27]:
# 데이터 준비
rating = data['movie', 'rates', 'reviewer'].rating

train_edge_index = torch.arange(int(0.6 * len(rating)))
val_edge_index = torch.arange(int(0.6 * len(rating)), int(0.9 * len(rating)))
test_edge_index = torch.arange(int(0.9 * len(rating)), len(rating))

train_labels = data['movie', 'rates', 'reviewer'].rating[train_edge_index].float().to(device)
val_labels = data['movie', 'rates', 'reviewer'].rating[val_edge_index].float().to(device)
test_labels = data['movie', 'rates', 'reviewer'].rating[test_edge_index].float().to(device)

In [30]:
epochs = 100
lr = 0.01

model = HeteroGraphSAGE(
        metadata=data.metadata(),  # HeteroData의 메타데이터 사용
        hidden_channels=128,
        out_channels=64
    ).to(device)

model = model.to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = torch.nn.BCELoss()  # Binary Cross Entropy Loss

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    pred = model(data.x_dict, data.edge_index_dict)

    # Train loss 계산
    train_pred = pred[train_edge_index]
    loss = loss_fn(train_pred, train_labels)
    loss.backward()
    optimizer.step()

    # Validation loss 계산
    model.eval()
    with torch.no_grad():
        val_pred = pred[val_edge_index]
        val_loss = loss_fn(val_pred, val_labels)

    # Accuracy 계산
    train_acc = ((train_pred > 0.5).float() == train_labels).float().mean().item()
    val_acc = ((val_pred > 0.5).float() == val_labels).float().mean().item()

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {loss.item():.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss.item():.4f}, Val Acc: {val_acc:.4f}")


Epoch 1/100, Train Loss: 0.6416, Train Acc: 0.6678, Val Loss: 0.6600, Val Acc: 0.6666
Epoch 2/100, Train Loss: 0.6356, Train Acc: 0.6676, Val Loss: 0.6369, Val Acc: 0.6656
Epoch 3/100, Train Loss: 0.7291, Train Acc: 0.6678, Val Loss: 0.7613, Val Acc: 0.6666
Epoch 4/100, Train Loss: 0.6475, Train Acc: 0.6634, Val Loss: 0.6767, Val Acc: 0.6621
Epoch 5/100, Train Loss: 1.0054, Train Acc: 0.3322, Val Loss: 1.0037, Val Acc: 0.3334
Epoch 6/100, Train Loss: 0.6674, Train Acc: 0.6395, Val Loss: 0.6861, Val Acc: 0.6394
Epoch 7/100, Train Loss: 0.6370, Train Acc: 0.6689, Val Loss: 0.6562, Val Acc: 0.6677
Epoch 8/100, Train Loss: 0.6644, Train Acc: 0.6396, Val Loss: 0.6647, Val Acc: 0.6386
Epoch 9/100, Train Loss: 0.6683, Train Acc: 0.6678, Val Loss: 0.6882, Val Acc: 0.6666
Epoch 10/100, Train Loss: 0.6753, Train Acc: 0.6678, Val Loss: 0.6921, Val Acc: 0.6666
Epoch 11/100, Train Loss: 0.6304, Train Acc: 0.6676, Val Loss: 0.6314, Val Acc: 0.6656
Epoch 12/100, Train Loss: 0.6404, Train Acc: 0.6678,

In [164]:
model.eval()
data = data.to(device)
test_labels = test_labels.to(device)

with torch.no_grad():
    # Forward pass
    pred = model(data.x_dict, data.edge_index_dict)

    # 테스트 정확도 계산
    test_pred = pred[test_edge_index]
    test_acc = ((test_pred > 0.5).float() == test_labels).float().mean().item()

print(f"Test Accuracy: {test_acc:.4f}")


Test Accuracy: 0.6585
