In [2]:
# Install the Kaggle API token from the working directory into ~/.kaggle.
# `-p` keeps mkdir quiet when the directory already exists, so the cell can be re-run.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: only the owning user may read or write the installed token.
!chmod 600 ~/.kaggle/kaggle.json
# Lists the kaggle.json in the working directory (the copy source), not the installed copy.
!ls -1ha kaggle.json

kaggle.json


In [3]:
# Download the yelp-dataset/yelp-dataset archive with the Kaggle CLI.
# Per the captured output below, this is a ~4.07 GB zip saved as yelp-dataset.zip.
!kaggle datasets download -d yelp-dataset/yelp-dataset

Dataset URL: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset
License(s): other
Downloading yelp-dataset.zip to /shared/home/lexxsh
100%|█████████████████████████████████████▉| 4.07G/4.07G [03:25<00:00, 21.9MB/s]
100%|██████████████████████████████████████| 4.07G/4.07G [03:25<00:00, 21.3MB/s]


In [4]:
# Extract the archive into the working directory.
# `-n` never overwrites files that already exist: a plain `unzip` would stop and
# prompt for each existing file, and a notebook cell cannot answer that prompt,
# so re-running the notebook would stall here.
!unzip -n yelp-dataset.zip

Archive:  yelp-dataset.zip
  inflating: Dataset_User_Agreement.pdf  
  inflating: yelp_academic_dataset_business.json  
  inflating: yelp_academic_dataset_checkin.json  
  inflating: yelp_academic_dataset_review.json  
  inflating: yelp_academic_dataset_tip.json  
  inflating: yelp_academic_dataset_user.json  


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from math import sqrt
from scipy.sparse.linalg import svds
from tqdm import tqdm

# Load the three Yelp tables used below. Each file is newline-delimited JSON
# (one record per line), hence lines=True.
print("Loading data...")
business_data = pd.read_json('yelp_academic_dataset_business.json', lines=True)
review_data = pd.read_json('yelp_academic_dataset_review.json', lines=True)
user_data = pd.read_json('yelp_academic_dataset_user.json', lines=True)

# Preprocessing: trim every table down to the columns used downstream.
print("Preprocessing data...")
# Keep only businesses whose category string contains "Restaurants";
# a missing category counts as "not a restaurant" (na=False).
is_restaurant = business_data['categories'].str.contains('Restaurants', na=False)
business_data = business_data.loc[is_restaurant, ['business_id', 'name', 'categories']]

# Reviews: who rated which business, and with how many stars.
review_data = review_data.loc[:, ['user_id', 'business_id', 'stars']]

# Users: id and display name only.
user_data = user_data.loc[:, ['user_id', 'name']]

# Merge: inner joins, so reviews of non-restaurant businesses and reviews whose
# user is absent from user_data are dropped. Both sides carry a 'name' column,
# which pandas disambiguates with its default _x / _y suffixes.
print("Merging data...")
merged_data = review_data.merge(business_data, on='business_id').merge(user_data, on='user_id')

# Build the dense user x business rating matrix. pivot_table aggregates with the
# mean by default, so repeated reviews of one business by one user are averaged.
# Missing (user, business) pairs become 0, which the rest of the notebook treats
# as "not rated".
# NOTE(review): this materialises every user/business pair densely; on the full
# Yelp dump that is likely to exhaust memory - confirm before running at scale.
print("Creating user-item matrix...")
user_item_matrix = (
    merged_data
    .pivot_table(values='stars', index='user_id', columns='business_id')
    .fillna(0)
)

# Hold out 20% of the matrix rows with a fixed seed.
# NOTE(review): rows are users, so this separates users rather than individual
# ratings - the held-out users contribute nothing to the factorisation below.
print("Splitting data...")
train_data, test_data = train_test_split(
    user_item_matrix,
    test_size=0.2,
    random_state=0,
)

# Factorise the training matrix: centre each user's row, take a rank-50
# truncated SVD, then rebuild the full matrix of predicted ratings.
print("Decomposing matrix...")
R = train_data.values
# Per-user mean over ALL columns, including the 0 placeholders for unrated items.
# NOTE(review): this pulls the mean well below the user's real average rating;
# confirm that is the intended baseline.
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# Truncated Singular Value Decomposition: keep the 50 largest singular triplets.
U, sigma, Vt = svds(R_demeaned, k=50)

# Reconstruct: svds returns the singular values as a 1-D array, so expand them
# into a diagonal matrix, multiply the factors back together and re-add the means.
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
# Carry the user_id labels over from train_data. Without `index=` the frame gets
# a 0..n-1 RangeIndex, and because train_test_split shuffles the rows there would
# be no way to tell which user a predicted row belongs to.
predicted_ratings = pd.DataFrame(
    all_user_predicted_ratings,
    index=train_data.index,
    columns=train_data.columns,
)

# Recommendation function
def recommend_restaurants(predictions, user_id, num_recommendations=5):
    """Return the business ids with the highest predicted rating that the user has not rated.

    Args:
        predictions: DataFrame of predicted ratings with one column per business.
            If its index contains ``user_id`` the row is looked up by label;
            otherwise the row at the user's position in the module-level
            ``user_item_matrix`` is used.
        user_id: Label of the user in ``user_item_matrix``.
        num_recommendations: Maximum number of business ids to return.

    Returns:
        A list of business ids, best predicted rating first.

    Raises:
        KeyError: If ``user_id`` is not in ``user_item_matrix``.
    """
    user_idx = user_item_matrix.index.get_loc(user_id)
    user_ratings = user_item_matrix.iloc[user_idx]

    # Prefer a label lookup: prediction frames built from a shuffled subset of
    # users do not share row positions with user_item_matrix.
    if user_id in predictions.index:
        pred_ratings = predictions.loc[user_id]
    else:
        pred_ratings = predictions.iloc[user_idx]

    # A stored rating of 0 means "not rated"; blank out everything already rated.
    unrated_predictions = pred_ratings.where(user_ratings == 0).dropna()
    # dropna() takes no `ascending` argument - the ranking belongs to sort_values().
    sortable_recommendations = (
        unrated_predictions.sort_values(ascending=False).head(num_recommendations)
    )
    recommendations = list(sortable_recommendations.index)
    return recommendations

# RMSE helper
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the entries where ``ground_truth`` is non-zero.

    Args:
        prediction: Array of predicted ratings, indexable with the non-zero
            positions of ``ground_truth``.
        ground_truth: Array of actual ratings; zero entries are skipped.

    Returns:
        The square root of the mean squared difference over the selected entries.
    """
    # Positions that carry an actual rating.
    rated = ground_truth.nonzero()
    predicted_values = prediction[rated].flatten()
    actual_values = ground_truth[rated].flatten()
    squared_error = mean_squared_error(predicted_values, actual_values)
    return sqrt(squared_error)

# Cross-validation
# Folds are drawn over the observed (non-zero) ratings, not over user rows.
# Scoring a model fitted on one set of users against the ratings of a different
# set of users compares unrelated rows; instead, each fold hides a share of the
# known ratings, refits on the rest, and scores the predictions at exactly the
# hidden (user, business) cells.
print("Performing cross-validation...")
n_folds = 5
# Shuffle so that a fold does not consist of consecutive users' ratings
# (nonzero() returns positions in row-major order); seeded for reproducibility.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
rmse_scores = []
ratings_cv = user_item_matrix.values
rated_rows, rated_cols = ratings_cv.nonzero()
for train_index, test_index in tqdm(kf.split(rated_rows), total=n_folds, desc='KFold Progress'):
    # Held-out cells for this fold and their true ratings.
    held_rows, held_cols = rated_rows[test_index], rated_cols[test_index]
    test_data_cv = ratings_cv[held_rows, held_cols]

    # Training matrix: a copy with the held-out ratings reset to "not rated" (0).
    train_data_cv = ratings_cv.copy()
    train_data_cv[held_rows, held_cols] = 0

    # Centre each user's row (same treatment as the main factorisation).
    user_ratings_mean_cv = np.mean(train_data_cv, axis=1)
    R_demeaned_cv = train_data_cv - user_ratings_mean_cv.reshape(-1, 1)

    # Rank-50 truncated SVD and reconstruction.
    U_cv, sigma_cv, Vt_cv = svds(R_demeaned_cv, k=50)
    sigma_cv = np.diag(sigma_cv)
    all_user_predicted_ratings_cv = np.dot(np.dot(U_cv, sigma_cv), Vt_cv) + user_ratings_mean_cv.reshape(-1, 1)

    # RMSE on the held-out cells only (every entry of test_data_cv is non-zero).
    rmse_score = rmse(all_user_predicted_ratings_cv[held_rows, held_cols], test_data_cv)
    rmse_scores.append(rmse_score)

average_rmse = np.mean(rmse_scores)
print(f"Average RMSE across {n_folds}-folds: {average_rmse}")

# Produce recommendations for one example user: the first user_id in the matrix.
# NOTE(review): predicted_ratings is built from train_data only - confirm this
# user actually ended up in the training split.
example_user_id = user_item_matrix.index[0]
recommended_restaurants = recommend_restaurants(
    predictions=predicted_ratings,
    user_id=example_user_id,
)
print(f"Recommended Restaurants for User {example_user_id}: {recommended_restaurants}")


Loading data...
Preprocessing data...
Merging data...
Creating user-item matrix...


  user_item_matrix = merged_data.pivot_table(index='user_id', columns='business_id', values='stars').fillna(0)
