In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.preprocessing import LabelEncoder
from lightfm import LightFM



In [3]:
# 데이터 로드

orders = pd.read_csv('./data/orders.csv')
order_products = pd.read_csv('./data/order_products__prior.csv')
products = pd.read_csv('./data/products.csv')

In [4]:
# Interaction Matrix 생성

merged = orders.merge(order_products, on='order_id')
df = merged[['user_id', 'product_id']]

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(df['user_id'])
item_ids = item_encoder.fit_transform(df['product_id'])

data = np.ones(len(user_ids))

interaction_matrix = coo_matrix((data, (user_ids, item_ids)))


In [5]:
interaction_matrix.shape

(206209, 49677)

In [6]:
# 검증용 데이터 분리

from lightfm.cross_validation import random_train_test_split

train, test = random_train_test_split(interaction_matrix, test_percentage=0.2)

In [7]:
print(train.shape, test.shape)

(206209, 49677) (206209, 49677)


In [8]:
print(train.getnnz(), test.getnnz(), interaction_matrix.getnnz())

25947591 6486898 32434489


In [None]:
from scipy.sparse import issparse

print(type(train), train.getformat())
print(type(test), test.getformat())

In [9]:
# LightFM 모델 학습

from lightfm.evaluation import auc_score

model = LightFM(
    no_components=50,
    loss='warp',
    learning_rate=0.03,
    item_alpha=1e-6,
    user_alpha=1e-6
)

model.fit(train, epochs=5, num_threads=1)

sample_users = np.random.choice(train.shape[0], size=100, replace=False)
score = auc_score(model, test, user_ids=sample_users, num_threads=1).mean()
print(f"AUC (100 users): {score:.4f}")

: 

In [None]:
from numpy import dot
from numpy.linalg import norm

target_product_name = "Coca Cola Classic"  # 예시

target_product_id = products[products['product_name'] == target_product_name]['product_id'].values[0]
target_encoded_id = item_encoder.transform([target_product_id])[0]

target_vec = model.item_embeddings[target_encoded_id]

all_item_vecs = model.item_embeddings
cos_sim = all_item_vecs @ target_vec / (norm(all_item_vecs, axis=1) * norm(target_vec))

top_sim_items = np.argsort(-cos_sim)[:10]
recommended_product_ids = item_encoder.inverse_transform(top_sim_items)

products[products['product_id'].isin(recommended_product_ids)][['product_id', 'product_name']]
