In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
book_df = pd.read_csv('C:/Users/davin/Desktop/Fake Data/books_meta_sample.csv', on_bad_lines='skip', encoding="UTF-8")
book_df.columns = ['book_id', 'name', 'link', 'author']

user_df = pd.read_csv('C:/Users/davin/Desktop/Fake Data/users.csv', on_bad_lines='skip', encoding="UTF-8")
user_df.columns = ['user_id', 'join_date', 'age', 'gender']

rating_df = pd.read_csv('C:/Users/davin/Desktop/Fake Data/interactions.csv', on_bad_lines='skip', encoding="UTF-8")
rating_df.columns = ['user_id', 'book_id', 'rating','timestamp', 'review_length_chars', 'liked']

cbr = pd.merge(rating_df, book_df, on='book_id')
columns = ['link', 'author', 'timestamp', 'review_length_chars', 'liked', 'name']
cbr = cbr.drop(columns, axis=1)
print(cbr.head())

  user_id  book_id  rating
0    U054     1418       4
1    U064      526       5
2    U041      310       4
3    U004     1015       4
4    U028     1015       4


In [4]:
users = list(cbr['user_id'].unique())
books = list(cbr['book_id'].unique())

user2idx = {user: idx for idx, user in enumerate(users)}
book2idx = {book: idx for idx, book in enumerate(books)}

num_users = len(users)
num_books = len(books)

user_encoder = LabelEncoder()
book_encoder = LabelEncoder()

cbr['user_idx'] = user_encoder.fit_transform(cbr['user_id'])
cbr['book_idx'] = book_encoder.fit_transform(cbr['book_id'])

print(cbr.head())

# Chia train/test
train_df, test_df = train_test_split(cbr, test_size=0.2, random_state=42)

# Chuyển sang tensor
train_dataset = TensorDataset(
    torch.tensor(train_df['user_idx'].values, dtype=torch.long),
    torch.tensor(train_df['book_idx'].values, dtype=torch.long),
    torch.tensor(train_df['rating'].values, dtype=torch.float32)
)

test_dataset = TensorDataset(
    torch.tensor(test_df['user_idx'].values, dtype=torch.long),
    torch.tensor(test_df['book_idx'].values, dtype=torch.long),
    torch.tensor(test_df['rating'].values, dtype=torch.float32)
)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

  user_id  book_id  rating  user_idx  book_idx
0    U054     1418       4        53       925
1    U064      526       5        63       345
2    U041      310       4        40       214
3    U004     1015       4         3       667
4    U028     1015       4        27       667


In [5]:
class NCF(nn.Module):
    def __init__(self, num_users, num_books, embedding_dim=32):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.book_embedding = nn.Embedding(num_books, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim*2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, user, book):
        user_emb = self.user_embedding(user)
        book_emb = self.book_embedding(book)
        x = torch.cat([user_emb, book_emb], dim=-1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

num_users = len(users)
num_books = len(books)

model = NCF(num_users, num_books)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

In [6]:
epochs = 20  # bạn có thể tăng lên 50–100 nếu dữ liệu lớn

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for user_batch, book_batch, rating_batch in train_loader:
        optimizer.zero_grad()
        preds = model(user_batch, book_batch).squeeze()
        loss = criterion(preds, rating_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/20, Training Loss: 2.1608
Epoch 2/20, Training Loss: 0.7782
Epoch 3/20, Training Loss: 0.6922
Epoch 4/20, Training Loss: 0.6378
Epoch 5/20, Training Loss: 0.5987
Epoch 6/20, Training Loss: 0.5710
Epoch 7/20, Training Loss: 0.5502
Epoch 8/20, Training Loss: 0.5380
Epoch 9/20, Training Loss: 0.5185
Epoch 10/20, Training Loss: 0.5101
Epoch 11/20, Training Loss: 0.4861
Epoch 12/20, Training Loss: 0.4739
Epoch 13/20, Training Loss: 0.4604
Epoch 14/20, Training Loss: 0.4488
Epoch 15/20, Training Loss: 0.4370
Epoch 16/20, Training Loss: 0.4298
Epoch 17/20, Training Loss: 0.4352
Epoch 18/20, Training Loss: 0.4223
Epoch 19/20, Training Loss: 0.4189
Epoch 20/20, Training Loss: 0.4111


In [7]:
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for user_batch, book_batch, rating_batch in test_loader:
        preds = model(user_batch, book_batch).squeeze()
        all_preds.extend(preds.tolist())
        all_true.extend(rating_batch.tolist())

# Chuyển sang DataFrame để xem
test_results = test_df.copy()
test_results['predicted_rating'] = all_preds
print(test_results.head())

     user_id  book_id  rating  user_idx  book_idx  predicted_rating
6408    U056     1069       5        55       707          5.101834
7678    U084     1429       3        83       933          3.466252
4843    U035     1914       4        34      1234          4.540919
357     U014      151       4        13       105          4.624361
3314    U094     1193       4        93       786          3.314741


In [8]:
import torch
import pandas as pd

def recommend_books_for_user(
    user_id,
    model,
    user2idx,
    book2idx,
    ratings_df,      # DataFrame: user_id, book_id, rating
    top_k=5,
    device="cpu"
):

    # Kiểm tra user có tồn tại không
    if user_id not in user2idx:
        raise ValueError(f"User {user_id} không tồn tại trong dữ liệu!")

    model.eval()
    model.to(device)

    # Lấy index user
    user_idx = user2idx[user_id]

    # Danh sách tất cả sách trong hệ thống
    all_book_ids = list(book2idx.keys())

    # Sách user đã đọc
    already_read = ratings_df[ratings_df["user_id"] == user_id]["book_id"].values
    unread_books = [b for b in all_book_ids if b not in already_read]

    if len(unread_books) == 0:
        return "User này đã đọc hết sách!"

    unread_book_indices = [book2idx[b] for b in unread_books]

    # Dự đoán điểm
    with torch.no_grad():
        user_tensor = torch.tensor([user_idx] * len(unread_book_indices), dtype=torch.long).to(device)
        book_tensor = torch.tensor(unread_book_indices, dtype=torch.long).to(device)

        preds = model(user_tensor, book_tensor).squeeze()

        k = min(top_k, len(unread_books))
        top_k_indices = torch.topk(preds, k).indices.cpu().numpy()

    # Trả về danh sách top K dạng (book_id, score)
    recommendations = []
    for i in top_k_indices:
        book_id = unread_books[i]
        score = round(preds[i].item(), 3)
        recommendations.append((book_id, score))

    return recommendations


In [9]:
top_books = recommend_books_for_user(
    user_id="U056",
    model=model,
    user2idx=user2idx,
    book2idx=book2idx,
    ratings_df=cbr,
    top_k=5
)

for book, score in top_books:
    print(f"Book ID: {book} | Predicted rating: {score}")


Book ID: 1549 | Predicted rating: 5.321
Book ID: 541 | Predicted rating: 5.303
Book ID: 1581 | Predicted rating: 5.17
Book ID: 1568 | Predicted rating: 5.117
Book ID: 360 | Predicted rating: 5.083


In [11]:
import os, datetime, torch

os.makedirs("checkpoints", exist_ok=True)
state_path = f"checkpoints/ncf_state.pth"
full_path = f"checkpoints/ncf_full.pth"

# Save only weights
torch.save({ 'state_dict': model.state_dict() }, state_path)
print("Lưu state_dict tại:", state_path)

# Save full model (architecture + weights).
# This is less portable across code changes.
# torch.save(model, full_path)
# print("Saved full model to:", full_path)


Lưu state_dict tại: checkpoints/ncf_state.pth


In [None]:
num_users = 610          # thay bằng số user thực tế bạn dùng
num_items = 9724         # thay bằng số item thực tế bạn dùng
embedding_dim = 64
hidden_layers = [128, 64, 32]

model_loaded = NCF(num_users, num_items, embedding_dim, hidden_layers)

# --- Bước 2: Load trọng số đã lưu ---
ckpt = torch.load("checkpoints/ncf_state_YYYYMMDD_HHMMSS.pth", map_location="cpu")
model_loaded.load_state_dict(ckpt["state_dict"])
model_loaded.eval()