In [None]:
import pandas as pd

# Load the full interaction table; `df` is the frame the ranking and hit-rate
# cells further down read from.
# NOTE(review): this is an absolute '/content/...' path, while CONFIG['csv_path']
# in the training cell is relative — confirm both point at the same file.
df = pd.read_csv('/content/data_for_train_test.csv')

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np


# 1. CONFIGURATION

# Every tunable setting read by train_hybrid_model() lives in this one dict.
CONFIG = dict(
    csv_path='data_for_train_test.csv',   # interactions CSV read with pd.read_csv
    pt_path='nfm_rec_data.pt',            # torch file whose 'item_emb' entry seeds the embedding
    batch_size=32,                        # batch size for both train and test DataLoaders
    lr=0.002,                             # Adam learning rate
    epochs=20,                            # number of passes over the training split
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    aug_samples=500,                      # synthetic low-rating rows appended before the split
    ranking_weight=0.5,                   # multiplier applied to the ranking loss term
    margin=0.5,                           # hinge margin between positive and negative scores
)

def generate_synthetic_negative_data(original_df, num_fake_samples=500):
    """Append synthetic "bad experience" rows so the model sees low ratings.

    Each synthetic row pairs a randomly chosen existing item with four low
    aspect scores (1 with probability 0.8, 2 with probability 0.2) and a
    rating drawn uniformly from [0.5, 1.5). Rows are flagged ``test = 0``.
    Columns of ``original_df`` that are not set here are left missing in the
    synthetic rows.

    Args:
        original_df: Frame with at least an ``item_id`` column.
        num_fake_samples: Number of synthetic rows to append.

    Returns:
        A new frame: ``original_df`` followed by the synthetic rows, with a
        fresh 0..n-1 index.
    """
    print(f"--- [AUGMENTATION] Tạo {num_fake_samples} mẫu dữ liệu tiêu cực giả lập ---")
    candidate_items = original_df['item_id'].unique()
    aspect_names = (
        'Khong_Gian_Canh_Quan', 'Ha_Tang_Tien_Ich',
        'Dich_Vu_Con_Nguoi', 'Gia_Ca_Chi_Phi',
    )

    def _draw_row():
        # Draw order (item -> aspects -> rating) is kept fixed so a seeded
        # np.random produces the same rows run after run.
        record = {'item_id': np.random.choice(candidate_items)}
        low_scores = np.random.choice([1, 2], size=4, p=[0.8, 0.2])
        record['ratings'] = np.random.uniform(0.5, 1.5)   # low rating
        record.update(zip(aspect_names, low_scores))
        record['test'] = 0
        return record

    synthetic_rows = pd.DataFrame([_draw_row() for _ in range(num_fake_samples)])
    return pd.concat([original_df, synthetic_rows], ignore_index=True)


# 2. DATASET

class HybridDataset(Dataset):
    """Dataset yielding ``(pos_item, aspect, rating, neg_item)`` tuples.

    * ``pos_item``: the row's ``item_id`` as a long tensor.
    * ``aspect``: the four aspect columns, missing values filled with 0 and
      every value divided by 3.0.
    * ``rating``: the row's ``ratings`` value as float32.
    * ``neg_item``: in training mode, a random id from ``all_item_ids`` that
      differs from ``pos_item``; in evaluation mode it is just ``pos_item``
      again (a placeholder so both modes return four fields).

    Args:
        dataframe: Frame with ``item_id``, ``ratings`` and the aspect columns.
        all_item_ids: Pool of item ids negatives are drawn from.
        is_train: When False, no negative sampling is performed.

    Raises:
        ValueError: from ``__getitem__`` in training mode when the only
            candidate id in ``all_item_ids`` is the positive item itself
            (no negative exists).
    """

    def __init__(self, dataframe, all_item_ids, is_train=True):
        self.df = dataframe
        self.all_items = all_item_ids
        self.is_train = is_train

        self.aspect_cols = ['Khong_Gian_Canh_Quan', 'Ha_Tang_Tien_Ich', 'Dich_Vu_Con_Nguoi', 'Gia_Ca_Chi_Phi']

        self.item_ids = torch.tensor(self.df['item_id'].values, dtype=torch.long)
        self.ratings = torch.tensor(self.df['ratings'].values, dtype=torch.float32)

        # Normalize aspects: missing -> 0, then scale by the divisor 3.0.
        aspect_values = self.df[self.aspect_cols].fillna(0).values.astype(np.float32)
        self.aspect_vecs = torch.tensor(aspect_values / 3.0, dtype=torch.float32)

        # Distinct candidate ids, cached once so __getitem__ can cheaply detect
        # the "no possible negative" case instead of looping forever.
        self._distinct_items = (
            None if all_item_ids is None else np.unique(np.asarray(all_item_ids))
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # 1. Positive sample
        pos_item = self.item_ids[idx]
        aspect = self.aspect_vecs[idx]
        rating = self.ratings[idx]

        if not self.is_train:
            # Evaluation needs no negative; repeat the positive as a filler.
            return pos_item, aspect, rating, pos_item

        # 2. Negative sample (random item) for the ranking objective.
        pos_val = pos_item.item()
        distinct = self._distinct_items
        if distinct is not None and distinct.size == 1 and distinct[0] == pos_val:
            # Rejection sampling below could never succeed.
            raise ValueError(
                f"Cannot sample a negative item: item {pos_val} is the only "
                "candidate in all_item_ids."
            )

        neg_item_val = np.random.choice(self.all_items)
        while neg_item_val == pos_val:
            neg_item_val = np.random.choice(self.all_items)

        neg_item = torch.tensor(neg_item_val, dtype=torch.long)

        return pos_item, aspect, rating, neg_item


# 3. MODEL

class ItemAspectRatingModel(nn.Module):
    """MLP that scores an (item, aspect vector) pair as a rating in [0, 5].

    The item id is looked up in a trainable embedding initialised from
    ``pretrained_item_vecs``, concatenated with the aspect vector, passed
    through Linear -> ReLU -> Dropout(0.2) stacks, and squashed with
    ``sigmoid(...) * 5``.

    During training, "item dropout" zeroes the *entire* item embedding of a
    sample with probability ``item_dropout_prob`` (no rescaling of the kept
    embeddings), so the network also learns to predict from aspects alone.

    Args:
        pretrained_item_vecs: 2-D tensor ``(num_items, emb_dim)`` of initial
            item embeddings.
        aspect_dim: Length of the aspect vector.
        hidden_layers: Sizes of the hidden Linear layers, in order.
        item_dropout_prob: Per-sample probability of dropping the item
            embedding while ``self.training`` is True; 0 disables it.
    """

    # hidden_layers defaults to a tuple so the default object cannot be
    # mutated and shared between instances.
    def __init__(self, pretrained_item_vecs, aspect_dim=4, hidden_layers=(64, 32), item_dropout_prob=0.3):
        super().__init__()

        self.item_embedding = nn.Embedding.from_pretrained(pretrained_item_vecs, freeze=False)
        item_emb_dim = pretrained_item_vecs.shape[1]
        self.item_dropout_prob = item_dropout_prob

        # MLP
        layers = []
        in_dim = item_emb_dim + aspect_dim
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            in_dim = hidden_dim

        self.mlp = nn.Sequential(*layers)
        self.output_layer = nn.Linear(in_dim, 1)

    def forward(self, item_idx, aspect_vec):
        """Return predicted ratings of shape ``(batch, 1)``.

        Args:
            item_idx: Long tensor ``(batch,)`` of item ids.
            aspect_vec: Float tensor ``(batch, aspect_dim)``.
        """
        i_vec = self.item_embedding(item_idx)

        # Item dropout: one keep/drop decision per sample, broadcast over the
        # embedding dimension. The mask is created directly on the embedding's
        # device to avoid a host-to-device copy on every forward pass.
        if self.training and self.item_dropout_prob > 0:
            keep_prob = torch.full(
                (i_vec.shape[0], 1), 1 - self.item_dropout_prob, device=i_vec.device
            )
            i_vec = i_vec * torch.bernoulli(keep_prob)

        combined = torch.cat([i_vec, aspect_vec], dim=1)
        return torch.sigmoid(self.output_layer(self.mlp(combined))) * 5.0


# 4. TRAINING LOOP

def train_hybrid_model():
    """Train ItemAspectRatingModel with an MSE + pairwise ranking objective.

    All settings come from the module-level ``CONFIG`` dict. Steps:

    1. Read the CSV, append synthetic low-rating rows, and split on the
       ``test`` column (0 = train, 1 = test).
    2. Load pretrained item embeddings from ``CONFIG['pt_path']``
       (key ``'item_emb'``). If that fails, the error is printed and the
       function returns ``None`` without training.
    3. Each epoch, minimise ``MSE(pred_pos, rating) + ranking_weight * hinge``,
       where the hinge term ``max(0, margin - (pred_pos - pred_neg))`` is only
       counted for samples whose rating is >= 4.0.
    4. After each epoch compute the test RMSE and save the state dict to
       ``'best_hybrid_model.pth'`` whenever it improves.

    Raises:
        ValueError: if the CSV has no ``test == 1`` rows, or if an ``item_id``
            falls outside the embedding table.
    """
    print(f"--- Bắt đầu quy trình train trên {CONFIG['device']} ---")
    device = CONFIG['device']

    # 1. Load data
    df_raw = pd.read_csv(CONFIG['csv_path'])
    all_items = df_raw['item_id'].unique()

    # Augmentation
    df_full = generate_synthetic_negative_data(df_raw, num_fake_samples=CONFIG['aug_samples'])

    # Split train/test
    train_df = df_full[df_full['test'] == 0].reset_index(drop=True)
    test_df = df_full[df_full['test'] == 1].reset_index(drop=True)
    if test_df.empty:
        # Fail before training instead of dividing by zero after a full epoch.
        raise ValueError("No rows with test == 1 found; cannot compute the test RMSE.")

    train_dataset = HybridDataset(train_df, all_items, is_train=True)
    test_dataset = HybridDataset(test_df, all_items, is_train=False)

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

    # 2. Load vectors. map_location='cpu' lets a file saved on a GPU machine
    # load on a CPU-only one; the model is moved to the target device below.
    try:
        pt_data = torch.load(CONFIG['pt_path'], map_location='cpu')
        item_vecs = pt_data['item_emb']
        print(f"Đã load Item Embeddings: {item_vecs.shape}")
    except Exception as e:
        print(f"Lỗi load file PT: {e}")
        return

    # Item ids index the embedding table directly, so they must be in range.
    num_embeddings = item_vecs.shape[0]
    if df_full['item_id'].min() < 0 or df_full['item_id'].max() >= num_embeddings:
        raise ValueError(
            f"item_id values must lie in [0, {num_embeddings - 1}] to index the "
            f"embedding table, got range [{df_full['item_id'].min()}, {df_full['item_id'].max()}]."
        )

    # 3. Model & optimizer
    model = ItemAspectRatingModel(pretrained_item_vecs=item_vecs, item_dropout_prob=0.3).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['lr'])
    criterion_mse = nn.MSELoss()

    # 4. Loop
    best_rmse = float('inf')

    for epoch in range(CONFIG['epochs']):
        model.train()
        total_loss = 0
        mse_part = 0
        rank_part = 0

        for pos_item, aspect, rating, neg_item in train_loader:
            pos_item, aspect = pos_item.to(device), aspect.to(device)
            rating = rating.to(device)
            neg_item = neg_item.to(device)

            optimizer.zero_grad()

            # --- Forward pass ---
            # squeeze(-1) drops only the trailing size-1 dim, so a batch of
            # one sample stays 1-D and matches the shape of `rating`.
            pred_pos = model(pos_item, aspect).squeeze(-1)
            pred_neg = model(neg_item, aspect).squeeze(-1)

            # --- Losses ---
            # 1. Rating regression
            loss_mse = criterion_mse(pred_pos, rating)

            # 2. Ranking loss: only well-rated interactions (>= 4.0) are asked
            # to outscore the randomly sampled negative item by `margin`.
            high_rating_mask = (rating >= 4.0).float()
            rank_loss_raw = torch.clamp(CONFIG['margin'] - (pred_pos - pred_neg), min=0)
            loss_rank = (rank_loss_raw * high_rating_mask).mean()

            # 3. Combined loss
            loss_total = loss_mse + CONFIG['ranking_weight'] * loss_rank

            loss_total.backward()
            optimizer.step()

            total_loss += loss_total.item()
            mse_part += loss_mse.item()
            rank_part += loss_rank.item()

        # --- Eval ---
        rmse = _evaluate_rmse(model, test_loader, device)

        print(f"Epoch {epoch+1:02d} | Loss Total: {total_loss/len(train_loader):.4f} "
              f"(MSE: {mse_part/len(train_loader):.4f}, Rank: {rank_part/len(train_loader):.4f}) "
              f"| Test RMSE: {rmse:.4f}")

        if rmse < best_rmse:
            best_rmse = rmse
            torch.save(model.state_dict(), 'best_hybrid_model.pth')

    print(f"\nTraining xong! Best RMSE: {best_rmse:.4f}")
    print("Model đã lưu tại: 'best_hybrid_model.pth'")


def _evaluate_rmse(model, data_loader, device):
    """Return the RMSE of ``model`` over every sample in ``data_loader``.

    Squared errors are summed over all samples and divided by the sample
    count, so a smaller final batch does not get extra weight (averaging
    per-batch MSE values would). Returns NaN when the loader yields nothing.
    Leaves the model in eval mode.
    """
    model.eval()
    squared_error_sum = 0.0
    n_samples = 0
    with torch.no_grad():
        for pos_item, aspect, rating, _ in data_loader:
            pos_item, aspect, rating = pos_item.to(device), aspect.to(device), rating.to(device)
            pred = model(pos_item, aspect).squeeze(-1)
            squared_error_sum += torch.sum((pred - rating) ** 2).item()
            n_samples += rating.numel()

    if n_samples == 0:
        return float('nan')
    return float(np.sqrt(squared_error_sum / n_samples))

# Kick off training when this cell (or an exported script) is executed directly;
# the captured output below this cell shows the guard is true inside the notebook.
if __name__ == "__main__":
    train_hybrid_model()

--- Bắt đầu quy trình train trên cpu ---
--- [AUGMENTATION] Tạo 500 mẫu dữ liệu tiêu cực giả lập ---
Đã load Item Embeddings: torch.Size([411, 8])
Epoch 01 | Loss Total: 2.8334 (MSE: 2.6409, Rank: 0.3850) | Test RMSE: 0.9660
Epoch 02 | Loss Total: 2.1283 (MSE: 1.9403, Rank: 0.3760) | Test RMSE: 1.0636
Epoch 03 | Loss Total: 1.8127 (MSE: 1.6245, Rank: 0.3762) | Test RMSE: 1.0684
Epoch 04 | Loss Total: 1.6549 (MSE: 1.4709, Rank: 0.3681) | Test RMSE: 1.0329
Epoch 05 | Loss Total: 1.4237 (MSE: 1.2362, Rank: 0.3751) | Test RMSE: 1.0085
Epoch 06 | Loss Total: 1.2229 (MSE: 1.0338, Rank: 0.3783) | Test RMSE: 0.9446
Epoch 07 | Loss Total: 1.0465 (MSE: 0.8558, Rank: 0.3813) | Test RMSE: 0.9426
Epoch 08 | Loss Total: 0.9467 (MSE: 0.7489, Rank: 0.3956) | Test RMSE: 0.9188
Epoch 09 | Loss Total: 0.8864 (MSE: 0.6982, Rank: 0.3765) | Test RMSE: 0.8828
Epoch 10 | Loss Total: 0.8438 (MSE: 0.6533, Rank: 0.3811) | Test RMSE: 0.8780
Epoch 11 | Loss Total: 0.8008 (MSE: 0.6141, Rank: 0.3733) | Test RMSE: 0.

In [None]:
class ItemAspectRatingModel(nn.Module):
    """MLP that scores an (item, aspect vector) pair as a rating in [0, 5].

    NOTE: this class is also defined in the training cell earlier in the
    notebook. The attribute names (``item_embedding``, ``mlp``,
    ``output_layer``) must match there, otherwise a saved state dict will not
    load. Prefer keeping a single definition.

    The item id is looked up in a trainable embedding initialised from
    ``pretrained_item_vecs``, concatenated with the aspect vector, passed
    through Linear -> ReLU -> Dropout(0.2) stacks, and squashed with
    ``sigmoid(...) * 5``.

    During training, "item dropout" zeroes the *entire* item embedding of a
    sample with probability ``item_dropout_prob`` (no rescaling of the kept
    embeddings), so the network also learns to predict from aspects alone.

    Args:
        pretrained_item_vecs: 2-D tensor ``(num_items, emb_dim)`` of initial
            item embeddings.
        aspect_dim: Length of the aspect vector.
        hidden_layers: Sizes of the hidden Linear layers, in order.
        item_dropout_prob: Per-sample probability of dropping the item
            embedding while ``self.training`` is True; 0 disables it.
    """

    # hidden_layers defaults to a tuple so the default object cannot be
    # mutated and shared between instances.
    def __init__(self, pretrained_item_vecs, aspect_dim=4, hidden_layers=(64, 32), item_dropout_prob=0.3):
        super().__init__()

        self.item_embedding = nn.Embedding.from_pretrained(pretrained_item_vecs, freeze=False)
        item_emb_dim = pretrained_item_vecs.shape[1]
        self.item_dropout_prob = item_dropout_prob

        # MLP
        layers = []
        in_dim = item_emb_dim + aspect_dim
        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            in_dim = hidden_dim

        self.mlp = nn.Sequential(*layers)
        self.output_layer = nn.Linear(in_dim, 1)

    def forward(self, item_idx, aspect_vec):
        """Return predicted ratings of shape ``(batch, 1)``.

        Args:
            item_idx: Long tensor ``(batch,)`` of item ids.
            aspect_vec: Float tensor ``(batch, aspect_dim)``.
        """
        i_vec = self.item_embedding(item_idx)

        # Item dropout: one keep/drop decision per sample, broadcast over the
        # embedding dimension. The mask is created directly on the embedding's
        # device to avoid a host-to-device copy on every forward pass.
        if self.training and self.item_dropout_prob > 0:
            keep_prob = torch.full(
                (i_vec.shape[0], 1), 1 - self.item_dropout_prob, device=i_vec.device
            )
            i_vec = i_vec * torch.bernoulli(keep_prob)

        combined = torch.cat([i_vec, aspect_vec], dim=1)
        return torch.sigmoid(self.output_layer(self.mlp(combined))) * 5.0

In [None]:
import json
import os
def load_model_inference(model_path, pt_file_path):
    """Rebuild the trained model for inference.

    The original ``.pt`` embedding file is needed only to recover the shape
    of the item-embedding table; the trained weights in ``model_path`` then
    overwrite those values.

    Args:
        model_path: Path to the saved state dict (``*.pth``). An optional
            sidecar ``<name>_config.json`` next to it may provide
            ``hidden_layers`` and ``aspect_dim``; defaults are used when it
            is absent.
        pt_file_path: Path to the torch file containing ``'item_emb'``.

    Returns:
        ``(model, device)``: the model in eval mode on ``device`` (CUDA when
        available, otherwise CPU).

    Raises:
        FileNotFoundError: if the embedding file cannot be loaded or lacks
            the ``'item_emb'`` entry (the original error is chained).
    """
    # 1. Load config. Only a trailing '.pth' extension is swapped out;
    # str.replace would also rewrite '.pth' inside directory names and would
    # return the model path itself when there is no '.pth' at all.
    root, ext = os.path.splitext(model_path)
    config_path = (root if ext == '.pth' else model_path) + '_config.json'
    if not os.path.exists(config_path):
        print("⚠️ Không tìm thấy file config, sẽ dùng config mặc định.")
        config = {'hidden_layers': [64, 32], 'aspect_dim': 4}
    else:
        with open(config_path, 'r') as f:
            config = json.load(f)

    # 2. Load the original vectors. map_location='cpu' keeps this working
    # when the file was saved from a GPU session and no GPU is present now.
    try:
        pt_data = torch.load(pt_file_path, map_location='cpu')
        item_vecs = pt_data['item_emb']
    except Exception as e:
        raise FileNotFoundError(f"❌ Cần file {pt_file_path} để khởi tạo model! Lỗi: {e}") from e

    # 3. Build the architecture (item dropout disabled for inference).
    model = ItemAspectRatingModel(
        pretrained_item_vecs=item_vecs,
        aspect_dim=config.get('aspect_dim', 4),
        hidden_layers=config.get('hidden_layers', [64, 32]),
        item_dropout_prob=0
    )

    # 4. Load the trained weights.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    print("✅ Load model thành công! Sẵn sàng dự đoán.")
    return model, device

In [None]:
# load_model_inference returns a (model, device) pair; unpack both so `model`
# is the nn.Module itself rather than the tuple.
model, device = load_model_inference('/content/best_hybrid_model.pth', '/content/nfm_rec_data.pt')

⚠️ Không tìm thấy file config, sẽ dùng config mặc định.
✅ Load model thành công! Sẵn sàng dự đoán.


In [None]:

# PREDICTION DEMO

model_path = 'best_hybrid_model.pth'
pt_path = 'nfm_rec_data.pt'

# 1. Load the trained model. On failure, show the message and re-raise: this
# stops the cell (and Run All) without calling exit(), which would tear down
# the notebook kernel.
try:
    model, device = load_model_inference(model_path, pt_path)
except Exception as e:
    print(e)
    raise

# 2. Rating prediction helper
def predict_single_case(model, item_id, aspects):
    """Predict the rating for one (item, aspect scores) pair.

    Args:
        model: Module called as ``model(item_tensor, aspect_tensor)``.
        item_id: Integer item id.
        aspects: Sequence of raw aspect scores; they are divided by 3.0
            before being fed to the model, the same scaling the training
            dataset applies.

    Returns:
        The model output as a Python float.
    """
    # Use the device the model lives on rather than a notebook-global
    # `device`, so the function works no matter which cells ran before it.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    item_tensor = torch.tensor([item_id], dtype=torch.long, device=target_device)
    aspect_tensor = torch.tensor([aspects], dtype=torch.float32, device=target_device) / 3.0

    with torch.no_grad():
        prediction = model(item_tensor, aspect_tensor)

    return prediction.item()

# 3. Try a few cases
print("\n--- KẾT QUẢ DỰ ĐOÁN ---")

item_id = 121
good_aspects = [3, 3, 3, 3]
rating = predict_single_case(model, item_id, good_aspects)
print(f"Item {item_id} | Review Tốt {good_aspects} -> Rating: {rating:.2f}/5.0")

bad_aspects = [0, 0, 0, 1]
rating_bad = predict_single_case(model, item_id, bad_aspects)
print(f"Item {item_id} | Review Tệ  {bad_aspects} -> Rating: {rating_bad:.2f}/5.0")

item_id_2 = 10
# Name the mid-range aspects so the printed label shows the values that were
# actually scored (previously it printed good_aspects by mistake).
mid_aspects = [2, 2, 2, 2]
rating_2 = predict_single_case(model, item_id_2, mid_aspects)
print(f"Item {item_id_2}  | Review Vừa {mid_aspects} -> Rating: {rating_2:.2f}/5.0")

⚠️ Không tìm thấy file config, sẽ dùng config mặc định.
✅ Load model thành công! Sẵn sàng dự đoán.

--- KẾT QUẢ DỰ ĐOÁN ---
Item 121 | Review Tốt [3, 3, 3, 3] -> Rating: 4.28/5.0
Item 121 | Review Tệ  [0, 0, 0, 1] -> Rating: 3.12/5.0
Item 10  | Review Vừa [3, 3, 3, 3] -> Rating: 2.48/5.0


In [None]:
def ranking(df, user_id, topk=10):
    """Return the ``topk`` highest-scored items for a user.

    One of the user's rows in ``df`` is picked at random and its four aspect
    scores are used as the user's preference profile. Every distinct
    ``item_id`` in ``df`` is then scored with ``predict_single_case`` using
    the notebook-global ``model``.

    Args:
        df: Frame with ``user_id``, ``item_id`` and the four aspect columns.
        user_id: The user to recommend for.
        topk: Number of (item, score) pairs to return.

    Returns:
        List of ``(item_id, predicted_rating)`` tuples, best first.

    Raises:
        ValueError: if ``user_id`` has no rows in ``df``.
    """
    aspect_cols = ['Khong_Gian_Canh_Quan', 'Ha_Tang_Tien_Ich', 'Dich_Vu_Con_Nguoi', 'Gia_Ca_Chi_Phi']

    user_rows = df[df['user_id'] == user_id]
    if user_rows.empty:
        raise ValueError(f"user_id {user_id!r} has no rows in df; cannot build an aspect profile.")

    profile_row = user_rows.sample(n=1)
    # Missing aspects become 0, matching how the training dataset fills them;
    # a NaN here would make every predicted score NaN and the sort meaningless.
    aspect_profile = profile_row[aspect_cols].fillna(0).values.tolist()[0]

    scored = [
        (candidate, predict_single_case(model, candidate, aspect_profile))
        for candidate in df['item_id'].unique()
    ]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:topk]

In [None]:
# Top recommendations for user_id 10 over all items in `df` (topk left at its default of 10).
ranking(df, 10)

[(np.int64(240), 4.954970359802246),
 (np.int64(8), 4.9522247314453125),
 (np.int64(117), 4.951949119567871),
 (np.int64(380), 4.950490474700928),
 (np.int64(237), 4.94845724105835),
 (np.int64(398), 4.9460129737854),
 (np.int64(234), 4.945907115936279),
 (np.int64(388), 4.943285942077637),
 (np.int64(162), 4.936796188354492),
 (np.int64(251), 4.936674118041992)]

In [None]:
# Hit rate @10 on the test split: the share of test users whose held-out item
# shows up in their top-10 recommendations (candidates = items in the test split).
test = df[df['test'] == 1]
test_users = test['user_id'].unique()

total = 0
for uid in test_users:
    # Ground truth is the user's first test row. Note that ranking() builds
    # its aspect profile from a randomly sampled row of the same user.
    held_out_item = test.loc[test['user_id'] == uid, 'item_id'].iloc[0]
    recommended = [candidate for candidate, _score in ranking(test, uid, 10)]
    if any(candidate == held_out_item for candidate in recommended):
        total += 1
print(total/len(test_users))

0.03015873015873016
