In [12]:
# NeuMF training + leave-one-out evaluation (sample 99 negatives per positive)
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import random
import os

# -------------------------
# Paths
# -------------------------
train_path = './output_neumf/neumf_train.csv'
test_path  = './output_neumf/neumf_test.csv'

train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

# Keep only necessary cols and ensure types
train_df = train_df[['user','item','label']].copy()
test_df  = test_df[['user','item','label']].copy()

# Ensure positive-only test for holdout evaluation
test_pos_df = test_df[test_df['label'] == 1].copy()
if test_pos_df.empty:
    raise RuntimeError("Kh√¥ng t√¨m th·∫•y sample positive trong test_df (label==1). H√£y ƒë·∫£m b·∫£o test ch·ª©a positives holdout.")

# -------------------------
# Encode users/items
# -------------------------
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Fit encoders on union of train+test to avoid unseen labels at inference
all_users = pd.concat([train_df['user'], test_pos_df['user']]).astype(str)
all_items = pd.concat([train_df['item'], test_pos_df['item']]).astype(str)
user_encoder.fit(all_users)
item_encoder.fit(all_items)

train_df['user'] = user_encoder.transform(train_df['user'].astype(str))
train_df['item'] = item_encoder.transform(train_df['item'].astype(str))
test_pos_df['user'] = user_encoder.transform(test_pos_df['user'].astype(str))
test_pos_df['item'] = item_encoder.transform(test_pos_df['item'].astype(str))

n_users = len(user_encoder.classes_)
n_items = len(item_encoder.classes_)

print(f"Users: {n_users}, Items: {n_items}, Train rows: {len(train_df)}, Test positives: {len(test_pos_df)}")

# -------------------------
# Build train interaction set (to avoid sampling seen items as negatives)
# -------------------------
train_user_pos = train_df[train_df['label']==1].groupby('user')['item'].apply(set).to_dict()
# ensure users with no positives have empty set
for u in range(n_users):
    if u not in train_user_pos:
        train_user_pos[u] = set()

# -------------------------
# Dataset & DataLoader (pointwise)
# -------------------------
class RatingDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user'].values, dtype=torch.long)
        self.items = torch.tensor(df['item'].values, dtype=torch.long)
        self.labels = torch.tensor(df['label'].values, dtype=torch.float32)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

train_loader = DataLoader(RatingDataset(train_df), batch_size=1024, shuffle=True, num_workers=0)

# -------------------------
# NeuMF model (slightly larger)
# -------------------------
class NeuMF(nn.Module):
    def __init__(self, n_users, n_items, mf_dim=32, mlp_layers=[128,64,32,16], dropout=0.0):
        super().__init__()
        self.user_emb_gmf = nn.Embedding(n_users, mf_dim)
        self.item_emb_gmf = nn.Embedding(n_items, mf_dim)
        self.user_emb_mlp = nn.Embedding(n_users, mlp_layers[0]//2)
        self.item_emb_mlp = nn.Embedding(n_items, mlp_layers[0]//2)
        mlp = []
        input_dim = mlp_layers[0]
        for dim in mlp_layers[1:]:
            mlp.append(nn.Linear(input_dim, dim))
            mlp.append(nn.ReLU())
            if dropout>0:
                mlp.append(nn.Dropout(dropout))
            input_dim = dim
        self.mlp = nn.Sequential(*mlp)
        self.fc = nn.Linear(mf_dim + mlp_layers[-1], 1)
        self.sigmoid = nn.Sigmoid()
        # init
        nn.init.normal_(self.user_emb_gmf.weight, std=0.01)
        nn.init.normal_(self.item_emb_gmf.weight, std=0.01)
        nn.init.normal_(self.user_emb_mlp.weight, std=0.01)
        nn.init.normal_(self.item_emb_mlp.weight, std=0.01)

    def forward(self, user, item):
        gmf = self.user_emb_gmf(user) * self.item_emb_gmf(item)
        mlp_in = torch.cat([self.user_emb_mlp(user), self.item_emb_mlp(item)], dim=-1)
        mlp_out = self.mlp(mlp_in)
        concat = torch.cat([gmf, mlp_out], dim=-1)
        out = self.fc(concat)
        return self.sigmoid(out).squeeze()

# -------------------------
# Training setup
# -------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NeuMF(n_users, n_items, mf_dim=32, mlp_layers=[128,64,32,16], dropout=0.0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
criterion = nn.BCELoss()

# -------------------------
# Train
# -------------------------
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for users, items, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        users = users.to(device); items = items.to(device); labels = labels.to(device)
        preds = model(users, items)
        loss = criterion(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * users.size(0)
    avg_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}: avg_loss={avg_loss:.6f}")

# -------------------------
# Evaluation: leave-one-out with sampled negatives
# For each positive (user, pos_item) in test_pos_df:
#   sample N_NEG negatives from items NOT in user's train positives AND NOT equal pos_item
#   rank pos_item among negatives by predicted score
# -------------------------
def precision_at_k_single(pos_index, ranked_list, k=10):
    return 1.0 if pos_index < k else 0.0

def average_precision_single(pos_index, ranked_list, k=10):
    # In leave-one-out with single positive, AP is 1/(rank) if rank<=k else 0
    if pos_index < k:
        return 1.0 / (pos_index + 1.0)
    return 0.0

def evaluate_leave_one_out(model, test_pos_df, train_user_pos, n_items, n_negatives=99, k=10, seed=42):
    rng = random.Random(seed)
    model.eval()
    precisions = []
    aps = []
    users = test_pos_df['user'].values
    items = test_pos_df['item'].values
    with torch.no_grad():
        for u, pos_item in tqdm(zip(users, items), total=len(users), desc="Eval LOO"):
            # sample negatives
            neg_pool = set(range(n_items)) - train_user_pos.get(u, set()) - {pos_item}
            # if pool smaller than required, take all
            neg_list = rng.sample(list(neg_pool), k=min(n_negatives, len(neg_pool)))
            candidate_items = neg_list + [pos_item]
            # compute scores
            user_tensor = torch.tensor([u]*len(candidate_items), dtype=torch.long).to(device)
            item_tensor = torch.tensor(candidate_items, dtype=torch.long).to(device)
            scores = model(user_tensor, item_tensor).cpu().numpy()
            # rank descending
            ranked_idx = np.argsort(-scores)
            # find rank (0-based) of pos_item
            pos_rank = int(np.where(np.array(candidate_items)[ranked_idx] == pos_item)[0][0])
            precisions.append(precision_at_k_single(pos_rank, ranked_idx, k))
            aps.append(average_precision_single(pos_rank, ranked_idx, k))
    return np.mean(precisions), np.mean(aps)

precision10, map10 = evaluate_leave_one_out(model, test_pos_df, train_user_pos, n_items, n_negatives=99, k=10)
print("===================================================")
print(f"Precision@10: {precision10:.4f}")
print(f"MAP@10:       {map10:.4f}")
print("===================================================")


Users: 42, Items: 139, Train rows: 3373, Test positives: 217


Epoch 1/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 91.53it/s]


Epoch 1: avg_loss=0.714706


Epoch 2/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 82.33it/s]


Epoch 2: avg_loss=0.709957


Epoch 3/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 62.35it/s]


Epoch 3: avg_loss=0.706248


Epoch 4/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 77.27it/s]


Epoch 4: avg_loss=0.702621


Epoch 5/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 73.39it/s]


Epoch 5: avg_loss=0.698361


Epoch 6/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 83.69it/s]


Epoch 6: avg_loss=0.693128


Epoch 7/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 83.10it/s]


Epoch 7: avg_loss=0.686644


Epoch 8/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 85.16it/s]


Epoch 8: avg_loss=0.678146


Epoch 9/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 112.60it/s]


Epoch 9: avg_loss=0.666274


Epoch 10/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 83.53it/s]


Epoch 10: avg_loss=0.650796


Eval LOO: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 217/217 [00:00<00:00, 3189.63it/s]

Precision@10: 0.3548
MAP@10:       0.1263





In [13]:
# =====================================================
# LNCM (Latent Neural Context Model)
# =====================================================
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# =====================================================
# 1Ô∏è‚É£ LOAD & CHU·∫®N B·ªä D·ªÆ LI·ªÜU
# =====================================================
# ƒê·ªçc file d·ªØ li·ªáu g·ªìm user, item, label (rating) v√† context_id
df = pd.read_csv('output_carskit_clean/ratings_with_context_id.csv')

# üî∏ M·ª•c ƒë√≠ch: √°nh x·∫° user/item/context sang ID li√™n t·ª•c (0,1,2,...)
#  -> gi√∫p embedding layer c√≥ th·ªÉ x·ª≠ l√Ω ƒë∆∞·ª£c
user2id = {u: i for i, u in enumerate(df['user'].unique())}
item2id = {i: j for j, i in enumerate(df['item'].unique())}
context2id = {c: k for k, c in enumerate(df['context_id'].unique())}

# Thay th·∫ø c√°c gi√° tr·ªã g·ªëc b·∫±ng ID
df['user'] = df['user'].map(user2id)
df['item'] = df['item'].map(item2id)
df['context'] = df['context_id'].map(context2id)

# Th·ªëng k√™ s·ªë l∆∞·ª£ng ph·∫ßn t·ª≠
n_users = len(user2id)
n_items = len(item2id)
n_contexts = len(context2id)
print(f"Users: {n_users}, Items: {n_items}, Contexts: {n_contexts}")

# üîπ Chia d·ªØ li·ªáu th√†nh t·∫≠p hu·∫•n luy·ªán v√† ki·ªÉm th·ª≠
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# =====================================================
# 1.1Ô∏è‚É£ ƒê·ªäNH NGHƒ®A Dataset CHO PYTORCH
# =====================================================
class RatingDataset(torch.utils.data.Dataset):
    """Dataset c∆° b·∫£n cho b√†i to√°n g·ª£i √Ω c√≥ ng·ªØ c·∫£nh"""
    def __init__(self, df):
        # √âp ki·ªÉu v·ªÅ tensor ƒë·ªÉ ti·ªán cho GPU
        self.users = torch.tensor(df['user'].values, dtype=torch.long)
        self.items = torch.tensor(df['item'].values, dtype=torch.long)
        self.contexts = torch.tensor(df['context'].values, dtype=torch.long)
        self.labels = torch.tensor(df['label'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.contexts[idx], self.labels[idx]

# T·∫°o DataLoader ƒë·ªÉ hu·∫•n luy·ªán theo batch
train_data = RatingDataset(train_df)
test_data = RatingDataset(test_df)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=256, shuffle=False)

# =====================================================
# 2Ô∏è‚É£ M√î H√åNH: LNCM (Latent Neural Context Model)
# =====================================================
class LNCM_Fixed(nn.Module):
    """
    üîπ √ù t∆∞·ªüng:
        - LNCM kh√¥ng d√πng embedding context tr·ª±c ti·∫øp.
        - Thay v√†o ƒë√≥, m√¥ h√¨nh "t·ª± sinh" vector ng·ªØ c·∫£nh ti·ªÅm ·∫©n (latent context)
          t·ª´ c·∫∑p (user, item) th√¥ng qua m·ªôt encoder ki·ªÉu VAE (Variational Autoencoder).
    üîπ M·ª•c ti√™u:
        - H·ªçc ƒë∆∞·ª£c t√°c ƒë·ªông ng·∫ßm c·ªßa ng·ªØ c·∫£nh ngay c·∫£ khi context kh√¥ng r√µ r√†ng.
    """

    def __init__(self, n_users, n_items, n_contexts, emb_dim=32, latent_dim=16):
        super().__init__()

        # 1Ô∏è‚É£ Embedding cho user v√† item
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)

        # 2Ô∏è‚É£ Encoder sinh ng·ªØ c·∫£nh ti·ªÅm ·∫©n (latent context)
        #    Input: [user_emb, item_emb]
        #    Output: [mu, logvar] d√πng cho VAE reparameterization trick
        self.context_encoder = nn.Sequential(
            nn.Linear(emb_dim * 2, latent_dim),
            nn.ReLU(),
            nn.Linear(latent_dim, emb_dim * 2)  # output g·ªìm 2 ph·∫ßn: mu & logvar
        )

        # 3Ô∏è‚É£ M·∫°ng fully-connected d·ª± ƒëo√°n x√°c su·∫•t user th√≠ch item
        self.fc = nn.Sequential(
            nn.Linear(emb_dim * 3, 64),  # user + item + latent context
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

        # 4Ô∏è‚É£ Kh·ªüi t·∫°o tr·ªçng s·ªë embedding
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)

    # === H√†m t√°i tham s·ªë h√≥a trong VAE ===
    def reparameterize(self, mu, logvar):
        """Chuy·ªÉn (mu, logvar) ‚Üí latent vector theo c√¥ng th·ª©c: z = mu + eps * sigma"""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, user, item, context=None):
        """Lan truy·ªÅn ti·∫øn (forward pass)"""
        # Embedding user v√† item
        u = self.user_emb(user)
        i = self.item_emb(item)

        # Encoder sinh (mu, logvar) ‚Üí latent context
        context_params = self.context_encoder(torch.cat([u, i], dim=1))
        mu, logvar = torch.chunk(context_params, 2, dim=1)
        c_latent = self.reparameterize(mu, logvar)

        # Gh√©p t·∫•t c·∫£ embedding l·∫°i ƒë·ªÉ d·ª± ƒëo√°n
        x = torch.cat([u, i, c_latent], dim=1)
        return self.fc(x).squeeze()

# =====================================================
# 3Ô∏è‚É£ V√íNG HU·∫§N LUY·ªÜN M√î H√åNH
# =====================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_lncm = LNCM_Fixed(n_users, n_items, n_contexts).to(device)
optimizer = optim.Adam(model_lncm.parameters(), lr=0.001)
loss_fn = nn.BCELoss()  # Binary Cross-Entropy cho nh√£n 0/1

print("\n=== Training LNCM_Fixed ===")
for epoch in range(10):
    model_lncm.train()
    total_loss = 0

    # Duy·ªát t·ª´ng batch d·ªØ li·ªáu
    for users, items, contexts, labels in tqdm(train_loader, desc=f"[LNCM] Epoch {epoch+1}"):
        users, items, labels = users.to(device), items.to(device), labels.to(device)

        preds = model_lncm(users, items)      # D·ª± ƒëo√°n
        loss = loss_fn(preds, labels)         # T√≠nh loss

        optimizer.zero_grad()
        loss.backward()                       # Lan truy·ªÅn ng∆∞·ª£c
        optimizer.step()                       # C·∫≠p nh·∫≠t tr·ªçng s·ªë

        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_loader):.4f}")

# =====================================================
# 4Ô∏è‚É£ ƒê√ÅNH GI√Å M√î H√åNH
# =====================================================
def evaluate_ranking_metrics(model, test_loader, device, K=10):
    """
    ƒê√°nh gi√° m√¥ h√¨nh theo:
    - Basic metrics: AUC, RMSE, MAE
    - Ranking metrics: Precision@10, Recall@10, Hit@10, MAP@10
    """
    model.eval()
    user_preds = {}
    all_preds, all_labels = [], []

    with torch.no_grad():
        for users, items, contexts, labels in test_loader:
            users, items, contexts = users.to(device), items.to(device), contexts.to(device)
            preds = model(users, items, contexts).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.numpy())

            for u, i, p, l in zip(users.cpu().numpy(), items.cpu().numpy(), preds, labels.numpy()):
                if u not in user_preds:
                    user_preds[u] = []
                user_preds[u].append((i, p, l))

    # === Basic metrics ===
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    auc = roc_auc_score(all_labels, all_preds)
    rmse = np.sqrt(np.mean((all_labels - all_preds) ** 2))
    mae = np.mean(np.abs(all_labels - all_preds))

    print(f"\n--- Basic Metrics ---")
    print(f"AUC: {auc:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")

    # === Ranking metrics @K ===
    precisions, recalls, hits, maps = [], [], [], []

    for user, predictions in user_preds.items():
        predictions.sort(key=lambda x: x[1], reverse=True)
        total_relevant = sum([l for _, _, l in predictions])
        if total_relevant == 0:
            continue

        top_k = predictions[:K]
        top_k_labels = [l for _, _, l in top_k]

        # Precision, Recall, Hit
        precision = sum(top_k_labels) / K
        recall = sum(top_k_labels) / total_relevant
        hit = 1.0 if sum(top_k_labels) > 0 else 0.0

        # MAP@K
        ap_sum, correct = 0.0, 0
        for idx, (_, _, label) in enumerate(top_k, start=1):
            if label == 1:
                correct += 1
                ap_sum += correct / idx
        map_k = ap_sum / min(total_relevant, K)

        precisions.append(precision)
        recalls.append(recall)
        hits.append(hit)
        maps.append(map_k)

    # In k·∫øt qu·∫£ trung b√¨nh
    print(f"\n--- Ranking Metrics @ {K} ---")
    print(f"Precision@{K}: {np.mean(precisions):.4f}")
    print(f"Recall@{K}:    {np.mean(recalls):.4f}")
    print(f"Hit@{K}:       {np.mean(hits):.4f}")
    print(f"MAP@{K}:       {np.mean(maps):.4f}")


# =====================================================
# 5Ô∏è‚É£ CH·∫†Y ƒê√ÅNH GI√Å SAU HU·∫§N LUY·ªÜN
# =====================================================
print("\n=== Evaluating  ===")
evaluate_ranking_metrics(model_lncm, test_loader, device)

print("\n‚úÖ LNCM Training & Evaluation Complete!")


Users: 42, Items: 139, Contexts: 27
Using device: cpu

=== Training LNCM_Fixed ===


[LNCM] Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 390.36it/s]


Epoch 1 | Train Loss: 0.6332


[LNCM] Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 357.57it/s]


Epoch 2 | Train Loss: 0.5569


[LNCM] Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 416.91it/s]


Epoch 3 | Train Loss: 0.5260


[LNCM] Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 367.78it/s]


Epoch 4 | Train Loss: 0.4934


[LNCM] Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 402.99it/s]


Epoch 5 | Train Loss: 0.4596


[LNCM] Epoch 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 382.61it/s]


Epoch 6 | Train Loss: 0.4543


[LNCM] Epoch 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 382.17it/s]


Epoch 7 | Train Loss: 0.4434


[LNCM] Epoch 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 431.81it/s]


Epoch 8 | Train Loss: 0.4252


[LNCM] Epoch 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 341.40it/s]


Epoch 9 | Train Loss: 0.4247


[LNCM] Epoch 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 350.93it/s]

Epoch 10 | Train Loss: 0.4158

=== Evaluating  ===

--- Basic Metrics ---
AUC: 0.7841
RMSE: 0.3911
MAE: 0.2851

--- Ranking Metrics @ 10 ---
Precision@10: 0.3414
Recall@10:    0.7727
Hit@10:       1.0000
MAP@10:       0.5073

‚úÖ LNCM Training & Evaluation Complete!





In [14]:
# =====================================================
# üìö Explicit Neural Context Model (ENCM)
# =====================================================

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import numpy as np

# -----------------------------------------------------
# T·∫¢I V√Ä X·ª¨ L√ù D·ªÆ LI·ªÜU
# -----------------------------------------------------
# D·ªØ li·ªáu g·ªìm 4 c·ªôt ch√≠nh: user, item, label (0/1), context_id
# context_id th·ªÉ hi·ªán c√°c ƒëi·ªÅu ki·ªán nh∆∞ th·ªùi ti·∫øt, t√¢m tr·∫°ng, ƒë∆∞·ªùng, v.v.
df = pd.read_csv('output_carskit_clean/ratings_with_context_id.csv')

# √Ånh x·∫° id r·ªùi r·∫°c th√†nh ch·ªâ s·ªë li√™n t·ª•c ƒë·ªÉ d√πng cho embedding
user2id = {u: i for i, u in enumerate(df['user'].unique())}
item2id = {i: j for j, i in enumerate(df['item'].unique())}
context2id = {c: k for k, c in enumerate(df['context_id'].unique())}

# G√°n l·∫°i id li√™n t·ª•c
df['user'] = df['user'].map(user2id)
df['item'] = df['item'].map(item2id)
df['context'] = df['context_id'].map(context2id)

# Th·ªëng k√™ s·ªë l∆∞·ª£ng th·ª±c th·ªÉ
n_users = len(user2id)
n_items = len(item2id)
n_contexts = len(context2id)
print(f"Users: {n_users}, Items: {n_items}, Contexts: {n_contexts}")

# Chia train/test (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# -----------------------------------------------------
# T·∫†O DATASET CHO PYTORCH
# -----------------------------------------------------
class RatingDataset(torch.utils.data.Dataset):
    """Dataset chu·∫©n cho b√†i to√°n recommendation."""
    def __init__(self, df):
        self.users = torch.tensor(df['user'].values, dtype=torch.long)
        self.items = torch.tensor(df['item'].values, dtype=torch.long)
        self.contexts = torch.tensor(df['context'].values, dtype=torch.long)
        self.labels = torch.tensor(df['label'].values, dtype=torch.float32)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.contexts[idx], self.labels[idx]

# DataLoader ƒë·ªÉ chia batch
train_data = RatingDataset(train_df)
test_data = RatingDataset(test_df)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=256, shuffle=False)

# -----------------------------------------------------
# ƒê·ªäNH NGHƒ®A M√î H√åNH ENCM
# -----------------------------------------------------
class ENCM(nn.Module):
    """
    ENCM - Explicit Neural Context Model
    ------------------------------------
    √ù t∆∞·ªüng:
      - M·ªói user, item, context ƒë∆∞·ª£c bi·ªÉu di·ªÖn b·∫±ng embedding.
      - Context kh√¥ng ch·ªâ ƒë∆∞·ª£c n·ªëi v√†o m√† c√≤n *ƒëi·ªÅu ch·ªânh (modulate)*
        embedding c·ªßa user v√† item th√¥ng qua *gating mechanism*.
      - M·ª•c ti√™u: h·ªçc ·∫£nh h∆∞·ªüng c·ªßa context ƒë·∫øn h√†nh vi user r√µ r√†ng h∆°n.

    Ki·∫øn tr√∫c:
      user/item embedding ‚Üí context gate ‚Üí modulation ‚Üí concat ‚Üí MLP ‚Üí output
    """
    def __init__(self, n_users, n_items, n_contexts, emb_dim=32):
        super().__init__()
        
        # --- 3 lo·∫°i embedding c∆° b·∫£n ---
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)
        self.context_emb = nn.Embedding(n_contexts, emb_dim)
        
        # --- Gating: context ·∫£nh h∆∞·ªüng l√™n user/item ---
        # Gate l√† 1 h√†m sigmoid ƒë·ªÉ t·∫°o tr·ªçng s·ªë ƒëi·ªÅu ch·ªânh ƒë·ªông cho t·ª´ng chi·ªÅu embedding
        self.user_gate = nn.Sequential(
            nn.Linear(emb_dim * 2, emb_dim),  # [user, context] -> gate vector
            nn.Sigmoid()
        )
        self.item_gate = nn.Sequential(
            nn.Linear(emb_dim * 2, emb_dim),  # [item, context] -> gate vector
            nn.Sigmoid()
        )
        
        # --- M·∫°ng d·ª± ƒëo√°n (prediction network) ---
        # Nh·∫≠n ƒë·∫ßu v√†o l√† [user_modulated, item_modulated, context]
        self.fc = nn.Sequential(
            nn.Linear(emb_dim * 3, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # ƒë·∫ßu ra x√°c su·∫•t
        )
        
        # --- Kh·ªüi t·∫°o tr·ªçng s·ªë embedding ---
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)
        nn.init.xavier_uniform_(self.context_emb.weight)
    
    def forward(self, user, item, context):
        # 1Ô∏è‚É£ L·∫•y embedding c∆° b·∫£n
        u = self.user_emb(user)
        i = self.item_emb(item)
        c = self.context_emb(context)
        
        # 2Ô∏è‚É£ Context ƒëi·ªÅu ch·ªânh user
        u_context = torch.cat([u, c], dim=1)
        u_gate = self.user_gate(u_context)      # gate ‚àà [0,1]^emb_dim
        u_modulated = u * u_gate                # nh√¢n t·ª´ng chi·ªÅu ƒë·ªÉ ƒëi·ªÅu ch·ªânh
        
        # 3Ô∏è‚É£ Context ƒëi·ªÅu ch·ªânh item
        i_context = torch.cat([i, c], dim=1)
        i_gate = self.item_gate(i_context)
        i_modulated = i * i_gate
        
        # 4Ô∏è‚É£ K·∫øt h·ª£p t·∫•t c·∫£ embeddings l·∫°i
        x = torch.cat([u_modulated, i_modulated, c], dim=1)
        
        # 5Ô∏è‚É£ D·ª± ƒëo√°n x√°c su·∫•t user th√≠ch item trong context
        return self.fc(x).squeeze()

# -----------------------------------------------------
# HU·∫§N LUY·ªÜN M√î H√åNH
# -----------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = ENCM(n_users, n_items, n_contexts, emb_dim=32).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
loss_fn = nn.BCELoss()  # V√¨ b√†i to√°n nh·ªã ph√¢n (0/1)

print("\n=== Training ENCM ===")
for epoch in range(10):
    model.train()
    total_loss = 0
    
    # Duy·ªát t·ª´ng batch trong d·ªØ li·ªáu hu·∫•n luy·ªán
    for users, items, contexts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        users, items, contexts, labels = users.to(device), items.to(device), contexts.to(device), labels.to(device)
        
        preds = model(users, items, contexts)          # d·ª± ƒëo√°n
        loss = loss_fn(preds, labels)                  # t√≠nh l·ªói
        optimizer.zero_grad()
        loss.backward()                                # lan truy·ªÅn ng∆∞·ª£c
        optimizer.step()                               # c·∫≠p nh·∫≠t tr·ªçng s·ªë
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_loader):.4f}")

# -----------------------------------------------------
#  ƒê√ÅNH GI√Å HI·ªÜU NƒÇNG M√î H√åNH
# -----------------------------------------------------
print("\n=== Evaluating ENCM ===")

# -----------------------------------------------------
# ƒê√ÅNH GI√Å HI·ªÜU NƒÇNG M√î H√åNH (CH·ªà T·∫†I K=10)
# -----------------------------------------------------
print("\n=== Evaluating ENCM ===")

def evaluate_ranking_metrics(model, test_loader, device, K=10):
    """
    ƒê√°nh gi√° m√¥ h√¨nh theo:
    - Basic metrics: AUC, RMSE, MAE
    - Ranking metrics: Precision@10, Recall@10, Hit@10, MAP@10
    """
    model.eval()
    user_preds = {}
    all_preds, all_labels = [], []

    with torch.no_grad():
        for users, items, contexts, labels in test_loader:
            users, items, contexts = users.to(device), items.to(device), contexts.to(device)
            preds = model(users, items, contexts).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.numpy())

            for u, i, p, l in zip(users.cpu().numpy(), items.cpu().numpy(), preds, labels.numpy()):
                if u not in user_preds:
                    user_preds[u] = []
                user_preds[u].append((i, p, l))

    # === Basic metrics ===
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    auc = roc_auc_score(all_labels, all_preds)
    rmse = np.sqrt(np.mean((all_labels - all_preds) ** 2))
    mae = np.mean(np.abs(all_labels - all_preds))

    print(f"\n--- Basic Metrics ---")
    print(f"AUC: {auc:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")

    # === Ranking metrics @K ===
    precisions, recalls, hits, maps = [], [], [], []

    for user, predictions in user_preds.items():
        predictions.sort(key=lambda x: x[1], reverse=True)
        total_relevant = sum([l for _, _, l in predictions])
        if total_relevant == 0:
            continue

        top_k = predictions[:K]
        top_k_labels = [l for _, _, l in top_k]

        # Precision, Recall, Hit
        precision = sum(top_k_labels) / K
        recall = sum(top_k_labels) / total_relevant
        hit = 1.0 if sum(top_k_labels) > 0 else 0.0

        # MAP@K
        ap_sum, correct = 0.0, 0
        for idx, (_, _, label) in enumerate(top_k, start=1):
            if label == 1:
                correct += 1
                ap_sum += correct / idx
        map_k = ap_sum / min(total_relevant, K)

        precisions.append(precision)
        recalls.append(recall)
        hits.append(hit)
        maps.append(map_k)

    # In k·∫øt qu·∫£ trung b√¨nh
    print(f"\n--- Ranking Metrics @ {K} ---")
    print(f"Precision@{K}: {np.mean(precisions):.4f}")
    print(f"Recall@{K}:    {np.mean(recalls):.4f}")
    print(f"Hit@{K}:       {np.mean(hits):.4f}")
    print(f"MAP@{K}:       {np.mean(maps):.4f}")

# G·ªçi h√†m ƒë√°nh gi√° ch√≠nh x√°c
evaluate_ranking_metrics(model, test_loader, device)

print("\n‚úÖ ENCM Training & Evaluation Complete!")



Users: 42, Items: 139, Contexts: 27
Using device: cpu

=== Training ENCM ===


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 307.46it/s]

Epoch 1 | Train Loss: 0.6332



Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 312.33it/s]


Epoch 2 | Train Loss: 0.5592


Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 346.29it/s]


Epoch 3 | Train Loss: 0.5106


Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 253.39it/s]


Epoch 4 | Train Loss: 0.4671


Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 306.38it/s]


Epoch 5 | Train Loss: 0.4573


Epoch 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 309.94it/s]


Epoch 6 | Train Loss: 0.4310


Epoch 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 305.10it/s]


Epoch 7 | Train Loss: 0.4215


Epoch 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 318.53it/s]


Epoch 8 | Train Loss: 0.4126


Epoch 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 328.97it/s]


Epoch 9 | Train Loss: 0.3998


Epoch 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:00<00:00, 307.72it/s]

Epoch 10 | Train Loss: 0.3941

=== Evaluating ENCM ===

=== Evaluating ENCM ===

--- Basic Metrics ---
AUC: 0.7968
RMSE: 0.3882
MAE: 0.2812

--- Ranking Metrics @ 10 ---
Precision@10: 0.3448
Recall@10:    0.7876
Hit@10:       1.0000
MAP@10:       0.5520

‚úÖ ENCM Training & Evaluation Complete!





In [15]:
# =====================================================
# üéØ Context Similarity Post-filtering
# =====================================================
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

# ƒê∆∞·ªùng d·∫´n file
base_dir = './output_carskit_clean'
ratings_path = os.path.join(base_dir, 'ratings_with_context_id.csv')
context_matrix_path = os.path.join(base_dir, 'item_context_matrix.csv')

# 1Ô∏è‚É£ ƒê·ªçc d·ªØ li·ªáu
ratings_df = pd.read_csv(ratings_path)
context_matrix = pd.read_csv(context_matrix_path)
print(f"üìÑ ratings: {ratings_df.shape}, context_matrix: {context_matrix.shape}")

# 2Ô∏è‚É£ X√°c ƒë·ªãnh context hi·ªán t·∫°i (v√≠ d·ª• context_id = 4)
CURRENT_CONTEXT = 4

# 3Ô∏è‚É£ Chu·∫©n b·ªã vector context hi·ªán t·∫°i
context_cols = [c for c in context_matrix.columns if c not in ['item', 'genre']]
current_context_vector = np.zeros(len(context_cols))
if str(CURRENT_CONTEXT) in context_cols:
    idx = context_cols.index(str(CURRENT_CONTEXT))
    current_context_vector[idx] = 1
else:
    raise ValueError(f"Context_id {CURRENT_CONTEXT} kh√¥ng c√≥ trong context_matrix")

# 4Ô∏è‚É£ T√≠nh cosine similarity gi·ªØa context hi·ªán t·∫°i v√† t·ª´ng item
item_vectors = context_matrix[context_cols].values
similarity_scores = cosine_similarity(item_vectors, current_context_vector.reshape(1, -1)).flatten()

# 5Ô∏è‚É£ G·ªôp similarity v√†o ratings_df
sim_df = pd.DataFrame({'item': context_matrix['item'], 'context_similarity': similarity_scores})
ratings_df = ratings_df.merge(sim_df, on='item', how='left')

# 6Ô∏è‚É£ T√°i x·∫øp h·∫°ng
# N·∫øu mu·ªën k·∫øt h·ª£p ƒëi·ªÉm m√¥ h√¨nh g·ªëc (rating) v·ªõi context_similarity:
ALPHA = 0.0  # 0.0 = ch·ªâ d√πng context similarity
ratings_df['final_score'] = ALPHA * ratings_df['label'] + (1 - ALPHA) * ratings_df['context_similarity']

# 7Ô∏è‚É£ H√†m ƒë√°nh gi√° Precision@10 v√† MAP@10
def precision_at_k(y_true, y_score, k=10):
    top_k = np.argsort(y_score)[-k:][::-1]
    return np.mean(np.array(y_true)[top_k])

def map_at_k(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.array(y_true)[order][:k]
    cum_precisions = [np.mean(y_true[:i+1]) for i in range(len(y_true)) if y_true[i]]
    return np.mean(cum_precisions) if cum_precisions else 0.0

precisions, maps = [], []
for uid, group in ratings_df.groupby('user'):
    if group['label'].sum() == 0:
        continue
    precisions.append(precision_at_k(group['label'], group['final_score'], k=10))
    maps.append(map_at_k(group['label'], group['final_score'], k=10))

print(f"üéØ Precision@10: {np.mean(precisions):.4f}")
print(f"üéØ MAP@10: {np.mean(maps):.4f}")

# 8Ô∏è‚É£ Xu·∫•t file k·∫øt qu·∫£
output_path = os.path.join(base_dir, 'context_similarity_postfiltered.csv')
ratings_df.to_csv(output_path, index=False, encoding='utf-8')
print(f"‚úÖ K·∫øt qu·∫£ post-filtering ƒë√£ l∆∞u: {output_path}")


üìÑ ratings: (4012, 4), context_matrix: (139, 29)
üéØ Precision@10: 0.2776
üéØ MAP@10: 0.3981
‚úÖ K·∫øt qu·∫£ post-filtering ƒë√£ l∆∞u: ./output_carskit_clean\context_similarity_postfiltered.csv
