In [1]:
# Import Library
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import time
from tqdm.notebook import tqdm
import warnings
import joblib
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle

np.random.seed(42)

torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

warnings.filterwarnings('ignore')
print(f"PyTorch version: {torch.__version__}")

PyTorch version: 2.2.2+cu121


In [2]:
# Cek ketersediaan GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Menggunakan device: {device}")

Menggunakan device: cuda


In [3]:
# Load CSV
df = pd.read_csv("Final_Data.csv")
df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,rating,Usia,Gender,Lokasi Tinggal,category
0,1433221332117,257597,view,355908,,1,25-34,Tidak Ingin Menjawab,Bali,Fashion
1,1433224214164,992329,view,248676,,1,35-44,Tidak Ingin Menjawab,Papua Barat,Otomotif
2,1433221999827,111016,view,318965,,1,<18,Laki - laki,Sulawesi Tengah,Kesehatan & Kecantikan
3,1433221955914,483717,view,253185,,1,25-34,Perempuan,Kalimantan Tengah,Fashion
4,1433221337106,951259,view,367447,,1,<18,Perempuan,Nusa Tenggara Timur,Kesehatan & Kecantikan


In [4]:
# Label Encoding
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df['user_idx'] = user_encoder.fit_transform(df['visitorid'])
df['item_idx'] = item_encoder.fit_transform(df['itemid'])

n_users = df['user_idx'].nunique()
n_items = df['item_idx'].nunique()
print(f"Data simulasi: {len(df)} interaksi, {n_users} user, {n_items} item.")

Data simulasi: 2755641 interaksi, 1407580 user, 235061 item.


In [5]:
# Save Encoding
df.to_csv('encoded_data_ori.csv', index=False)

joblib.dump(user_encoder, 'user_encoder_ori.pkl')
joblib.dump(item_encoder, 'item_encoder_ori.pkl')

['item_encoder_ori.pkl']

In [None]:
# Load Encoding
df = pd.read_csv('encoded_data_ori.csv')

user_encoder = load('user_encoder_ori.pkl')
item_encoder = load('item_encoder_ori.pkl')

In [6]:
# Parameter yang bisa disesuaikan
MIN_INTERACTIONS = 3
SAMPLE_SIZE = 200000
NUM_NEGATIVES = 4 

# Pisahkan pengguna aktif dan tidak aktif
user_counts = df['user_idx'].value_counts()
inactive_users = user_counts[user_counts < MIN_INTERACTIONS].index
active_users = user_counts[user_counts >= MIN_INTERACTIONS].index

# Buat set tes untuk Cold Start User
user_cold_start_raw = df[df['user_idx'].isin(inactive_users)]
user_cold_start_test_df = user_cold_start_raw.sample(n=min(len(user_cold_start_raw), SAMPLE_SIZE), random_state=42)

# Buat DataFrame 'Warm' dari pengguna aktif
warm_df = df[df['user_idx'].isin(active_users)]
train_df, test_warm_df = train_test_split(warm_df, test_size=0.2, random_state=42, stratify=warm_df['user_idx'])

# Buat set tes untuk Cold Start Item
train_items_set = set(train_df['item_idx'])
item_cold_start_test_df = test_warm_df[~test_warm_df['item_idx'].isin(train_items_set)]

print(f"Data Training (Warm): {len(train_df)}")
print(f"Data Test (Warm): {len(test_warm_df)}")
print(f"Data Test (User Cold): {len(user_cold_start_test_df)}")
print(f"Data Test (Item Cold): {len(item_cold_start_test_df)}")

Data Training (Warm): 1073630
Data Test (Warm): 268408
Data Test (User Cold): 200000
Data Test (Item Cold): 9962


In [7]:
# Buat set semua interaksi untuk pengecekan cepat
user_item_set = set(zip(df['user_idx'], df['item_idx']))

# Ambil data positif dari train_df
users = train_df['user_idx'].values
items = train_df['item_idx'].values
ratings = train_df['rating'].values

# Siapkan list untuk data training final
train_users, train_items, train_labels = [], [], []

# Tambahkan semua data positif
train_users.extend(users)
train_items.extend(items)
train_labels.extend(ratings) # Label untuk data positif adalah rating aslinya

In [8]:
# Generate data negatif
neg_items = np.random.randint(0, n_items, size=(len(train_df), NUM_NEGATIVES))
for i in range(len(train_df)):
    user = users[i]
    for j in range(NUM_NEGATIVES):
        # Jika item negatif yang di-generate ternyata pernah diinteraksikan, generate ulang
        while (user, neg_items[i, j]) in user_item_set:
            neg_items[i, j] = np.random.randint(0, n_items)
    
    # Tambahkan data negatif
    train_users.extend([user] * NUM_NEGATIVES)
    train_items.extend(neg_items[i])
    train_labels.extend([0] * NUM_NEGATIVES) # Label untuk data negatif adalah 0

print(f"Ukuran data training diperluas (dengan negatif): {len(train_labels)} sampel.")

Ukuran data training diperluas (dengan negatif): 5368150 sampel.


In [9]:
# Save negatif train
train_neg_df = pd.DataFrame({
    'user_idx': train_users,
    'item_idx': train_items,
    'label': train_labels
})

train_neg_df.to_csv('train_negsamp_ori.csv', index=False)

In [None]:
# Load negatif train
train_neg_df = pd.read_csv('train_negsamp_ori.csv')

In [10]:
def generate_negative_samples(test_df, all_items, user_item_dict, num_negatives=99, seed=42):
    
    np.random.seed(seed)
    all_items = np.array(all_items)
    test_with_neg = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Generating Negatives", mininterval=1.0):
        user = int(row['user_idx'])
        pos_item = int(row['item_idx'])
        
        # Set item yang sudah pernah dilihat user
        seen_items = user_item_dict.get(user, set())

        # Ambil sample acak lebih banyak untuk mengurangi looping
        neg_items = []
        while len(neg_items) < num_negatives:
            candidates = np.random.choice(all_items, size=num_negatives*2, replace=False)
            valid = [i for i in candidates if i not in seen_items and i != pos_item]
            neg_items.extend(valid)
        
        neg_items = neg_items[:num_negatives]  # ambil hanya jumlah yang dibutuhkan
        
        test_with_neg.append({
            'user_idx': user,
            'item_idx': pos_item,
            'rating': row['rating'],
            'neg_items': neg_items
        })

    return pd.DataFrame(test_with_neg)

In [11]:
# Generate data negatif
all_items = df['item_idx'].unique().tolist()
user_item_dict = train_df.groupby('user_idx')['item_idx'].apply(set).to_dict()

test_warm_with_neg = generate_negative_samples(test_warm_df, all_items, user_item_dict, num_negatives=99)
user_cold_start_test_with_neg = generate_negative_samples(user_cold_start_test_df, 
                                                               all_items, user_item_dict, num_negatives=99)
item_cold_start_test_with_neg = generate_negative_samples(item_cold_start_test_df, 
                                                               all_items, user_item_dict, num_negatives=99)

Generating Negatives:   0%|          | 0/268408 [00:00<?, ?it/s]

Generating Negatives:   0%|          | 0/200000 [00:00<?, ?it/s]

Generating Negatives:   0%|          | 0/9962 [00:00<?, ?it/s]

In [12]:
# Save negatif test
with open("test_warm_neg_ori.pkl", "wb") as f:
    pickle.dump(test_warm_with_neg, f)

with open("user_cold_start_test_neg_ori.pkl", "wb") as f:
    pickle.dump(user_cold_start_test_with_neg, f)

with open("item_cold_start_test_neg_ori.pkl", "wb") as f:
    pickle.dump(item_cold_start_test_with_neg, f)

In [None]:
# Load negatif test
with open("test_warm_neg_ori.pkl", "rb") as f:
    test_warm_with_neg = pickle.load(f)

with open("user_cold_start_test_neg_ori.pkl", "rb") as f:
    user_cold_start_test_with_neg = pickle.load(f)

with open("item_cold_start_test_neg_ori.pkl", "rb") as f:
    item_cold_start_test_with_neg = pickle.load(f)

In [13]:
# Dataset Class
class FastTensorDataset(Dataset):
    def __init__(self, users, items, labels):
        self.users = torch.tensor(users, dtype=torch.long)
        self.items = torch.tensor(items, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

In [14]:
# Model NCF
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super(NCF, self).__init__()
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.item_embed = nn.Embedding(num_items, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * 2, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)

    def forward(self, user, item):
        u = self.user_embed(user)
        i = self.item_embed(item)
        x = torch.cat([u, i], dim=-1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.out(x).squeeze()

In [15]:
# Fungsi helper evaluasi
def hit_ratio_at_k(predictions, true_item_idx, k):
    _, top_k_indices = torch.topk(predictions, k)
    return 1 if true_item_idx in top_k_indices else 0

def ndcg_at_k(predictions, true_item_idx, k):
    
    # Ambil indeks dari k item dengan prediksi skor tertinggi
    _, top_k_indices = torch.topk(predictions, k)
    
    # Cari di mana posisi item yang benar (true_item_idx) di dalam top_k
    indices = (top_k_indices == true_item_idx).nonzero(as_tuple=True)[0]
    
    # Cek apakah tensor 'indices' memiliki elemen atau tidak
    if indices.numel() == 0:
        # Jika tidak ada elemen, berarti item tidak ditemukan di top_k.
        return 0.0
    else:
        # Jika item ditemukan, ambil rank-nya (posisinya)
        rank = indices.item()
        # Hitung skor NDCG berdasarkan rank
        return 1.0 / np.log2(rank + 2)

In [19]:
# Fungsi evaluasi
def evaluate_model(model, test_df, description, k=10):
    print(f"\n--- {description} ---")
    model.eval()

    # ===== RMSE (positif saja) =====
    test_pos_df = test_df[test_df['rating'] > 0]
    y_true = test_pos_df['rating'].values
    user_tensor = torch.tensor(test_pos_df['user_idx'].values, dtype=torch.long).to(device)
    item_tensor = torch.tensor(test_pos_df['item_idx'].values, dtype=torch.long).to(device)

    with torch.no_grad():
        y_pred = model(user_tensor, item_tensor).cpu().numpy()

    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE: {rmse:.4f}")

    # ===== Hit Ratio & NDCG =====
    hr_list, ndcg_list = [], []
    for _, row in test_df.iterrows():
        user = int(row['user_idx'])
        pos_item = int(row['item_idx'])
        neg_items = row['neg_items']

        test_items = [pos_item] + neg_items
        user_tensor = torch.tensor([user] * len(test_items), dtype=torch.long).to(device)
        item_tensor = torch.tensor(test_items, dtype=torch.long).to(device)

        with torch.no_grad():
            predictions = model(user_tensor, item_tensor)

        hr = hit_ratio_at_k(predictions, 0, k)
        ndcg = ndcg_at_k(predictions, 0, k)

        hr_list.append(hr)
        ndcg_list.append(ndcg)

    print(f"Hit Ratio@{k}: {np.mean(hr_list):.4f}")
    print(f"NDCG@{k}: {np.mean(ndcg_list):.4f}")

    return {'RMSE': rmse, 'HitRatio': np.mean(hr_list), 'NDCG': np.mean(ndcg_list)}

In [17]:
# Dataset dan dataloader
final_train_dataset = FastTensorDataset(train_users, train_items, train_labels)
train_loader = DataLoader(final_train_dataset, batch_size=4096, shuffle=True, num_workers=0)

# Inisialisasi model dan optimizer
model = NCF(n_users, n_items).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Loop training
start_time = time.time()
for epoch in range(10): # 10 epochs
    model.train()
    total_loss = 0
    for users_b, items_b, labels_b in train_loader:
        users_b, items_b, labels_b = users_b.to(device), items_b.to(device), labels_b.to(device)
        optimizer.zero_grad()
        predictions = model(users_b, items_b)
        loss = criterion(predictions, labels_b)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/10 Selesai | Rata-rata Loss: {avg_loss:.4f}")

print(f"\nTraining selesai dalam {time.time() - start_time:.2f} detik.")

Epoch 1/10 Selesai | Rata-rata Loss: 0.1848
Epoch 2/10 Selesai | Rata-rata Loss: 0.1354
Epoch 3/10 Selesai | Rata-rata Loss: 0.1227
Epoch 4/10 Selesai | Rata-rata Loss: 0.1168
Epoch 5/10 Selesai | Rata-rata Loss: 0.1113
Epoch 6/10 Selesai | Rata-rata Loss: 0.1053
Epoch 7/10 Selesai | Rata-rata Loss: 0.0988
Epoch 8/10 Selesai | Rata-rata Loss: 0.0920
Epoch 9/10 Selesai | Rata-rata Loss: 0.0853
Epoch 10/10 Selesai | Rata-rata Loss: 0.0790

Training selesai dalam 1508.20 detik.


In [20]:
# Evaluasi Akhir
result_warm = evaluate_model(model, test_warm_with_neg, "Evaluasi Warm Start", k=10)
result_ucold = evaluate_model(model, user_cold_start_test_with_neg, "Evaluasi User Cold Start", k=10)
result_icold = evaluate_model(model, item_cold_start_test_with_neg, "Evaluasi Item Cold Start", k=10)



--- Evaluasi Warm Start ---
RMSE: 0.6060
Hit Ratio@10: 0.7473
NDCG@10: 0.4665

--- Evaluasi User Cold Start ---
RMSE: 0.7524
Hit Ratio@10: 0.4038
NDCG@10: 0.2189

--- Evaluasi Item Cold Start ---
RMSE: 1.0000
Hit Ratio@10: 0.0053
NDCG@10: 0.0017


In [21]:
# Simpan model
torch.save(model.state_dict(), "NCF_Original.pth")