In [1]:
# Import Library
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import time
from tqdm.notebook import tqdm
import warnings
import joblib
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle

np.random.seed(42)

torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

warnings.filterwarnings('ignore')
print(f"PyTorch version: {torch.__version__}")

# Cek ketersediaan GPU dan atur device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Menggunakan device: {device}")

PyTorch version: 2.2.2+cu121
Menggunakan device: cuda


In [2]:
# Load CSV
df = pd.read_csv("Final_Data.csv")
df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,rating,Usia,Gender,Lokasi Tinggal,category
0,1433221332117,257597,view,355908,,1,25-34,Tidak Ingin Menjawab,Bali,Fashion
1,1433224214164,992329,view,248676,,1,35-44,Tidak Ingin Menjawab,Papua Barat,Otomotif
2,1433221999827,111016,view,318965,,1,<18,Laki - laki,Sulawesi Tengah,Kesehatan & Kecantikan
3,1433221955914,483717,view,253185,,1,25-34,Perempuan,Kalimantan Tengah,Fashion
4,1433221337106,951259,view,367447,,1,<18,Perempuan,Nusa Tenggara Timur,Kesehatan & Kecantikan


In [3]:
# Label Encoding untuk user, item, dan fitur demografis
encoders = {
    'user': LabelEncoder(),
    'item': LabelEncoder(),
    'age': LabelEncoder(),
    'gender': LabelEncoder(),
    'location': LabelEncoder(),
    'category': LabelEncoder()
}

df['user_idx'] = encoders['user'].fit_transform(df['visitorid'])
df['item_idx'] = encoders['item'].fit_transform(df['itemid'])
df['age_idx'] = encoders['age'].fit_transform(df['Usia'])
df['gender_idx'] = encoders['gender'].fit_transform(df['Gender'])
df['location_idx'] = encoders['location'].fit_transform(df['Lokasi Tinggal'])
df['category_idx'] = encoders['category'].fit_transform(df['category'])

# Simpan jumlah kategori unik untuk setiap fitur
n_users = df['user_idx'].nunique()
n_items = df['item_idx'].nunique()
n_ages = df['age_idx'].nunique()
n_genders = df['gender_idx'].nunique()
n_locations = df['location_idx'].nunique()
n_categories = df['category_idx'].nunique()

print(f"Data siap: {len(df)} interaksi, {n_users} user, {n_items} item, {n_categories} kategori.")
print(f"Fitur demografis: {n_ages} rentang usia, {n_genders} gender, {n_locations} lokasi.")

Data siap: 2755641 interaksi, 1407580 user, 235061 item, 8 kategori.
Fitur demografis: 5 rentang usia, 3 gender, 34 lokasi.


In [4]:
# Save Encoding
df.to_csv('encoded_data_2tower.csv', index=False)

for name, encoder in encoders.items():
    joblib.dump(encoder, f"{name}_encoder_2tower.pkl")

In [None]:
# Load Encoding
df = pd.read_csv('encoded_data_2tower.csv')

encoders = {name: joblib.load(f"{name}_encoder_2tower.pkl") for name in ['user','item','age','gender','location','category']}

In [5]:
# Split Data
MIN_INTERACTIONS = 3
SAMPLE_SIZE = 200000
NUM_NEGATIVES = 4

user_counts = df['user_idx'].value_counts()
inactive_users = user_counts[user_counts < MIN_INTERACTIONS].index
active_users = user_counts[user_counts >= MIN_INTERACTIONS].index

user_cold_start_raw = df[df['user_idx'].isin(inactive_users)]
user_cold_start_test_df = user_cold_start_raw.sample(n=min(len(user_cold_start_raw), SAMPLE_SIZE), random_state=42)

warm_df = df[df['user_idx'].isin(active_users)]
train_df, test_warm_df = train_test_split(warm_df, test_size=0.2, random_state=42, stratify=warm_df['user_idx'])

train_items_set = set(train_df['item_idx'])
item_cold_start_test_df = test_warm_df[~test_warm_df['item_idx'].isin(train_items_set)]

print(f"Data Training (Warm): {len(train_df)}")
print(f"Data Test (Warm): {len(test_warm_df)}")
print(f"Data Test (User Cold): {len(user_cold_start_test_df)}")
print(f"Data Test (Item Cold): {len(item_cold_start_test_df)}")

Data Training (Warm): 1073630
Data Test (Warm): 268408
Data Test (User Cold): 200000
Data Test (Item Cold): 9962


In [6]:
# Buat set semua interaksi untuk pengecekan cepat
user_item_set = set(zip(df['user_idx'], df['item_idx']))

# Buat mapping item_idx ke category_idx untuk lookup cepat
item_to_category = df.drop_duplicates('item_idx').set_index('item_idx')['category_idx'].to_dict()

# Ambil data positif
pos_users = train_df['user_idx'].values
pos_items = train_df['item_idx'].values
pos_ages = train_df['age_idx'].values
pos_genders = train_df['gender_idx'].values
pos_locations = train_df['location_idx'].values
pos_categories = train_df['category_idx'].values
pos_ratings = train_df['rating'].values

# Siapkan list final
train_data = {k: [] for k in ['users', 'items', 'ages', 'genders', 'locations', 'categories', 'labels']}

# Tambahkan data positif
for i in range(len(train_df)):
    for key, L in [('users',pos_users), ('items',pos_items), ('ages',pos_ages), ('genders',pos_genders), 
                   ('locations',pos_locations), ('categories', pos_categories), ('labels', pos_ratings)]:
        train_data[key].append(L[i])

# Generate data negatif
neg_items_array = np.random.randint(0, n_items, size=(len(train_df), NUM_NEGATIVES))
for i in tqdm(range(len(train_df)), desc="Generating Negative Samples"):
    user = pos_users[i]
    for j in range(NUM_NEGATIVES):
        neg_item = neg_items_array[i, j]
        while (user, neg_item) in user_item_set:
            neg_item = np.random.randint(0, n_items)
        neg_items_array[i, j] = neg_item
        
        # Tambahkan data negatif
        train_data['users'].append(user)
        train_data['items'].append(neg_item)
        train_data['labels'].append(0)
        # Salin fitur demografi user & kategori item
        train_data['ages'].append(pos_ages[i])
        train_data['genders'].append(pos_genders[i])
        train_data['locations'].append(pos_locations[i])
        train_data['categories'].append(item_to_category.get(neg_item, 0))

training_df_final = pd.DataFrame(train_data)
print(f"Ukuran data training diperluas: {len(training_df_final['labels'])} sampel.")

Generating Negative Samples:   0%|          | 0/1073630 [00:00<?, ?it/s]

Ukuran data training diperluas: 5368150 sampel.


In [7]:
# Save negatif train
training_df_final.to_csv("train_negsamp_2tower.csv", index=False)

In [None]:
# load negatif train
training_df_final = pd.read_csv("train_negsamp_2tower.csv")

In [8]:
def generate_negative_samples(test_df, all_items, user_item_dict, item_to_category, num_negatives=99, seed=42):
    np.random.seed(seed)
    all_items = np.array(all_items)
    test_with_neg = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Generating Negatives", mininterval=1.0):
        user = int(row['user_idx'])
        pos_item = int(row['item_idx'])
        seen_items = user_item_dict.get(user, set())
        
        # ambil calon item negatif acak
        neg_items = []
        while len(neg_items) < num_negatives:
            candidates = np.random.choice(all_items, size=num_negatives*2, replace=False)
            valid = [i for i in candidates if i not in seen_items and i != pos_item]
            neg_items.extend(valid)
        neg_items = neg_items[:num_negatives]
        
        test_with_neg.append({
            'user_idx': user,
            'item_idx': pos_item,
            'age_idx': int(row['age_idx']),
            'gender_idx': int(row['gender_idx']),
            'location_idx': int(row['location_idx']),
            'category_idx': int(row['category_idx']),
            'rating': row['rating'],
            'neg_items': neg_items
        })
    
    return pd.DataFrame(test_with_neg)

In [9]:
# Generate negatif test
all_items = df['item_idx'].unique().tolist()
user_item_dict = train_df.groupby('user_idx')['item_idx'].apply(set).to_dict()

test_warm_with_neg = generate_negative_samples(test_warm_df, all_items, user_item_dict, item_to_category)
user_cold_start_test_with_neg = generate_negative_samples(user_cold_start_test_df, all_items, user_item_dict, item_to_category)
item_cold_start_test_with_neg = generate_negative_samples(item_cold_start_test_df, all_items, user_item_dict, item_to_category)


Generating Negatives:   0%|          | 0/268408 [00:00<?, ?it/s]

Generating Negatives:   0%|          | 0/200000 [00:00<?, ?it/s]

Generating Negatives:   0%|          | 0/9962 [00:00<?, ?it/s]

In [10]:
# Save negatif test
with open("test_warm_neg_2tower.pkl", "wb") as f:
    pickle.dump(test_warm_with_neg, f)

with open("user_cold_start_test_neg_2tower.pkl", "wb") as f:
    pickle.dump(user_cold_start_test_with_neg, f)

with open("item_cold_start_test_neg_2tower.pkl", "wb") as f:
    pickle.dump(item_cold_start_test_with_neg, f)

In [None]:
# Load negatif test
with open("test_warm_neg_2tower.pkl", "rb") as f:
    test_warm_with_neg = pickle.load(f)

with open("user_cold_start_test_neg_2tower.pkl", "rb") as f:
    user_cold_start_test_with_neg = pickle.load(f)

with open("item_cold_start_test_neg_2tower.pkl", "rb") as f:
    item_cold_start_test_with_neg = pickle.load(f)

In [11]:
# Dataset Class
class FinalHybridDataset(Dataset):
    def __init__(self, data_dict):
        self.users = torch.tensor(data_dict['users'], dtype=torch.long)
        self.items = torch.tensor(data_dict['items'], dtype=torch.long)
        self.ages = torch.tensor(data_dict['ages'], dtype=torch.long)
        self.genders = torch.tensor(data_dict['genders'], dtype=torch.long)
        self.locations = torch.tensor(data_dict['locations'], dtype=torch.long)
        self.categories = torch.tensor(data_dict['categories'], dtype=torch.long)
        
        self.labels = torch.tensor(data_dict['labels'], dtype=torch.float)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        # Kembalikan tensor dari atribut class
        return (self.users[idx], self.items[idx], self.ages[idx], 
                self.genders[idx], self.locations[idx], 
                self.categories[idx], self.labels[idx])

In [12]:
# Model NCF Final Two-Tower
class FinalTwoTowerModel(nn.Module):
    def __init__(self, num_users, num_items, num_ages, num_genders, num_locations, num_categories, embedding_dim=32):
        super(FinalTwoTowerModel, self).__init__()
        self.embedding_dim = embedding_dim
        
        # --- User Tower ---
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.age_embed = nn.Embedding(num_ages, 8)
        self.gender_embed = nn.Embedding(num_genders, 4)
        self.location_embed = nn.Embedding(num_locations, 8)
        user_mlp_input_dim = embedding_dim + 8 + 4 + 8
        self.user_tower = nn.Sequential(nn.Linear(user_mlp_input_dim, 64), nn.ReLU(), nn.Linear(64, embedding_dim))
        
        # --- Item Tower ---
        self.item_embed = nn.Embedding(num_items, embedding_dim)
        self.category_embed = nn.Embedding(num_categories, 16)
        item_mlp_input_dim = embedding_dim + 16
        self.item_tower = nn.Sequential(nn.Linear(item_mlp_input_dim, 64), nn.ReLU(), nn.Linear(64, embedding_dim))

    def forward(self, user, item, age, gender, location, category):
        # Proses User Tower
        user_features = torch.cat([self.user_embed(user), self.age_embed(age), 
                                   self.gender_embed(gender), self.location_embed(location)], dim=-1)
        user_vector = self.user_tower(user_features)
        
        # Proses Item Tower
        item_features = torch.cat([self.item_embed(item), self.category_embed(category)], dim=-1)
        item_vector = self.item_tower(item_features)
        
        # Prediksi dengan dot product
        prediction = torch.sum(user_vector * item_vector, dim=1)
        return prediction


In [13]:
# Fungsi helper
def hit_ratio_at_k(predictions, true_item_idx, k):
    _, top_k_indices = torch.topk(predictions, k)
    return 1 if true_item_idx in top_k_indices else 0

def ndcg_at_k(predictions, true_item_idx, k):
    _, top_k_indices = torch.topk(predictions, k)
    indices = (top_k_indices == true_item_idx).nonzero(as_tuple=True)[0]
    return 0.0 if indices.numel() == 0 else (1.0 / np.log2(indices.item() + 2))

In [14]:
# Fungsi evaluasi
def evaluate_model(model, test_data, description):
    if test_data.empty:
        print(f"\n--- {description} ---\nTidak ada data untuk dievaluasi.")
        return
    
    model.eval()
    all_rmse, all_hr_at_10, all_ndcg_at_10 = [], [], []
    print(f"\n--- {description} ---")

    with torch.no_grad():
        # === RMSE (hanya data positif) ===
        test_loader = DataLoader(
            TensorDataset(
                torch.tensor(test_data['user_idx'].values),
                torch.tensor(test_data['item_idx'].values),
                torch.tensor(test_data['age_idx'].values),
                torch.tensor(test_data['gender_idx'].values),
                torch.tensor(test_data['location_idx'].values),
                torch.tensor(test_data['category_idx'].values),
                torch.tensor(test_data['rating'].values)
            ), batch_size=2048, shuffle=False
        )

        for users, items, ages, genders, locs, cats, ratings in test_loader:
            users, items, ages, genders, locs, cats, ratings = [
                t.to(device) for t in [users, items, ages, genders, locs, cats, ratings]
            ]
            predictions = model(users, items, ages, genders, locs, cats)
            all_rmse.extend((predictions - ratings).pow(2).cpu().numpy().tolist())

        # === HR/NDCG (dengan negative sampling) ===
        for row in test_data.itertuples():
            user_idx = row.user_idx
            pos_item_idx = row.item_idx
            age_idx = row.age_idx
            gender_idx = row.gender_idx
            loc_idx = row.location_idx
            cat_idx = row.category_idx
            neg_items = row.neg_items

            test_items = [pos_item_idx] + neg_items
            item_cats = [item_to_category.get(i, 0) for i in test_items]
            num_items = len(test_items)

            tensors_pred = [
                torch.tensor([val] * num_items, device=device)
                for val in [user_idx, age_idx, gender_idx, loc_idx]
            ]
            tensors_pred.insert(1, torch.tensor(test_items, device=device))
            tensors_pred.append(torch.tensor(item_cats, device=device))

            predictions = model(*tensors_pred)
            all_hr_at_10.append(hit_ratio_at_k(predictions, 0, 10))
            all_ndcg_at_10.append(ndcg_at_k(predictions, 0, 10))

    print(f"RMSE: {np.sqrt(np.mean(all_rmse)):.4f}")
    print(f"Hit Ratio @10: {np.mean(all_hr_at_10):.4f}")
    print(f"NDCG @10: {np.mean(all_ndcg_at_10):.4f}")

In [15]:
# Dataset dan dataloader
final_train_dataset = FinalHybridDataset(training_df_final)
train_loader = DataLoader(final_train_dataset, batch_size=4096, shuffle=True, num_workers=0)

# Inisialisasi model dan optimizer
model = FinalTwoTowerModel(n_users, n_items, n_ages, n_genders, n_locations, n_categories).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Loop training
start_time = time.time()
for epoch in range(10): # 10 epochs
    model.train()
    total_loss = 0
    for batch_data in train_loader:
        tensors = [t.to(device) for t in batch_data]
        users, items, ages, genders, locations, categories, labels = tensors
        
        optimizer.zero_grad()
        predictions = model(users, items, ages, genders, locations, categories)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/10 Selesai | Rata-rata Loss: {total_loss / len(train_loader):.4f}")

print(f"\nTraining selesai dalam {time.time() - start_time:.2f} detik.")


Epoch 1/10 Selesai | Rata-rata Loss: 0.1903
Epoch 2/10 Selesai | Rata-rata Loss: 0.1373
Epoch 3/10 Selesai | Rata-rata Loss: 0.1234
Epoch 4/10 Selesai | Rata-rata Loss: 0.1189
Epoch 5/10 Selesai | Rata-rata Loss: 0.1150
Epoch 6/10 Selesai | Rata-rata Loss: 0.1105
Epoch 7/10 Selesai | Rata-rata Loss: 0.1050
Epoch 8/10 Selesai | Rata-rata Loss: 0.0991
Epoch 9/10 Selesai | Rata-rata Loss: 0.0932
Epoch 10/10 Selesai | Rata-rata Loss: 0.0874

Training selesai dalam 3527.29 detik.


In [16]:
# Evaluasi Akhir
evaluate_model(model, test_warm_with_neg, "Evaluasi Warm Start")
evaluate_model(model, user_cold_start_test_with_neg, "Evaluasi User Cold Start")
evaluate_model(model, item_cold_start_test_with_neg, "Evaluasi Item Cold Start")


--- Evaluasi Warm Start ---
RMSE: 0.5988
Hit Ratio @10: 0.7461
NDCG @10: 0.4777

--- Evaluasi User Cold Start ---
RMSE: 0.7481
Hit Ratio @10: 0.4050
NDCG @10: 0.2252

--- Evaluasi Item Cold Start ---
RMSE: 0.9869
Hit Ratio @10: 0.0096
NDCG @10: 0.0034


In [18]:
# Simpan model
torch.save(model.state_dict(), "NCF_TwoTower.pth")