In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import time
from tqdm.notebook import tqdm
import warnings
import joblib
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle

np.random.seed(42)

torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

warnings.filterwarnings('ignore')
print(f"PyTorch version: {torch.__version__}")

# Cek ketersediaan GPU dan atur device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Menggunakan device: {device}")

PyTorch version: 2.2.2+cu121
Menggunakan device: cuda


In [2]:
# Load CSV
df = pd.read_csv("Final_Data.csv")
df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,rating,Usia,Gender,Lokasi Tinggal,category
0,1433221332117,257597,view,355908,,1,25-34,Tidak Ingin Menjawab,Bali,Fashion
1,1433224214164,992329,view,248676,,1,35-44,Tidak Ingin Menjawab,Papua Barat,Otomotif
2,1433221999827,111016,view,318965,,1,<18,Laki - laki,Sulawesi Tengah,Kesehatan & Kecantikan
3,1433221955914,483717,view,253185,,1,25-34,Perempuan,Kalimantan Tengah,Fashion
4,1433221337106,951259,view,367447,,1,<18,Perempuan,Nusa Tenggara Timur,Kesehatan & Kecantikan


In [3]:
# Label Encoding untuk user, item, dan fitur demografis
encoders = {
    'user': LabelEncoder(),
    'item': LabelEncoder(),
    'age': LabelEncoder(),
    'gender': LabelEncoder(),
    'location': LabelEncoder()
}

df['user_idx'] = encoders['user'].fit_transform(df['visitorid'])
df['item_idx'] = encoders['item'].fit_transform(df['itemid'])
df['age_idx'] = encoders['age'].fit_transform(df['Usia'])
df['gender_idx'] = encoders['gender'].fit_transform(df['Gender'])
df['location_idx'] = encoders['location'].fit_transform(df['Lokasi Tinggal'])

# Simpan jumlah kategori unik untuk setiap fitur
n_users = df['user_idx'].nunique()
n_items = df['item_idx'].nunique()
n_ages = df['age_idx'].nunique()
n_genders = df['gender_idx'].nunique()
n_locations = df['location_idx'].nunique()

print(f"Data siap: {len(df)} interaksi, {n_users} user, {n_items} item.")
print(f"Fitur demografis: {n_ages} rentang usia, {n_genders} gender, {n_locations} lokasi.")

Data siap: 2755641 interaksi, 1407580 user, 235061 item.
Fitur demografis: 5 rentang usia, 3 gender, 34 lokasi.


In [4]:
# Save Encoding
df.to_csv('encoded_data_utower.csv', index=False)

for name, encoder in encoders.items():
    joblib.dump(encoder, f"{name}_encoder_utower.pkl")

In [None]:
# Load Encoding
df = pd.read_csv('encoded_data_utower.csv')

encoders = {name: joblib.load(f"{name}_encoder_utower.pkl") for name in ['user','item','age','gender','location']}

In [5]:
# Split Data
MIN_INTERACTIONS = 3
SAMPLE_SIZE = 200000
NUM_NEGATIVES = 4

user_counts = df['user_idx'].value_counts()
inactive_users = user_counts[user_counts < MIN_INTERACTIONS].index
active_users = user_counts[user_counts >= MIN_INTERACTIONS].index

user_cold_start_raw = df[df['user_idx'].isin(inactive_users)]
user_cold_start_test_df = user_cold_start_raw.sample(n=min(len(user_cold_start_raw), SAMPLE_SIZE), random_state=42)

warm_df = df[df['user_idx'].isin(active_users)]
train_df, test_warm_df = train_test_split(warm_df, test_size=0.2, random_state=42, stratify=warm_df['user_idx'])

train_items_set = set(train_df['item_idx'])
item_cold_start_test_df = test_warm_df[~test_warm_df['item_idx'].isin(train_items_set)]

print(f"Data Training (Warm): {len(train_df)}")
print(f"Data Test (Warm): {len(test_warm_df)}")
print(f"Data Test (User Cold): {len(user_cold_start_test_df)}")
print(f"Data Test (Item Cold): {len(item_cold_start_test_df)}")

Data Training (Warm): 1073630
Data Test (Warm): 268408
Data Test (User Cold): 200000
Data Test (Item Cold): 9962


In [6]:
# Generate negatif train
user_item_set = set(zip(df['user_idx'], df['item_idx']))
pos_df = train_df[['user_idx','item_idx','age_idx','gender_idx','location_idx','rating']]
train_data_list = pos_df.to_dict('records')

neg_items_array = np.random.randint(0, df['item_idx'].nunique(), size=(len(pos_df), NUM_NEGATIVES))
for i, row in enumerate(pos_df.itertuples(index=False)):
    user = row.user_idx
    for j in range(NUM_NEGATIVES):
        neg_item = neg_items_array[i, j]
        while (user, neg_item) in user_item_set:
            neg_item = np.random.randint(0, df['item_idx'].nunique())
        train_data_list.append({
            'user_idx': user,
            'item_idx': neg_item,
            'age_idx': row.age_idx,
            'gender_idx': row.gender_idx,
            'location_idx': row.location_idx,
            'rating': 0.0
        })

training_df_final = pd.DataFrame(train_data_list)
print(f"Ukuran data training diperluas: {len(training_df_final)} sampel.")

Ukuran data training diperluas: 5368150 sampel.


In [7]:
# Save negatif train
training_df_final.to_csv("train_negsamp_utower.csv", index=False)

In [None]:
# load negatif train
training_df_final = pd.read_csv("train_negsamp_utower.csv")

In [8]:
def generate_negative_test_set(test_df, all_items, user_item_set, num_negatives=99, seed=42):
    
    np.random.seed(seed)
    all_items = np.array(all_items)
    test_with_neg = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Generating Negatives", mininterval=1.0):
        user = int(row['user_idx'])
        pos_item = int(row['item_idx'])

        # Generate item negatif (yang belum pernah diinteraksi oleh user)
        neg_items = []
        while len(neg_items) < num_negatives:
            candidates = np.random.choice(all_items, size=num_negatives*2, replace=False)
            valid = [i for i in candidates if (user, i) not in user_item_set and i != pos_item]
            neg_items.extend(valid)
        neg_items = neg_items[:num_negatives]

        # Simpan baris
        new_row = row.to_dict()
        new_row['neg_items'] = neg_items
        test_with_neg.append(new_row)

    return pd.DataFrame(test_with_neg)

In [9]:
# Generate negatif test
all_items = df['item_idx'].unique().tolist()
user_item_set = set(zip(train_df['user_idx'], train_df['item_idx']))

test_warm_with_neg = generate_negative_test_set(test_warm_df, all_items, user_item_set, num_negatives=99)

user_cold_start_test_with_neg = generate_negative_test_set(user_cold_start_test_df, all_items, user_item_set, num_negatives=99)

item_cold_start_test_with_neg = generate_negative_test_set(item_cold_start_test_df, all_items, user_item_set, num_negatives=99)

Generating Negatives:   0%|          | 0/268408 [00:00<?, ?it/s]

Generating Negatives:   0%|          | 0/200000 [00:00<?, ?it/s]

Generating Negatives:   0%|          | 0/9962 [00:00<?, ?it/s]

In [10]:
# Save negatif test
with open("test_warm_neg_utower.pkl", "wb") as f:
    pickle.dump(test_warm_with_neg, f)

with open("user_cold_start_test_neg_utower.pkl", "wb") as f:
    pickle.dump(user_cold_start_test_with_neg, f)

with open("item_cold_start_test_neg_utower.pkl", "wb") as f:
    pickle.dump(item_cold_start_test_with_neg, f)

In [None]:
# Load negatif test
with open("test_warm_neg_utower.pkl", "rb") as f:
    test_warm_with_neg = pickle.load(f)

with open("user_cold_start_test_neg_utower.pkl", "rb") as f:
    user_cold_start_test_with_neg = pickle.load(f)

with open("item_cold_start_test_neg_utower.pkl", "rb") as f:
    item_cold_start_test_with_neg = pickle.load(f)

In [11]:
# Dataset Class
class HybridDataset(Dataset):
    def __init__(self, dataframe):
        self.users = torch.tensor(dataframe['user_idx'].values, dtype=torch.long)
        self.items = torch.tensor(dataframe['item_idx'].values, dtype=torch.long)
        self.ages = torch.tensor(dataframe['age_idx'].values, dtype=torch.long)
        self.genders = torch.tensor(dataframe['gender_idx'].values, dtype=torch.long)
        self.locations = torch.tensor(dataframe['location_idx'].values, dtype=torch.long)
        self.labels = torch.tensor(dataframe['rating'].values, dtype=torch.float)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return (self.users[idx], self.items[idx], self.ages[idx], 
                self.genders[idx], self.locations[idx], self.labels[idx])

In [12]:
# Model Two-Tower (User)
class TwoTowerModel(nn.Module):
    def __init__(self, num_users, num_items, num_ages, num_genders, num_locations, embedding_dim=32):
        super(TwoTowerModel, self).__init__()
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.age_embed = nn.Embedding(num_ages, embedding_dim // 2)
        self.gender_embed = nn.Embedding(num_genders, embedding_dim // 4)
        self.location_embed = nn.Embedding(num_locations, embedding_dim // 2)
        
        user_mlp_input_dim = embedding_dim + (embedding_dim // 2) * 2 + (embedding_dim // 4)
        self.user_tower = nn.Sequential(nn.Linear(user_mlp_input_dim, 64), nn.ReLU(), nn.Linear(64, embedding_dim))
        self.item_tower = nn.Embedding(num_items, embedding_dim)

    def forward(self, user, item, age, gender, location):
        user_features = torch.cat([self.user_embed(user), self.age_embed(age), self.gender_embed(gender), 
                                   self.location_embed(location)], dim=-1)
        user_vector = self.user_tower(user_features)
        item_vector = self.item_tower(item)
        prediction = torch.sum(user_vector * item_vector, dim=1)
        return prediction.squeeze()

In [13]:
# Fungsi helper
def hit_ratio_at_k(predictions, true_item_idx, k):
    _, top_k_indices = torch.topk(predictions, k)
    return 1 if true_item_idx in top_k_indices else 0

def ndcg_at_k(predictions, true_item_idx, k):
    _, top_k_indices = torch.topk(predictions, k)
    indices = (top_k_indices == true_item_idx).nonzero(as_tuple=True)[0]
    return 0.0 if indices.numel() == 0 else (1.0 / np.log2(indices.item() + 2))

In [20]:
# Fungsi Evaluasi
def evaluate_model(model, test_data, description):
    if test_data.empty: print(f"\n--- {description} ---\nTidak ada data untuk dievaluasi."); return
    model.eval()
    all_rmse, all_hr_at_10, all_ndcg_at_10 = [], [], []
    print(f"\n--- {description} ---")
    with torch.no_grad():
        test_loader = DataLoader(
            TensorDataset(
                torch.tensor(test_data['user_idx'].values), torch.tensor(test_data['item_idx'].values),
                torch.tensor(test_data['age_idx'].values), torch.tensor(test_data['gender_idx'].values),
                torch.tensor(test_data['location_idx'].values), torch.tensor(test_data['rating'].values, dtype=torch.float)
            ), batch_size=2048, shuffle=False
        )
        for users, items, ages, genders, locations, ratings in tqdm(test_loader, desc="RMSE Calculation", leave=False):
            tensors = [t.to(device) for t in [users, items, ages, genders, locations, ratings]]
            predictions = model(*tensors[:-1])
            all_rmse.extend((predictions - tensors[-1]).pow(2).cpu().numpy().tolist())
        
        pbar = tqdm(test_data.itertuples(), total=len(test_data), desc="HR/NDCG Calculation", mininterval=1.5, leave=False)
        for row in pbar:
            user_idx, item_pos_idx, age_idx, gender_idx, location_idx = (
                row.user_idx, row.item_idx, row.age_idx, row.gender_idx, row.location_idx)
            if user_idx >= n_users or item_pos_idx >= n_items: continue
            
            if 'neg_items' in test_data.columns:
                negative_items = row.neg_items
            else:
                # Generate random jika belum ada
                negative_items = []
                while len(negative_items) < 100:
                    item_neg = np.random.randint(0, n_items)
                    if (user_idx, item_neg) not in user_item_set and item_neg != item_pos_idx:
                        negative_items.append(item_neg)
                       
            test_items = [item_pos_idx] + row.neg_items  # gunakan list dari kolom neg_items
            num_items = len(test_items)  # total item yang akan dievaluasi

            tensors_pred = [
                torch.tensor([val] * num_items, device=device) for val in [user_idx, age_idx, gender_idx, location_idx]
            ]
            tensors_pred.insert(1, torch.tensor(test_items, device=device))
            
            predictions = model(*tensors_pred)
            all_hr_at_10.append(hit_ratio_at_k(predictions, 0, 10))
            all_ndcg_at_10.append(ndcg_at_k(predictions, 0, 10))

    print(f"RMSE: {np.sqrt(np.mean(all_rmse)):.4f}")
    print(f"Hit Ratio @10: {np.mean(all_hr_at_10):.4f}")
    print(f"NDCG @10: {np.mean(all_ndcg_at_10):.4f}")

In [15]:
# Training Model
final_train_dataset = HybridDataset(training_df_final)
train_loader = DataLoader(final_train_dataset, batch_size=4096, shuffle=True, num_workers=0)

model = TwoTowerModel(n_users, n_items, n_ages, n_genders, n_locations).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

start_time = time.time()
for epoch in range(10):
    model.train()
    total_loss = 0
    for users_b, items_b, ages_b, genders_b, locations_b, labels_b in train_loader:
        tensors = [t.to(device) for t in [users_b, items_b, ages_b, genders_b, locations_b, labels_b]]
        users, items, ages, genders, locations, labels = tensors
        
        optimizer.zero_grad()
        predictions = model(users, items, ages, genders, locations)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/10 Selesai | Rata-rata Loss: {avg_loss:.4f}")

print(f"\nTraining selesai dalam {time.time() - start_time:.2f} detik.")

Epoch 1/10 Selesai | Rata-rata Loss: 0.2791
Epoch 2/10 Selesai | Rata-rata Loss: 0.2214
Epoch 3/10 Selesai | Rata-rata Loss: 0.1490
Epoch 4/10 Selesai | Rata-rata Loss: 0.1259
Epoch 5/10 Selesai | Rata-rata Loss: 0.1204
Epoch 6/10 Selesai | Rata-rata Loss: 0.1182
Epoch 7/10 Selesai | Rata-rata Loss: 0.1165
Epoch 8/10 Selesai | Rata-rata Loss: 0.1148
Epoch 9/10 Selesai | Rata-rata Loss: 0.1129
Epoch 10/10 Selesai | Rata-rata Loss: 0.1110

Training selesai dalam 2707.59 detik.


In [21]:
# Evaluasi Akhir
evaluate_model(model, test_warm_with_neg, "Evaluasi Warm Start")
evaluate_model(model, user_cold_start_test_with_neg, "Evaluasi User Cold Start")
evaluate_model(model, item_cold_start_test_with_neg, "Evaluasi Item Cold Start")


--- Evaluasi Warm Start ---


RMSE Calculation:   0%|          | 0/132 [00:00<?, ?it/s]

HR/NDCG Calculation:   0%|          | 0/268408 [00:00<?, ?it/s]

RMSE: 0.6303
Hit Ratio @10: 0.7215
NDCG @10: 0.4519

--- Evaluasi User Cold Start ---


RMSE Calculation:   0%|          | 0/98 [00:00<?, ?it/s]

HR/NDCG Calculation:   0%|          | 0/200000 [00:00<?, ?it/s]

RMSE: 0.7172
Hit Ratio @10: 0.4548
NDCG @10: 0.2636

--- Evaluasi Item Cold Start ---


RMSE Calculation:   0%|          | 0/5 [00:00<?, ?it/s]

HR/NDCG Calculation:   0%|          | 0/9962 [00:00<?, ?it/s]

RMSE: 1.0148
Hit Ratio @10: 0.0032
NDCG @10: 0.0011


In [22]:
# Simpan model yang sudah dilatih
torch.save(model.state_dict(), 'NCF_UserTower.pth')