In [None]:
import gzip
import ast
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import random

In [None]:
# Load the data
# Every line of the gzipped dump is parsed with ast.literal_eval into one user record.
# NOTE(review): presumably the records are Python literals rather than strict JSON,
# hence literal_eval instead of json.loads -- confirm against the dataset.
data = []
with gzip.open("dataset/australian_users_items.json.gz", "rt", encoding="utf-8") as f:
    data.extend(ast.literal_eval(line) for line in tqdm(f, desc="Loading data"))

print(f"Loaded {len(data)} users")

# Compile interactions
# Two mirrored indexes over the same (user, game, playtime) facts.
user_games = defaultdict(list)  # user_id -> [(game_id, playtime), ...]
game_users = defaultdict(list)  # game_id -> [(user_id, playtime), ...]

for user_data in tqdm(data, desc="Compiling interactions"):
    user_id = user_data['user_id']
    owned = ((item['item_id'], item['playtime_forever']) for item in user_data['items'])

    for game_id, playtime in owned:
        # Games with no recorded playtime are not counted as interactions
        if not playtime > 0:
            continue
        user_games[user_id].append((game_id, playtime))
        game_users[game_id].append((user_id, playtime))

# Interaction totals, counted once from each side of the index
interactions_by_user = sum(map(len, user_games.values()))
interactions_by_game = sum(map(len, game_users.values()))

print(f"\nStatistics:")
print(f"Total users with playtime > 0: {len(user_games)}")
print(f"Total unique games played: {len(game_users)}")
print(f"Total interactions (user-game pairs): {interactions_by_user}")
print(f"Average games per user: {interactions_by_user / len(user_games):.2f}")
print(f"Average users per game: {interactions_by_game / len(game_users):.2f}")

In [None]:
random.seed(42)
np.random.seed(42)

# Freeze an ordering of users and games; a position in these lists is the
# integer index used everywhere downstream.
all_users = list(user_games)
all_games = list(game_users)

user_id_to_idx = {uid: pos for pos, uid in enumerate(all_users)}
game_id_to_idx = {gid: pos for pos, gid in enumerate(all_games)}
idx_to_user_id = dict(enumerate(all_users))
idx_to_game_id = dict(enumerate(all_games))

print(f"Users: {len(all_users)}, Games: {len(all_games)}")

# collect all positive interactions
positive_interactions = []                # (user_idx, game_idx, label=1, playtime)
user_positive_games = defaultdict(set)    # user_idx -> {game_idx, ...}

for user_id, games in tqdm(user_games.items(), desc="Collecting Positive Samples"):
    user_idx = user_id_to_idx[user_id]
    indexed = [(game_id_to_idx[game_id], playtime) for game_id, playtime in games]

    positive_interactions.extend(
        (user_idx, game_idx, 1, playtime) for game_idx, playtime in indexed
    )
    user_positive_games[user_idx].update(game_idx for game_idx, _ in indexed)

print(f"Total positive interactions: {len(positive_interactions)}")

# split positive interactions into train/test
# Shuffle in place, then cut at the 80% mark: head is train, tail is test.
random.shuffle(positive_interactions)
split_idx = int(0.8 * len(positive_interactions))
train_positive, test_positive = (
    positive_interactions[:split_idx],
    positive_interactions[split_idx:],
)

print(f"Train positive: {len(train_positive)}, Test positive: {len(test_positive)}")

In [None]:
# generate negative samples
def generate_negative_samples(positive_samples, user_positive_games, num_games, neg_ratio=1):
    """Draw random non-interacted games as negative examples.

    For each ``(user_idx, game_idx, label, playtime)`` tuple in
    ``positive_samples``, ``neg_ratio`` game indices are drawn uniformly from
    ``range(num_games)`` with ``random.randint``, rejecting any index found in
    that user's entry of ``user_positive_games``.

    Args:
        positive_samples: iterable of 4-tuples whose first element is the
            user index; the remaining elements are ignored.
        user_positive_games: mapping ``user_idx -> set of game indices`` to
            exclude. Users missing from the mapping are treated as having no
            positives, and the mapping is never modified.
        num_games: size of the game index space.
        neg_ratio: number of negatives to draw per positive sample.

    Returns:
        list of ``(user_idx, neg_game_idx, 0, 0)`` tuples. Users for whom no
        index in ``range(num_games)`` is free contribute no negatives (a
        ``RuntimeWarning`` reports how many positives were skipped), since
        rejection sampling could never terminate for them.
    """
    import warnings

    negative_samples = []
    can_sample = {}  # user_idx -> True when at least one game index is free
    skipped_positives = 0

    for user_idx, _, _, _ in tqdm(positive_samples, desc="Generating negative samples"):
        # .get avoids both KeyError on plain dicts and silent key insertion
        # on defaultdicts.
        played = user_positive_games.get(user_idx, ())

        if user_idx not in can_sample:
            # Only indices inside the sampling range can block a draw
            blocked = sum(1 for g in played if 0 <= g < num_games)
            can_sample[user_idx] = blocked < num_games

        if not can_sample[user_idx]:
            skipped_positives += 1
            continue

        for _ in range(neg_ratio):
            neg_game_idx = random.randint(0, num_games - 1)
            while neg_game_idx in played:
                neg_game_idx = random.randint(0, num_games - 1)

            negative_samples.append((user_idx, neg_game_idx, 0, 0))

    if skipped_positives:
        warnings.warn(
            f"No negative candidates for {skipped_positives} positive sample(s): "
            f"the user has interacted with all {num_games} games",
            RuntimeWarning,
            stacklevel=2,
        )
    return negative_samples

In [None]:
# One negative per positive, drawn for the train split first and then the test split
train_negative, test_negative = (
    generate_negative_samples(split_positive, user_positive_games, len(all_games), neg_ratio=1)
    for split_positive in (train_positive, test_positive)
)

print(f"Train negative: {len(train_negative)}, Test negative: {len(test_negative)}")

train_data = [*train_positive, *train_negative]
test_data = [*test_positive, *test_negative]

# Interleave positives and negatives within each split
for split_rows in (train_data, test_data):
    random.shuffle(split_rows)

print(f"\nFinal dataset sizes:")
print(f"Train: {len(train_data)} ({len(train_positive)} pos, {len(train_negative)} neg)")
print(f"Test: {len(test_data)} ({len(test_positive)} pos, {len(test_negative)} neg)")


def _as_columns(rows):
    """Split (user, game, label, playtime) rows into four parallel NumPy arrays."""
    return tuple(np.array([row[col] for row in rows]) for col in range(4))


# Convert to arrays for easy use
train_users, train_games, train_labels, train_playtimes = _as_columns(train_data)
test_users, test_games, test_labels, test_playtimes = _as_columns(test_data)

print(f"\nTrain labels distribution: {np.bincount(train_labels)}")
print(f"Test labels distribution: {np.bincount(test_labels)}")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np

In [None]:
# Hyperparameters
EMBEDDING_DIM = 64      # width of each user / game embedding vector
LEARNING_RATE = 1e-3    # step size handed to the optimizer
BATCH_SIZE = 8_192      # samples per DataLoader batch
EPOCHS = 10             # full passes over the training set

# Dataset class
class GameDataset(Dataset):
    """Tensor-backed dataset of (user index, game index, label) triples."""

    def __init__(self, users, games, labels):
        # Indices become int64 tensors (what nn.Embedding looks up with);
        # labels become float32 tensors.
        self.users, self.games = (torch.LongTensor(column) for column in (users, games))
        self.labels = torch.FloatTensor(labels)

    def __len__(self):
        # One sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Same position in all three parallel tensors
        return tuple(column[idx] for column in (self.users, self.games, self.labels))

# Matrix Factorization Model
class MatrixFactorization(nn.Module):
    """Biased matrix-factorization scorer for (user, game) pairs.

    The score is ``<user_emb, game_emb> + user_bias + game_bias + global_bias``
    and is returned as a raw logit (no sigmoid is applied here).

    Args:
        num_users: number of rows in the user embedding / bias tables.
        num_games: number of rows in the game embedding / bias tables.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self, num_users, num_games, embedding_dim=64):
        super().__init__()

        # User and game embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.game_embedding = nn.Embedding(num_games, embedding_dim)

        # Bias terms
        self.user_bias = nn.Embedding(num_users, 1)
        self.game_bias = nn.Embedding(num_games, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        # Initialize embeddings: small random factors, all biases start at zero
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.game_embedding.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.game_bias.weight)

    def forward(self, user_ids, game_ids):
        """Score each (user, game) pair.

        Args:
            user_ids: integer tensor of user indices.
            game_ids: integer tensor of game indices, same shape as ``user_ids``.

        Returns:
            Tensor of logits with the same shape as ``user_ids``.
        """
        # Get embeddings
        user_emb = self.user_embedding(user_ids)
        game_emb = self.game_embedding(game_ids)

        # Dot product over the embedding axis (always the last one)
        dot_product = (user_emb * game_emb).sum(dim=-1, keepdim=True)

        # Add biases
        user_b = self.user_bias(user_ids)
        game_b = self.game_bias(game_ids)

        # Final prediction
        prediction = dot_product + user_b + game_b + self.global_bias

        # Drop only the trailing singleton axis. A bare squeeze() would also
        # drop the batch axis when the batch holds a single sample, producing
        # a 0-d tensor that no longer matches the label tensor's shape.
        return prediction.squeeze(-1)

# Seed PyTorch's global RNG so embedding initialisation and the training
# loader's shuffling repeat across fresh runs; the random / NumPy seeds set
# earlier in the notebook do not cover torch.
TORCH_SEED = 42
torch.manual_seed(TORCH_SEED)

# L2 penalty passed to Adam
WEIGHT_DECAY = 1e-5

# Create datasets and dataloaders
train_dataset = GameDataset(train_users, train_games, train_labels)
test_dataset = GameDataset(test_users, test_games, test_labels)

# Training batches are reshuffled on every pass; test order is kept fixed
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize model
# NOTE(review): MPS auto-selection is left disabled below; training is pinned to CPU.
# device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
device = torch.device('cpu')
print(f"Using device: {device}")

model = MatrixFactorization(
    num_users=len(all_users),
    num_games=len(all_games),
    embedding_dim=EMBEDDING_DIM
).to(device)

# Loss and optimizer
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

print(f"\nModel: {sum(p.numel() for p in model.parameters())} parameters")
print(f"Training on {len(train_dataset)} samples, testing on {len(test_dataset)} samples")

# Training function
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        model: module called as ``model(users, games)``.
        loader: iterable of ``(users, games, labels)`` tensor batches.
        criterion: loss called as ``criterion(predictions, labels)``. It is
            assumed to return the per-batch *mean* (the default reduction of
            the loss used in this notebook).
        optimizer: optimizer stepping ``model``'s parameters.
        device: device the batch tensors are moved to.

    Returns:
        float: loss averaged over samples, so a short final batch is not
        over-weighted. ``nan`` when ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    sample_count = 0

    for users, games, labels in tqdm(loader, desc="Training"):
        users, games, labels = users.to(device), games.to(device), labels.to(device)

        optimizer.zero_grad()
        predictions = model(users, games)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        # Re-weight the batch mean by batch size to recover a per-sample average
        batch_size = labels.shape[0]
        loss_sum += loss.item() * batch_size
        sample_count += batch_size

    if sample_count == 0:
        # Nothing was trained on; an average is undefined rather than zero
        return float("nan")
    return loss_sum / sample_count

# Evaluation function
def evaluate(model, loader, device):
    """Score ``model`` on every batch of ``loader`` and return ``(auc, acc)``.

    Args:
        model: module called as ``model(users, games)`` and returning logits.
        loader: iterable of ``(users, games, labels)`` tensor batches.
        device: device the batch tensors are moved to.

    Returns:
        tuple ``(auc, acc)``: ROC AUC of the sigmoid probabilities, and
        accuracy of thresholding those probabilities at 0.5.
    """
    model.eval()
    prob_chunks = []
    label_chunks = []

    with torch.no_grad():
        for users, games, labels in loader:
            users, games, labels = users.to(device), games.to(device), labels.to(device)
            predictions = model(users, games)

            # Apply sigmoid for probabilities
            probs = torch.sigmoid(predictions)

            # Flatten so a 0-d output (a batch of one squeezed to a scalar)
            # still contributes one element instead of failing on iteration.
            prob_chunks.append(probs.reshape(-1).cpu().numpy())
            label_chunks.append(labels.reshape(-1).cpu().numpy())

    # One concatenation instead of growing a Python list element by element
    all_preds = np.concatenate(prob_chunks) if prob_chunks else np.empty(0)
    all_labels = np.concatenate(label_chunks) if label_chunks else np.empty(0)

    auc = roc_auc_score(all_labels, all_preds)
    acc = accuracy_score(all_labels, all_preds > 0.5)

    return auc, acc

# Training loop
print("\nStarting training...")
for epoch in range(EPOCHS):
    # Fit for one pass, then score the train split followed by the test split
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    (train_auc, train_acc), (test_auc, test_acc) = (
        evaluate(model, split_loader, device)
        for split_loader in (train_loader, test_loader)
    )

    print(
        f"Epoch {epoch+1}/{EPOCHS}",
        f"  Train Loss: {train_loss:.4f}, AUC: {train_auc:.4f}, Acc: {train_acc:.4f}",
        f"  Test  AUC: {test_auc:.4f}, Acc: {test_acc:.4f}",
        sep="\n",
    )