In [12]:
import gzip
import ast
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import random
from sklearn.metrics import roc_auc_score, accuracy_score

In [8]:
# Load the data
data = []
with gzip.open("dataset/australian_users_items.json.gz", "rt", encoding="utf-8") as f:
    for line in tqdm(f, desc="Loading data"):
        obj = ast.literal_eval(line)
        data.append(obj)

print(f"Loaded {len(data)} users")

# Compile interactions
user_games = defaultdict(list)  # user_id -> list of (game_id, playtime)
game_users = defaultdict(list)  # game_id -> list of (user_id, playtime)

for user_data in tqdm(data, desc="Compiling interactions"):
    user_id = user_data['user_id']
    
    for item in user_data['items']:
        game_id = item['item_id']
        playtime = item['playtime_forever']
        
        # Only include games that have been played (playtime > 0)
        if playtime > 0:
            user_games[user_id].append((game_id, playtime))
            game_users[game_id].append((user_id, playtime))

Loading data: 0it [00:00, ?it/s]

Loading data: 88310it [01:23, 1051.36it/s]


Loaded 88310 users


Compiling interactions: 100%|██████████| 88310/88310 [00:18<00:00, 4846.35it/s]


In [9]:
random.seed(42)
np.random.seed(42)

all_users = list(user_games.keys())
all_games = list(game_users.keys())

user_id_to_idx = {uid: idx for idx, uid in enumerate(all_users)}
game_id_to_idx = {gid: idx for idx, gid in enumerate(all_games)}
idx_to_user_id = {idx: uid for uid, idx in user_id_to_idx.items()}
idx_to_game_id = {idx: gid for gid, idx in game_id_to_idx.items()}

print(f"Users: {len(all_users)}, Games: {len(all_games)}")

# collect all positive interactions
positive_interactions = []
user_positive_games = defaultdict(set)

for user_id, games in tqdm(user_games.items(), desc="Collecting Positive Samples"):
    user_idx = user_id_to_idx[user_id]
    for game_id, playtime in games:
        game_idx = game_id_to_idx[game_id]
        positive_interactions.append((user_idx, game_idx, 1, playtime)) # label = 1
        user_positive_games[user_idx].add(game_idx)

print(f"Total positive interactions: {len(positive_interactions)}")

# split positive interactions into train/test 
random.shuffle(positive_interactions)
split_idx = int(0.8 * len(positive_interactions))
train_positive = positive_interactions[:split_idx]
test_positive = positive_interactions[split_idx:]

print(f"Train positive: {len(train_positive)}, Test positive: {len(test_positive)}")

Users: 68403, Games: 10050


Collecting Positive Samples: 100%|██████████| 68403/68403 [00:13<00:00, 5104.70it/s]


Total positive interactions: 3285246
Train positive: 2628196, Test positive: 657050


In [10]:
# generate negative samples
def generate_negative_samples(positive_samples, user_positive_games, num_games, neg_ratio=1):
    negative_samples = []

    for user_idx, game_idx, _, _ in tqdm(positive_samples, desc="Generating negative samples"):
        for _ in range(neg_ratio):
            neg_game_idx = random.randint(0, num_games - 1)
            while neg_game_idx in user_positive_games[user_idx]:
                neg_game_idx = random.randint(0, num_games - 1)
            
            negative_samples.append((user_idx, neg_game_idx, 0, 0))
    return negative_samples

In [11]:
train_negative = generate_negative_samples(train_positive, user_positive_games, len(all_games), neg_ratio=1)
test_negative = generate_negative_samples(test_positive, user_positive_games, len(all_games), neg_ratio=1)

print(f"Train negative: {len(train_negative)}, Test negative: {len(test_negative)}")

train_data = train_positive + train_negative
test_data = test_positive + test_negative

random.shuffle(train_data)
random.shuffle(test_data)

print(f"\nFinal dataset sizes:")
print(f"Train: {len(train_data)} ({len(train_positive)} pos, {len(train_negative)} neg)")
print(f"Test: {len(test_data)} ({len(test_positive)} pos, {len(test_negative)} neg)")

# Convert to arrays for easy use
train_users = np.array([x[0] for x in train_data])
train_games = np.array([x[1] for x in train_data])
train_labels = np.array([x[2] for x in train_data])
train_playtimes = np.array([x[3] for x in train_data])

test_users = np.array([x[0] for x in test_data])
test_games = np.array([x[1] for x in test_data])
test_labels = np.array([x[2] for x in test_data])
test_playtimes = np.array([x[3] for x in test_data])

print(f"\nTrain labels distribution: {np.bincount(train_labels)}")
print(f"Test labels distribution: {np.bincount(test_labels)}")

Generating negative samples: 100%|██████████| 2628196/2628196 [00:32<00:00, 81797.52it/s] 
Generating negative samples: 100%|██████████| 657050/657050 [00:11<00:00, 56072.04it/s] 


Train negative: 2628196, Test negative: 657050

Final dataset sizes:
Train: 5256392 (2628196 pos, 2628196 neg)
Test: 1314100 (657050 pos, 657050 neg)

Train labels distribution: [2628196 2628196]
Test labels distribution: [657050 657050]


# Random Prediction

In [None]:
# Random predictions
random_preds = np.random.rand(len(test_labels))
random_auc = roc_auc_score(test_labels, random_preds)
print(f"Random AUC: {random_auc:.4f}")

binary_preds = (random_preds > 0.5).astype(int)
accuracy = (binary_preds == test_labels).mean()
print(f"Random Accuracy: {accuracy:.4f}")

Random Accuracy: 0.4994
Random AUC: 0.4996


# Popularity Baseline

In [19]:
# Count how many users played each game
game_popularity = {}
for game_idx in range(len(all_games)):
    game_popularity[game_idx] = len(game_users[idx_to_game_id[game_idx]])

# Predict based on game popularity AUC
pop_preds = np.array([game_popularity[game_idx] for game_idx in test_games])
pop_auc = roc_auc_score(test_labels, pop_preds)
print(f"Popularity AUC: {pop_auc:.4f}")

# Accuracy
threshold = np.median(pop_preds)
pop_preds_binary = (pop_preds >= threshold).astype(int)
accuracy = (pop_preds_binary == test_labels).mean()
print(f"Popularity Accuracy: {accuracy:.4f}")


Popularity AUC: 0.9454
Popularity Accuracy: 0.8749
