In [20]:
import gzip
import ast
import json
import random
import numpy as np
from collections import defaultdict, Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import scipy.sparse as sp

In [21]:
# load user-item interactions
data = []
with gzip.open("dataset/australian_users_items.json.gz", "rt", encoding="utf-8") as f:
    for line in tqdm(f, desc="Loading user-item data"):
        obj = ast.literal_eval(line)
        data.append(obj)

print(f"Loaded {len(data)} users")

# load item metadata
item_metadata = {}
with gzip.open("dataset/steam_games.json.gz", "rt", encoding="utf-8") as f:
    for line in tqdm(f, desc="Loading item metadata"):
        obj = ast.literal_eval(line)
        item_id = obj.get('id')
        if item_id:
            item_metadata[obj['id']] = obj
        else:
            print(f"Object {obj} has no id")

print(f"Loaded metadata for {len(item_metadata)} games")

Loading user-item data: 88310it [01:26, 1017.44it/s]
Loading user-item data: 88310it [01:26, 1017.44it/s]


Loaded 88310 users


Loading item metadata: 3848it [00:00, 19327.93it/s]

Object {'url': 'http://store.steampowered.com/', 'price': 19.99, 'discount_price': 14.99, 'early_access': False} has no id


Loading item metadata: 32135it [00:02, 11556.79it/s]

Object {'publisher': 'Warner Bros. Interactive Entertainment, Feral Interactive (Mac)', 'genres': ['Action', 'Adventure'], 'app_name': 'Batman: Arkham City - Game of the Year Edition', 'sentiment': 'Overwhelmingly Positive', 'title': 'Batman: Arkham City - Game of the Year Edition', 'url': 'http://store.steampowered.com/app/200260', 'release_date': '2012-09-07', 'tags': ['Action', 'Open World', 'Batman', 'Adventure', 'Stealth', 'Third Person', 'Superhero', 'Singleplayer', "Beat 'em up", 'Comic Book', 'Detective', 'Fighting', 'Atmospheric', 'Story Rich', 'Action-Adventure', 'Cinematic', 'Controller', '3D Vision', 'Metroidvania', 'Sandbox'], 'metascore': 91, 'price': 19.99, 'early_access': False, 'specs': ['Single-player', 'Steam Achievements', 'Steam Trading Cards', 'Partial Controller Support', 'Steam Cloud'], 'developer': 'Rocksteady Studios,Feral Interactive (Mac)'} has no id
Loaded metadata for 32132 games





In [22]:
# take only subset of user-item interactions
data = data[:(len(data) // 10)]

In [23]:
user_games = defaultdict(list)
game_users = defaultdict(list)

for user_data in tqdm(data, desc="Compiling interactions"):
    user_id = user_data['user_id']
    
    for item in user_data['items']:
        game_id = item['item_id']
        playtime = item['playtime_forever']
        
        if playtime > 0:
            user_games[user_id].append((game_id, playtime)) # user_games maps user_id -> (game_id, playtime)
            game_users[game_id].append((user_id, playtime)) # game_users maps game_id -> (user_id, playtime)

random.seed(42)
np.random.seed(42)

all_users = list(user_games.keys()) # unique list of all user IDs
all_games = list(game_users.keys()) # unique list of all game IDs

user_id_to_idx = {uid: idx for idx, uid in enumerate(all_users)} # user_id_to_idx maps user_id -> unique index
game_id_to_idx = {gid: idx for idx, gid in enumerate(all_games)} # game_id_to_idx maps game_id -> unique index
idx_to_user_id = {idx: uid for uid, idx in user_id_to_idx.items()} # idx_to_user_id maps index -> user_id
idx_to_game_id = {idx: gid for gid, idx in game_id_to_idx.items()} # idx_to_game_id maps index -> game_id

print(f"Users: {len(all_users)}, Games: {len(all_games)}")

Compiling interactions: 100%|██████████| 8831/8831 [00:00<00:00, 10547.79it/s]

Users: 7679, Games: 8255





In [24]:
print("\n=== Building Game Features ===")

# Prepare text data for TF-IDF
game_genre_text = {}
game_tag_text = {}
game_spec_text = {}

for game_idx in tqdm(range(len(all_games)), desc="Preparing game text features"):
    game_id = idx_to_game_id[game_idx]
    
    if game_id in item_metadata:
        meta = item_metadata[game_id]
        
        # Join genres, tags, specs as space-separated strings
        game_genre_text[game_idx] = ' '.join(meta.get('genres', [])) # game_genre_text maps game index -> string of the game genres
        game_tag_text[game_idx] = ' '.join(meta.get('tags', [])) # game_tag_text maps game index -> string of game tags
        game_spec_text[game_idx] = ' '.join(meta.get('specs', [])) # game_spec_text maps game spec -> string of game specs
    else:
        game_genre_text[game_idx] = ''
        game_tag_text[game_idx] = ''
        game_spec_text[game_idx] = ''

# Create TF-IDF vectors for genres and tags
genre_vectorizer = TfidfVectorizer(max_features=50)
tag_vectorizer = TfidfVectorizer(max_features=100)
spec_vectorizer = TfidfVectorizer(max_features=20)

genre_corpus = [game_genre_text[i] for i in range(len(all_games))] # genre_corpus is list of all game genre strings
tag_corpus = [game_tag_text[i] for i in range(len(all_games))] # tag_corpus is a list of all game tag strings
spec_corpus = [game_spec_text[i] for i in range(len(all_games))] # spec_corpous is a list of all game spec strings

game_genre_features = genre_vectorizer.fit_transform(genre_corpus) # shape num_games x 50 (each col is the tf-idf score for that genre for the game)
game_tag_features = tag_vectorizer.fit_transform(tag_corpus)
game_spec_features = spec_vectorizer.fit_transform(spec_corpus)

# Additional game features (numeric)
game_numeric_features = np.zeros((len(all_games), 4))

for game_idx in tqdm(range(len(all_games)), desc="Building game numeric features"):
    game_id = idx_to_game_id[game_idx]
    
    if game_id in item_metadata:
        meta = item_metadata[game_id]
        
        # Price (handle missing values)
        price = meta.get('price', 0)
        if isinstance(price, str):
            price = 0
        game_numeric_features[game_idx, 0] = price if price else 0 
        
        discount = meta.get('discount_price', 0)
        early_access = meta.get('early_access', False)
        game_numeric_features[game_idx, 1] = discount if discount else 0
        game_numeric_features[game_idx, 2] = 1 if early_access else 0
        game_numeric_features[game_idx, 3] = len(game_users.get(game_id, []))

# Normalize numeric features
scaler_game = StandardScaler()
game_numeric_features = scaler_game.fit_transform(game_numeric_features)
game_numeric_features_sparse = sp.csr_matrix(game_numeric_features) # shape n_games x 4 (price, discount, early access, popularity)

# Combine all game features
game_features = sp.hstack([
    game_genre_features,
    game_tag_features,
    game_spec_features,
    game_numeric_features_sparse
]) # shape n_games x 174 (50 + 100 + 20 + 4)

print(f"Game feature matrix shape: {game_features.shape}")

# --- User-Game Preference Features ---
print("\n=== Building User Preference Features ===")

# Calculate user's genre preferences based on played games
user_genre_preferences = np.zeros((len(all_users), game_genre_features.shape[1]))
user_tag_preferences = np.zeros((len(all_users), game_tag_features.shape[1]))

for user_idx in tqdm(range(len(all_users)), desc="Building user preferences"):
    user_id = idx_to_user_id[user_idx]
    user_played_games = user_games.get(user_id, [])
    
    if user_played_games:
        for game_id, playtime in user_played_games:
            if game_id in game_id_to_idx:
                game_idx = game_id_to_idx[game_id]
                
                # Weight by playtime (normalized)
                weight = np.log1p(playtime)
                
                # Accumulate genre and tag preferences
                user_genre_preferences[user_idx] += game_genre_features[game_idx].toarray()[0] * weight
                user_tag_preferences[user_idx] += game_tag_features[game_idx].toarray()[0] * weight
        
        # Normalize
        if np.sum(user_genre_preferences[user_idx]) > 0:
            user_genre_preferences[user_idx] /= np.sum(user_genre_preferences[user_idx])
        if np.sum(user_tag_preferences[user_idx]) > 0:
            user_tag_preferences[user_idx] /= np.sum(user_tag_preferences[user_idx])

user_genre_preferences_sparse = sp.csr_matrix(user_genre_preferences)
user_tag_preferences_sparse = sp.csr_matrix(user_tag_preferences)


=== Building Game Features ===


Preparing game text features: 100%|██████████| 8255/8255 [00:00<00:00, 492685.69it/s]
Preparing game text features: 100%|██████████| 8255/8255 [00:00<00:00, 492685.69it/s]
Building game numeric features: 100%|██████████| 8255/8255 [00:00<00:00, 701486.68it/s]
Building game numeric features: 100%|██████████| 8255/8255 [00:00<00:00, 701486.68it/s]


Game feature matrix shape: (8255, 156)

=== Building User Preference Features ===


Building user preferences: 100%|██████████| 7679/7679 [00:21<00:00, 351.96it/s]



In [25]:
# Training Data

positive_interactions = []
user_positive_games = defaultdict(set)

for user_id, games in tqdm(user_games.items(), desc="Collecting Positive Samples"):
    user_idx = user_id_to_idx[user_id]
    for game_id, playtime in games:
        game_idx = game_id_to_idx[game_id]
        positive_interactions.append((user_idx, game_idx, 1, playtime))
        user_positive_games[user_idx].add(game_idx)

print(f"Total positive interactions: {len(positive_interactions)}")

# Split positive interactions
random.shuffle(positive_interactions)
split_idx = int(0.8 * len(positive_interactions))
train_positive = positive_interactions[:split_idx]
test_positive = positive_interactions[split_idx:]

# Generate negative samples
def generate_negative_samples(positive_samples, user_positive_games, num_games, neg_ratio=1):
    negative_samples = []
    for user_idx, game_idx, _, _ in tqdm(positive_samples, desc="Generating negative samples"):
        for _ in range(neg_ratio):
            neg_game_idx = random.randint(0, num_games - 1)
            while neg_game_idx in user_positive_games[user_idx]:
                neg_game_idx = random.randint(0, num_games - 1)
            negative_samples.append((user_idx, neg_game_idx, 0, 0))
    return negative_samples

train_negative = generate_negative_samples(train_positive, user_positive_games, len(all_games), neg_ratio=2)
test_negative = generate_negative_samples(test_positive, user_positive_games, len(all_games), neg_ratio=2)

# Combine positive and negative samples
train_data = train_positive + train_negative
test_data = test_positive + test_negative

random.shuffle(train_data)
random.shuffle(test_data)

print(f"Train samples: {len(train_data)}, Test samples: {len(test_data)}")

Collecting Positive Samples: 100%|██████████| 7679/7679 [00:00<00:00, 12766.71it/s]



Total positive interactions: 574041


Generating negative samples: 100%|██████████| 459232/459232 [00:01<00:00, 345520.19it/s]
Generating negative samples: 100%|██████████| 459232/459232 [00:01<00:00, 345520.19it/s]
Generating negative samples: 100%|██████████| 114809/114809 [00:00<00:00, 315442.95it/s]



Train samples: 1377696, Test samples: 344427


In [26]:
def build_interaction_features(interactions):
    """Build feature matrix for user-game interactions"""
    features_list = []
    labels = []
    
    for user_idx, game_idx, label, _ in tqdm(interactions, desc="Building interaction features"):
        # Game features
        game_feat = game_features[game_idx].toarray()[0]
        
        # User preference features (genre and tag match)
        user_genre_pref = user_genre_preferences_sparse[user_idx].toarray()[0]
        user_tag_pref = user_tag_preferences_sparse[user_idx].toarray()[0]
        
        game_genre_feat = game_genre_features[game_idx].toarray()[0]
        game_tag_feat = game_tag_features[game_idx].toarray()[0]
        
        # Compute preference matching scores (dot product)
        genre_match = np.dot(user_genre_pref, game_genre_feat)
        tag_match = np.dot(user_tag_pref, game_tag_feat)
        
        # Combine all features
        combined_features = np.concatenate([
            game_feat,
            [genre_match, tag_match]
        ])
        
        features_list.append(combined_features)
        labels.append(label)
    
    return np.array(features_list), np.array(labels)

print("\n=== Building Training Features ===")
X_train, y_train = build_interaction_features(train_data)

print("\n=== Building Test Features ===")
X_test, y_test = build_interaction_features(test_data)

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")


=== Building Training Features ===


Building interaction features: 100%|██████████| 1377696/1377696 [01:49<00:00, 12587.07it/s]
Building interaction features: 100%|██████████| 1377696/1377696 [01:49<00:00, 12587.07it/s]



=== Building Test Features ===


Building interaction features: 100%|██████████| 344427/344427 [00:27<00:00, 12608.59it/s]

X_train shape: (1377696, 158), X_test shape: (344427, 158)





In [27]:
print("\n=== Training Logistic Regression ===")
model = LogisticRegression(max_iter=1500, random_state=42, verbose=1)
model.fit(X_train, y_train)


=== Training Logistic Regression ===


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.5s finished


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1500


In [28]:
print("\n=== Evaluation ===")
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

y_train_proba = model.predict_proba(X_train)[:, 1]
y_test_proba = model.predict_proba(X_test)[:, 1]

print("\nTrain Metrics:")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred):.4f}")
print(f"F1: {f1_score(y_train, y_train_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_train, y_train_proba):.4f}")

print("\nTest Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")
print(f"F1: {f1_score(y_test, y_test_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_test_proba):.4f}")



=== Evaluation ===

Train Metrics:
Accuracy: 0.8459
Precision: 0.8552
Recall: 0.6473
F1: 0.7369

Train Metrics:
Accuracy: 0.8459
Precision: 0.8552
Recall: 0.6473
F1: 0.7369
ROC-AUC: 0.9068

Test Metrics:
Accuracy: 0.8470
Precision: 0.8565
Recall: 0.6497
F1: 0.7389
ROC-AUC: 0.9073
ROC-AUC: 0.9068

Test Metrics:
Accuracy: 0.8470
Precision: 0.8565
Recall: 0.6497
F1: 0.7389
ROC-AUC: 0.9073


In [29]:
print("\n=== Top 20 Most Important Features ===")
feature_names = (
    [f'user_feat_{i}' for i in range(6)] +
    [f'game_feat_{i}' for i in range(game_features.shape[1])] +
    ['genre_match', 'tag_match']
)

feature_importance = np.abs(model.coef_[0])
top_indices = np.argsort(feature_importance)[-20:][::-1]

for idx in top_indices:
    print(f"{feature_names[idx]}: {feature_importance[idx]:.4f}")


=== Top 20 Most Important Features ===
game_feat_151: 67.5216
game_feat_150: 6.0401
game_feat_71: 2.6579
game_feat_67: 2.5478
game_feat_124: 2.3784
game_feat_121: 2.2118
game_feat_75: 2.1830
user_feat_5: 2.1606
user_feat_1: 2.0076
game_feat_78: 2.0006
game_feat_39: 2.0006
game_feat_28: 1.8367
game_feat_18: 1.7265
game_feat_22: 1.7265
game_feat_99: 1.6754
game_feat_4: 1.6382
game_feat_110: 1.5969
game_feat_29: 1.5132
game_feat_137: 1.5054
game_feat_105: 1.5037


In [34]:
print("\n=== Popularity Baseline ===")

# Calculate game popularity (already have this)
game_popularity = {}
for game_idx in range(len(all_games)):
    game_id = idx_to_game_id[game_idx]
    game_popularity[game_idx] = len(game_users.get(game_id, []))

# Create a popularity-based predictor
def predict_popularity_baseline(interactions, threshold_percentile=50):
    """
    Predict based on game popularity.
    If game is in top threshold_percentile%, predict 1, else 0
    """
    predictions = []
    threshold = np.percentile(list(game_popularity.values()), threshold_percentile)
    
    for user_idx, game_idx, label, _ in interactions:
        popularity = game_popularity.get(game_idx, 0)
        pred = 1 if popularity >= threshold else 0
        predictions.append(pred)
    
    return np.array(predictions)

# Get predictions
y_train_baseline = predict_popularity_baseline(train_data, threshold_percentile=67)
y_test_baseline = predict_popularity_baseline(test_data, threshold_percentile=67)

# Evaluate baseline
print("\nPopularity Baseline - Train Metrics:")
print(f"Accuracy: {accuracy_score(y_train, y_train_baseline):.4f}")
print(f"Precision: {precision_score(y_train, y_train_baseline):.4f}")
print(f"Recall: {recall_score(y_train, y_train_baseline):.4f}")
print(f"F1: {f1_score(y_train, y_train_baseline):.4f}")

print("\nPopularity Baseline - Test Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_test_baseline):.4f}")
print(f"Precision: {precision_score(y_test, y_test_baseline):.4f}")
print(f"Recall: {recall_score(y_test, y_test_baseline):.4f}")
print(f"F1: {f1_score(y_test, y_test_baseline):.4f}")


=== Popularity Baseline ===

Popularity Baseline - Train Metrics:
Accuracy: 0.7659
Precision: 0.5936
Recall: 0.9437
F1: 0.7288

Popularity Baseline - Test Metrics:
Accuracy: 0.7660

Popularity Baseline - Train Metrics:
Accuracy: 0.7659
Precision: 0.5936
Recall: 0.9437
F1: 0.7288

Popularity Baseline - Test Metrics:
Accuracy: 0.7660
Precision: 0.5936
Recall: 0.9444
F1: 0.7290
Precision: 0.5936
Recall: 0.9444
F1: 0.7290
