In [208]:
import pandas as pd
import numpy as np
import faiss
import dask.dataframe as dd
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.neighbors import NearestNeighbors

In [209]:
reviews_df = pd.read_json("../../data/raw/reviews.json", lines=True)
reviews_df.head()

Unnamed: 0,username,hours,products,product_id,page_order,date,text,early_access,page,found_funny,compensation,user_id
0,The A S C E N D E D man,4.3,18.0,10,7,2017-12-31,For an old game it is still very relevant and ...,False,16,,,7.65612e+16
1,jej,0.1,20.0,10,1,2017-12-26,Don't play this game not even good,False,35,1.0,,
2,🎉2018 🎊,2.1,6.0,10,5,2017-12-27,Awesome!!,False,32,,Product received for free,7.65612e+16
3,MaG1k,40.2,26.0,10,4,2017-12-29,Exelente juego!.,False,24,,Product received for free,
4,raymd.,64.3,15.0,10,5,2018-01-01,7/10...,False,16,,,


In [210]:
items_df = pd.read_parquet("../../data/processed/reviews_item_cleaned.parquet")
items_df.head()

Unnamed: 0,genres,Game,Game_ID,sentiment,num_genres
0,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,761140,4.0,5
1,"[Free to Play, Indie, RPG, Strategy]",Ironbound,643980,4.5,4
2,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,670290,4.5,5
3,"[Action, Adventure, Casual]",弹炸人2222,767400,4.0,3
4,"[Action, Adventure, Simulation]",Battle Royale Trainer,772540,3.0,3


In [211]:
# Step 1: Efficient user & product ID mapping
reviews_df['Game_ID'] = reviews_df['product_id']
reviews_df['uid'] = reviews_df['username'].astype('category').cat.codes
reviews_df['product_id'] = reviews_df['product_id'].astype('category').cat.codes
reviews_df['hours'] = reviews_df['hours'].fillna(0)

# Create mappings between Game_ID and matrix indices
# First ensure product_id is categorical
reviews_df['product_id'] = reviews_df['product_id'].astype('category')

reviews_df['count_games'] = reviews_df.groupby('uid')['Game_ID'].transform('nunique')

In [212]:
# Remove null bytes in the entire DataFrame
# items_df = items_df.astype(str).apply(lambda x: x.str.replace("\x00", "", regex=True))

items_df['genres'] = items_df['genres'].astype(str).str.replace("\x00", "", regex=True)

# Step 4: Genre-Based Recommendation for Missing Games
items_df['genres'] = items_df['genres'].apply(eval)  # Convert string to list

In [213]:

def split_train_test(user_df):
    # Step 1: Get users with count_games >= 6 (only unique user_ids)
    eligible_users = user_df[user_df['count_games'] >= 6]['uid'].unique()
    
    # Step 2: Select 1% of these users randomly
    num_users_to_pick = max(1, int(len(eligible_users) * 0.05))  # Ensure at least 1 user is picked
    selected_users = np.random.choice(eligible_users, size=num_users_to_pick, replace=False)

    test_set = []
    # Step 3: For each selected user, pick 1 to 3 random entries
    for user_id in selected_users:
        user_entries = user_df[user_df['uid'] == user_id]
        num_test_entries = min(len(user_entries), np.random.randint(1, 4))  # Min 1, Max 3
        test_rows = user_entries.sample(n=num_test_entries, random_state=42)
        test_set.append(test_rows)

    # Step 4: Create test_df
    test_df = pd.concat(test_set) if test_set else pd.DataFrame(columns=user_df.columns)

    # Step 5: Create train_df (original user_df minus test_df entries)
    train_df = user_df[~user_df.index.isin(test_df.index)]
    


    return train_df, test_df

# Apply function
reviews_train_df, reviews_test_df = split_train_test(reviews_df)

In [214]:
# Step 1: Create proper mappings for users and items
# Get unique users and items
unique_users = reviews_train_df['uid'].unique()
unique_items = reviews_train_df['product_id'].unique()

# Create mappings
user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
item_to_idx = {item: idx for idx, item in enumerate(unique_items)}
idx_to_item = {idx: item for item, idx in item_to_idx.items()}

# Step 2: Create a Sparse User-Item Interaction Matrix with proper indexing
row = np.array([user_to_idx[user] for user in reviews_train_df['uid'].values])
col = np.array([item_to_idx[item] for item in reviews_train_df['product_id'].values])
data = reviews_train_df['hours'].values

interaction_sparse = coo_matrix((data, (row, col)), 
                               shape=(len(unique_users), len(unique_items))).tocsr()

In [215]:

# Create a mapping between the Game_ID and the matrix index
game_id_to_product_id = dict(zip(reviews_train_df['Game_ID'], reviews_train_df['product_id']))
game_id_to_matrix_idx = {}
for game_id, product_id in game_id_to_product_id.items():
    if product_id in item_to_idx:
        game_id_to_matrix_idx[game_id] = item_to_idx[product_id]

# Reverse mapping for retrieval
matrix_idx_to_game_id = {idx: game_id for game_id, idx in game_id_to_matrix_idx.items()}

In [216]:
# Step 3: Memory-Efficient Nearest Neighbors Search
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)
knn.fit(interaction_sparse.T)  # Transpose because we want item-item similarity

In [221]:

def genre_based_recommendation(input_genres, items_df, k=10):
    """
    Recommend games based on genre similarity and sentiment score.
    """
    filtered_items = items_df[items_df['genres'].apply(lambda x: any(genre in x for genre in input_genres))]

    if filtered_items.empty:
        return []  # Return empty list instead of string for consistency

    recommended_games = filtered_items.sort_values(by='sentiment', ascending=False)['Game_ID'].head(k).tolist()
    return recommended_games

def recommend_hybrid(user_input_games, interaction_sparse, knn_model, items_df, missing_game_genres, k=10):
    """
    Hybrid recommender that combines kNN-based recommendations (if available)
    with genre-based recommendations (if no user interactions exist).
    
    :param user_input_games: List of Game_IDs provided by user
    :param interaction_sparse: User-item interaction sparse matrix
    :param knn_model: Trained kNN model
    :param items_df: DataFrame containing game metadata
    :param missing_game_genres: List of genres for games not in the interaction matrix
    :param k: Number of total recommendations
    :return: List of recommended games (mixed approach)
    """
    if not isinstance(user_input_games, list):
        user_input_games = [user_input_games]

    # Step 1: Convert Game_IDs to matrix indices with validation
    existing_games = []
    for g in user_input_games:
        if g in game_id_to_matrix_idx:
            matrix_idx = game_id_to_matrix_idx[g]
            if matrix_idx < interaction_sparse.shape[1]:
                existing_games.append(matrix_idx)
    
    missing_games = [g for g in user_input_games if g not in game_id_to_matrix_idx]

    # print(f"Existing Games in Matrix: {existing_games}")
    # print(f"Missing Games: {missing_games}")

    recommendations = []

    # Step 2: Use kNN if any games exist in the matrix
    if existing_games:
        # Get specific rows from the transposed matrix for KNN
        game_vectors = interaction_sparse.T[existing_games]
        
        # Find nearest neighbors
        distances, indices = knn_model.kneighbors(game_vectors, n_neighbors=k+5)
        
        # Convert kNN indices to Game_IDs
        knn_recommendations = []
        for idx_list in indices:
            for idx in idx_list:
                if idx in matrix_idx_to_game_id:
                    knn_recommendations.append(matrix_idx_to_game_id[idx])
        
        # Remove input games and duplicates from recommendations
        knn_recommendations = list(dict.fromkeys([g for g in knn_recommendations if g not in user_input_games]))

        # print(f"KNN recommendations (after filtering input games): {knn_recommendations}")

        if missing_games:
            recommendations.extend(knn_recommendations[:k//2])  # Use half of k
        else:
            recommendations.extend(knn_recommendations[:k])  # Use all if no missing games

    # Step 3: Use genre-based filtering if any games are missing or we need more recommendations
    if missing_games or len(recommendations) < k:
        # print(f"Using genre-based recommendations for: {missing_games}")
        # print(f"Extracted unique genres for missing games: {missing_game_genres}")

        # Get genre-based recommendations using the extracted genres
        genre_recommendations = genre_based_recommendation(missing_game_genres, items_df, k=k*2)
        
        # Remove input games and already recommended games
        genre_recommendations = [g for g in genre_recommendations 
                               if g not in user_input_games and g not in recommendations]

        # Add genre recommendations to fill remaining slots
        recommendations.extend(genre_recommendations[:(k - len(recommendations))])

    # Debugging: Check if recommended games exist in `items_df`
    matching_games = set(items_df['Game_ID'].astype(str)) & set(str(rec) for rec in recommendations)
    # print(f"Number of matching recommended Game_IDs in items_df: {len(matching_games)}")

    return recommendations[:k]  # Ensure we return only k recommendations




In [218]:
# Example Usage
user_input_games = [230330, 578080, 1203620]  # Replace with actual user input
missing_game_genres = ["Massively Multiplayer", "Early Access"]

recommendations = recommend_hybrid(user_input_games, interaction_sparse, knn, items_df, missing_game_genres, k=10)
print(recommendations)
# Fetch game details
recommended_games_df = items_df[items_df['Game_ID'].isin(recommendations)][['Game_ID', 'Game', 'genres', 'sentiment']]
print("\nRecommended Games:")
recommended_games_df

Existing Games in Matrix: [1774]
Missing Games: [578080, 1203620]
KNN recommendations (after filtering input games): [18050, 314150, 319140, 107310, 229480, 47790, 620, 288470, 255420, 768060, 766970, 766850, 766700, 766530]
Using genre-based recommendations for: [578080, 1203620]
Extracted unique genres for missing games: ['Massively Multiplayer', 'Early Access']
Number of matching recommended Game_IDs in items_df: 10
[18050, 314150, 319140, 107310, 229480, 362300, 439700, 378280, 405930, 290810]

Recommended Games:


Unnamed: 0,Game_ID,Game,genres,sentiment
1379,229480,Dungeons & Dragons: Chronicles of Mystara,[ActionAdventureRPG],4.0
5294,439700,H1Z1 Test Server,[Massively Multiplayer],3.0
22166,405930,Metal Reaper Online - Newbie Package,[Massively Multiplayer],3.0
22637,378280,Pump-Action Captain,[Early Access],3.0
23753,362300,Just Survive Test Server,[Massively Multiplayer],4.5
24424,314150,Double Dragon Trilogy,[Action],3.0
24600,319140,Xeodrifter™,[ActionIndie],4.5
25336,290810,Colossal Kaiju Combat™: Kaijuland Battles,[Early Access],3.0
27978,107310,Cthulhu Saves the World,[IndieRPG],4.0
28099,18050,DeathSpank: Thongs of Virtue,[ActionRPGIndie],4.0


In [219]:
def evaluate_hybrid_recommendations(recommend_hybrid, train_df, test_df, interaction_sparse, knn, items_df, k=10, n_users=None):
    """
    Evaluate hybrid recommendations based on genre similarity between user's played games and recommendations
    
    Parameters:
    -----------
    recommend_hybrid : function
        Your hybrid recommendation function
    train_df : DataFrame
        Training data with user_id and game_id
    test_df : DataFrame
        Test data with user_id and game_id
    interaction_sparse : sparse matrix
        Sparse matrix of user-item interactions
    knn : model
        Trained KNN model
    items_df : DataFrame
        DataFrame containing game information including genres
    k : int
        Number of recommendations to evaluate
    n_users : int, optional
        Number of users to evaluate (None = all users)
    """
    
    # Create a mapping of game_id to genres
    game_genre_mapping = {}
    for index, row in items_df.iterrows():
        game_id = row['Game_ID']
        genres = row['genres']  # Assuming genres is a list in the dataframe
        game_genre_mapping[game_id] = genres
    
    # Extract unique users and their respective games from both dataframes
    unique_users_games_test = test_df.groupby('uid')['Game_ID'].apply(list).to_dict()
    unique_users_games_train = train_df.groupby('uid')['Game_ID'].apply(list).to_dict()
    
    # Get list of users to evaluate
    test_users = list(unique_users_games_test.keys())
    
    # Limit number of users if specified
    if n_users is not None:
        # np.random.seed(42)  # For reproducibility
        test_users = np.random.choice(test_users, size=min(n_users, len(test_users)), replace=False)
    
    print(f"Evaluating on {len(test_users)} users out of {len(unique_users_games_test)} total users")
    
    # Training metrics to track
    train_genre_precision = []
    train_genre_recall = []
    train_genre_hit_rate = 0
    
    # Testing metrics to track
    test_genre_precision = []
    test_genre_recall = []
    test_genre_hit_rate = 0
    
    evaluated_users = 0
    
    # Store sample user data
    sample_user = None
    sample_user_test_games = None
    sample_user_train_games = None
    sample_user_recommended_games = None
    
    for user_id in test_users:
        # Skip users not in training set
        if user_id not in unique_users_games_train:
            continue
            
        # Get games from training and test sets for this user
        train_games = unique_users_games_train[user_id]
        test_games = unique_users_games_test[user_id]
        
        # Skip users with no games in train set or test set
        if len(train_games) == 0 or len(test_games) == 0:
            continue
        
        # Get all genres from user's training games
        train_genres = set()
        for game in train_games:
            if game in game_genre_mapping:
                train_genres.update(game_genre_mapping[game])
        
        # Get all genres from user's test games
        test_genres = set()
        for game in test_games:
            if game in game_genre_mapping:
                test_genres.update(game_genre_mapping[game])
        
        # Skip if no genres found in either train or test
        if len(train_genres) == 0 or len(test_genres) == 0:
            continue
        
        # # Find missing genres for this user
        # all_genres = set()
        # for genres_list in game_genre_mapping.values():
        #     all_genres.update(genres_list)
        
        missing_game_genres = []
            
        # Get recommendations using the hybrid model
        try:
            recommendations = recommend_hybrid(train_games, interaction_sparse, knn, items_df, missing_game_genres, k=k)
            recommendations = recommendations[:k]  # Limit to top-k
        except Exception as e:
            print(f"Error getting recommendations for user {user_id}: {e}")
            continue
        
        # Get genres from recommended games
        recommended_genres = set()
        for game in recommendations:
            if game in game_genre_mapping:
                recommended_genres.update(game_genre_mapping[game])
        
        # Skip if no genres found in recommendations
        if len(recommended_genres) == 0:
            continue
        
        # Calculate TRAINING metrics (train games vs recommendations)
        train_relevant_genres = train_genres.intersection(recommended_genres)
        
        if len(recommended_genres) > 0:
            train_precision = len(train_relevant_genres) / len(recommended_genres)
            train_genre_precision.append(train_precision)
            
        if len(train_genres) > 0:
            train_recall = len(train_relevant_genres) / len(train_genres)
            train_genre_recall.append(train_recall)
            
        # Hit rate (1 if at least one genre matches)
        if len(train_relevant_genres) > 0:
            train_genre_hit_rate += 1
        
        # Calculate TESTING metrics (test games vs recommendations)
        test_relevant_genres = test_genres.intersection(recommended_genres)
        
        if len(recommended_genres) > 0:
            test_precision = len(test_relevant_genres) / len(recommended_genres)
            test_genre_precision.append(test_precision)
            
        if len(test_genres) > 0:
            test_recall = len(test_relevant_genres) / len(test_genres)
            test_genre_recall.append(test_recall)
            
        # Hit rate (1 if at least one genre matches)
        if len(test_relevant_genres) > 0:
            test_genre_hit_rate += 1
            
        evaluated_users += 1
        
        # Store the last user's data for display
        sample_user = user_id
        sample_user_test_games = test_games
        sample_user_train_games = train_games
        sample_user_recommended_games = recommendations
    
    # Calculate final metrics
    metrics = {
        # Training metrics
        'train_genre_precision': np.mean(train_genre_precision) if train_genre_precision else 0,
        'train_genre_recall': np.mean(train_genre_recall) if train_genre_recall else 0,
        'train_genre_hit_rate': train_genre_hit_rate / evaluated_users if evaluated_users else 0,
        
        # Testing metrics
        'test_genre_precision': np.mean(test_genre_precision) if test_genre_precision else 0,
        'test_genre_recall': np.mean(test_genre_recall) if test_genre_recall else 0,
        'test_genre_hit_rate': test_genre_hit_rate / evaluated_users if evaluated_users else 0,
        
        'num_evaluated_users': evaluated_users
    }
    
    # Print the training metrics
    print("\nTraining Genre-Based Evaluation Metrics (Train Games vs Recommendations):")
    print(f"Genre Precision: {metrics['train_genre_precision']:.4f}")
    print(f"Genre Recall: {metrics['train_genre_recall']:.4f}")
    print(f"Genre Hit Rate: {metrics['train_genre_hit_rate']:.4f}")
    
    # Print the testing metrics
    print("\nTesting Genre-Based Evaluation Metrics (Test Games vs Recommendations):")
    print(f"Genre Precision: {metrics['test_genre_precision']:.4f}")
    print(f"Genre Recall: {metrics['test_genre_recall']:.4f}")
    print(f"Genre Hit Rate: {metrics['test_genre_hit_rate']:.4f}")
    
    print(f"\nNumber of Evaluated Users: {metrics['num_evaluated_users']}")

    # Print the last user's data
    print("\nLast User Evaluated:")
    print(f"User ID: {sample_user}")
    print(f"Train Games: {sample_user_train_games}")
    print(f"Test Games: {sample_user_test_games}")
    print(f"Recommended Games: {sample_user_recommended_games}")
    
    return metrics


In [223]:
# Evaluate the hybrid recommendation model
metrics = evaluate_hybrid_recommendations(
    recommend_hybrid=recommend_hybrid,
    train_df=reviews_train_df,  # Your training dataframe
    test_df=reviews_test_df,    # Your test dataframe
    interaction_sparse=interaction_sparse,
    knn=knn,
    items_df=items_df,
    k=10,
    n_users=None  # Adjust as needed
)


Evaluating on 835 users out of 835 total users

Training Genre-Based Evaluation Metrics (Train Games vs Recommendations):
Genre Precision: 0.1555
Genre Recall: 0.2035
Genre Hit Rate: 0.7365

Testing Genre-Based Evaluation Metrics (Test Games vs Recommendations):
Genre Precision: 0.0539
Genre Recall: 0.2469
Genre Hit Rate: 0.3976

Number of Evaluated Users: 835

Last User Evaluated:
User ID: 738710
Train Games: [8930, 50300, 268500, 302510, 319630]
Test Games: [440, 440]
Recommended Games: [364820, 425070, 515690, 246880, 26900, 496460, 526250, 696190, 337040, 299720]
