## Imports

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
from tabulate import tabulate
from pathlib import Path
import gdown

## Load the Data from Google Drive

In [None]:
def download_from_drive(drive_link, file_name, data_folder=None):
    """Download a file from Google Drive into the data folder unless it is already there.

    Parameters
    ----------
    drive_link : str
        Google Drive file ID; it is appended to ``https://drive.google.com/uc?id=``.
    file_name : str
        Name the file is stored under inside ``data_folder``.
    data_folder : str or pathlib.Path, optional
        Destination directory. Defaults to ``../../data/final``.

    Returns
    -------
    pathlib.Path
        Path of the file inside ``data_folder``, returned both when the file
        was already present and when it has just been downloaded.

    Raises
    ------
    RuntimeError
        If ``gdown.download`` reports failure by returning ``None``.
    """
    data_folder = Path('../../data/final') if data_folder is None else Path(data_folder)
    data_file = data_folder / file_name

    # Final output has been downloaded
    if data_file.exists():
        print(f'{file_name} exists. Skipping download.')
        return data_file

    # Download the file
    print(f'Downloading {file_name}...')
    data_folder.mkdir(parents=True, exist_ok=True)
    downloaded = gdown.download(f"https://drive.google.com/uc?id={drive_link}", str(data_file), quiet=False)

    # Fail here with a clear message rather than later when the CSV is read
    if downloaded is None:
        raise RuntimeError(f'Download of {file_name} from Google Drive failed.')
    return data_file

# File names of the two final datasets inside ../../data/final
games_file_name = 'steam_games.csv'
reviews_file_name = 'steam_reviews.csv'

# Fetch each final data file from Google Drive (games first, then reviews).
# download_from_drive skips files that already exist; otherwise a progress bar appears.
for drive_id, target_name in (
    ('1AF_HhGdWC_8t8AlVwcMJ0bf3ZPd1BzNJ', games_file_name),
    ('1KAgcR4Sbp4yZYLnmV1ruS1g2cMtNXUEF', reviews_file_name),
):
    download_from_drive(drive_id, target_name)

# Read both CSV files into DataFrames
df_games = pd.read_csv(f'../../data/final/{games_file_name}')
df_reviews = pd.read_csv(f'../../data/final/{reviews_file_name}')

Downloading steam_games.csv...


Downloading...
From (original): https://drive.google.com/uc?id=1AF_HhGdWC_8t8AlVwcMJ0bf3ZPd1BzNJ
From (redirected): https://drive.google.com/uc?id=1AF_HhGdWC_8t8AlVwcMJ0bf3ZPd1BzNJ&confirm=t&uuid=eddcd46f-afa4-49ca-b290-456c4ff7b73a
To: /Users/jakemileham/Documents/2024/UNI/COS781/Semester Project/SteamDeepDive/data/final/steam_games.csv
100%|██████████| 13.3M/13.3M [00:06<00:00, 2.01MB/s]


Downloading steam_reviews.csv...


Downloading...
From (original): https://drive.google.com/uc?id=1KAgcR4Sbp4yZYLnmV1ruS1g2cMtNXUEF
From (redirected): https://drive.google.com/uc?id=1KAgcR4Sbp4yZYLnmV1ruS1g2cMtNXUEF&confirm=t&uuid=4e78526a-d951-44aa-8aed-eb9bbed73b54
To: /Users/jakemileham/Documents/2024/UNI/COS781/Semester Project/SteamDeepDive/data/final/steam_reviews.csv
100%|██████████| 925M/925M [06:42<00:00, 2.30MB/s] 


## Create User Matrix

In [4]:
# Build the user matrix: one row per username, one column per product_id,
# with each cell holding that user's rating_sentiment for that game.
# User/game pairs without a review come out of the pivot as NaN and are set to 0.
df_user_matrix = (
    df_reviews
    .pivot(index='username', columns='product_id', values='rating_sentiment')
    .fillna(0)
)

In [5]:
# Preview the first rows of the user matrix (rows: usernames, columns: product IDs)
df_user_matrix.head()

product_id,10,20,30,40,50,60,70,80,130,220,...,735570,738060,738380,740470,741670,745760,745880,754530,758230,763410
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
I Need Healing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Æñç,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
! Taz the Husky,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Pairwise cosine similarity between the rows (users) of the user matrix,
# wrapped in a DataFrame labelled by username on both axes.
# Takes on average 2 min 20 - run on MacBook Pro M3 Silicon Chip
user_labels = df_user_matrix.index
user_similarity = cosine_similarity(df_user_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_labels, columns=user_labels)

In [None]:
def predict_ratings(user_id, user_item_matrix, user_similarity_df, k=10):
    """Predict item ratings for a user from the ratings of their k most similar users.

    Each neighbour's rating row is weighted by that neighbour's similarity to
    ``user_id``; the weighted rows are summed and divided by the total
    similarity of the selected neighbours.

    Parameters
    ----------
    user_id : hashable
        Label of the target user; must be a column of ``user_similarity_df``.
    user_item_matrix : pandas.DataFrame
        Ratings with users as the row index and items as columns.
    user_similarity_df : pandas.DataFrame
        User-by-user similarity scores, labelled by user on both axes.
    k : int, default 10
        Number of neighbours to use.

    Returns
    -------
    pandas.Series
        Predicted rating per item, indexed by the columns of
        ``user_item_matrix``. All zeros when there are no neighbours or their
        similarities sum to zero.
    """
    # Drop the target user by label. Relying on them being the first entry of
    # nlargest breaks when another user ties with the self-similarity score.
    neighbour_scores = user_similarity_df[user_id].drop(labels=user_id, errors='ignore')

    # Identify top-k similar users
    similar_users = neighbour_scores.nlargest(k)

    # Without any positive similarity mass the weighted mean is undefined;
    # return zeros instead of NaN/inf from a division by zero.
    total_similarity = similar_users.sum()
    if similar_users.empty or total_similarity == 0:
        return pd.Series(0.0, index=user_item_matrix.columns)

    # Calculate weighted rating predictions
    weighted_ratings = user_item_matrix.loc[similar_users.index].multiply(similar_users.values, axis=0)
    return weighted_ratings.sum(axis=0) / total_similarity

# Rating value treated as "rated highly" and the number of games shown per table
HIGH_RATING = 5
N_GAMES_TO_SHOW = 5

# Columns of df_games shown in both tables
columns_to_display = ['app_name', 'developer', 'price', 'genres', 'url']

# Get a random user ID from the user-item matrix index
random_user_id = random.choice(df_user_matrix.index)

# All reviews written by this user, and the subset rated highly
# (rating sentiment equal to HIGH_RATING)
all_user_reviews = df_reviews[df_reviews['username'] == random_user_id]
user_reviews = all_user_reviews[all_user_reviews['rating_sentiment'] == HIGH_RATING]

# Choose up to N_GAMES_TO_SHOW random games that the user rated highly.
# The cap avoids the ValueError random.sample raises when the user has fewer.
highly_rated_ids = list(user_reviews['product_id'])
random_game_ids = random.sample(highly_rated_ids, min(N_GAMES_TO_SHOW, len(highly_rated_ids)))

# Filter df_games to get the records for these game IDs
random_games_details = df_games[df_games['id'].isin(random_game_ids)]

# Prepare the data for tabulation
games_to_display = random_games_details[columns_to_display]

# Print the table using tabulate
print(f'Random Games Rated Highly by {random_user_id}:')
print(tabulate(games_to_display, headers='keys', tablefmt='pretty', showindex=False))


# Generate predicted ratings for this random user
predicted_ratings = predict_ratings(random_user_id, df_user_matrix, user_similarity_df)

# Remove every game the user has already reviewed, not only the sampled ones,
# so that nothing the user already rated is recommended back to them
already_rated_ids = all_user_reviews['product_id'].unique()
predicted_ratings = predicted_ratings.drop(already_rated_ids, errors='ignore')

# Get the top recommended games
top_recommendations = predicted_ratings.nlargest(N_GAMES_TO_SHOW)

recommended_game_ids = top_recommendations.index.astype(int)

# Filter df_games to get the records for these game IDs, then order the rows
# by predicted rating (best first) instead of by their position in df_games
recommendation_rank = {game_id: rank for rank, game_id in enumerate(recommended_game_ids)}
recommended_games_details = (
    df_games[df_games['id'].isin(recommended_game_ids)]
    .sort_values('id', key=lambda ids: ids.map(recommendation_rank.get))
)

# Prepare the data for tabulation
games_to_display = recommended_games_details[columns_to_display]

# Print the table using tabulate
print(f'Top 5 Recommended Games for {random_user_id}:')
print(tabulate(games_to_display, headers='keys', tablefmt='pretty', showindex=False))

Random Games Rated Highly by Canti:
+------------------------------+--------------------------+-------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+
|           app_name           |        developer         | price |                                      genres                                      |                                 url                                  |
+------------------------------+--------------------------+-------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+
|    DuckTales: Remastered     |        WayForward        | 14.99 |                        ['Action', 'Adventure', 'Casual']                         |    http://store.steampowered.com/app/237630/DuckTales_Remastered/    |
| Sid Meier’s Civilization® VI | FiraxisAspyr (Mac Linux) | 59.99 |         

: 