__COMP9727 PROJECT: A STEAM GAME RECOMMENDER SYSTEM__


# Introduction
This project is a recommender system for Steam games. It is built on a dataset from Steam, a digital distribution platform for video games. The dataset contains 316 games and 21 million users. The recommender combines collaborative filtering with a content-based model, and the system integrates the recommender module with a user-interaction module.

__PIPELINE__:
1. Data preprocessing, Analysis and Visualization
2. Extract features from the game dataset and build a feature vector for each game. The item vector has a hierarchical structure. First, the description text of each game is processed with a text-vectorization technique (such as TF-IDF or Word2Vec) to generate a dense numerical vector for that game. Tags are also vectorized with TF-IDF, but because tags describe a game more distinctively than its description does, we manually increase the TF-IDF weight of the tags.

__Importing Libraries__

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

__Data Preprocessing__

In [2]:
# Reviews table: later cells read its 'author.steamid' and 'app_id' columns.
file_path_reviews = "steam_reviews.csv"
df_reviews = pd.read_csv(file_path_reviews)

# Descriptions table: later cells read its 'id' and 'short_description' columns.
file_path_desc = "desc.csv"
df_desc = pd.read_csv(file_path_desc)

# Tags table: later cells read its 'app_id' and 'tags' columns.
file_path_tags = "tags.csv"
df_tags = pd.read_csv(file_path_tags)

__Clean the data__

In [3]:
import ast
import re
def tags_preprocess(tag):
    """Turn the string form of a tag list into one space-separated string.

    ``tag`` is parsed with ``ast.literal_eval``. Every parsed tag is
    lower-cased and its spaces and hyphens are replaced by underscores, so a
    multi-word tag stays a single token. The normalised tags are then joined
    with single spaces.
    """
    parsed_tags = ast.literal_eval(tag)
    normalised = (
        item.lower().replace(' ', '_').replace('-', '_')
        for item in parsed_tags
    )
    return ' '.join(normalised)
def desc_preprocess(desc):
    """Normalise a game description for vectorization.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string. Everything else is converted to ``str``, lower-cased, stripped of
    every character that is neither a word character nor whitespace, and has
    its runs of whitespace collapsed to single spaces.
    """
    if pd.isna(desc):
        text = ''
    else:
        text = str(desc)
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    # split() with no argument discards leading, trailing and repeated whitespace.
    return ' '.join(without_punctuation.split())

In [4]:
# Normalise every tag list into a single space-separated string.
df_tags['tags'] = df_tags['tags'].apply(tags_preprocess)

# Keep only the games that have a short description, then clean that text.
df_desc = df_desc.dropna(subset=['short_description'])
df_desc['short_description'] = df_desc['short_description'].apply(desc_preprocess)

__Extracting Features from the description dataset and the tags dataset__

In [5]:
import ast
# TF-IDF model for the cleaned short descriptions; scikit-learn's built-in
# English stop-word list is applied.
tfidf_vectorizer_desc = TfidfVectorizer(stop_words='english')
# Fit on all descriptions and vectorize them; the documents are passed in
# df_desc's current row order.
tfidf_matrix_desc = tfidf_vectorizer_desc.fit_transform(df_desc['short_description'].tolist())
# Printing lists the stored entries as "(row, column)  weight".
print(tfidf_matrix_desc)

  (0, 944)	0.27707007158803415
  (0, 1488)	0.27707007158803415
  (0, 2527)	0.2585299893139412
  (0, 1561)	0.27707007158803415
  (0, 2179)	0.23517224803621373
  (0, 1692)	0.23517224803621373
  (0, 2924)	0.2386355452946131
  (0, 1743)	0.18493768577197395
  (0, 475)	0.27707007158803415
  (0, 1200)	0.24537559159788733
  (0, 406)	0.27707007158803415
  (0, 1126)	0.18493768577197395
  (0, 2758)	0.18493768577197395
  (0, 1124)	0.10053639446178186
  (0, 2037)	0.27707007158803415
  (0, 2542)	0.23517224803621373
  (0, 1814)	0.27707007158803415
  (1, 535)	0.1923484174817216
  (1, 1025)	0.2235605171169861
  (1, 1809)	0.29737536420845806
  (1, 2627)	0.263358129952708
  (1, 1075)	0.2774765577905971
  (1, 2874)	0.1734214546066264
  (1, 2077)	0.29737536420845806
  (1, 2397)	0.263358129952708
  :	:
  (313, 1038)	0.18556620844540345
  (313, 382)	0.18556620844540345
  (313, 25)	0.18556620844540345
  (313, 584)	0.18556620844540345
  (313, 263)	0.18556620844540345
  (313, 2108)	0.18556620844540345
  (313, 1

Create the User class. The User class takes a user model as input. A user may be an existing user in our reviews dataset or a new user; a new user needs a cold start, which will be implemented later.

In [55]:
from sklearn.metrics.pairwise import cosine_similarity
class User:
    """Content-based profile of one Steam user plus the recommender built on it.

    The methods read these module-level objects, which must exist before any
    method other than ``__init__`` or an empty-profile ``run_model`` is called:
    ``df_reviews``, ``df_desc``, ``df_tags``, ``tfidf_vectorizer_desc``,
    ``tfidf_matrix_desc`` and ``cosine_similarity``.
    """

    def __init__(self, user_mod, user_id):
        """Create an empty profile.

        Args:
            user_mod: User model; stored as-is and not used by this class yet.
            user_id: Value matched against ``df_reviews['author.steamid']``.
        """
        self.user_mod = user_mod
        self.user_id = user_id
        self.profile = []  # app ids of the games the user reviewed or liked
        # TF-IDF rows of the profile games' descriptions; None while no
        # profile game has a description in df_desc.
        self.feature_arr = None
        self.tags = []  # replaced by a Series of tag strings once a profile exists
        self.recommendations = []
        self.limit = 30  # maximum number of recommendations returned by run_model

    def hot_start(self):
        """Fill the profile with every game this user has reviewed."""
        user_reviews = df_reviews[df_reviews['author.steamid'] == self.user_id]
        # Extend in one step and vectorize once, instead of re-vectorizing the
        # whole profile after every single appended game.
        self.profile.extend(user_reviews['app_id'].tolist())
        self.generate_new_profile_vector()

    def cold_start(self):
        """Placeholder for users without reviews; not implemented yet."""
        pass

    def update_profile(self, app_id):
        """Add one game to the profile and refresh the derived features."""
        self.profile.append(app_id)
        self.generate_new_profile_vector()

    def generate_new_profile_vector(self):
        """Recompute ``self.tags`` and ``self.feature_arr`` from ``self.profile``."""
        desc = df_desc.loc[df_desc['id'].isin(self.profile)]
        self.tags = df_tags[df_tags['app_id'].isin(self.profile)]['tags']
        if desc.empty:
            # Nothing to vectorize; run_model treats None as "no profile".
            self.feature_arr = None
            return
        self.feature_arr = tfidf_vectorizer_desc.transform(desc['short_description'])

    def like_game(self, app_id):
        """Record that the user likes ``app_id``."""
        self.update_profile(app_id)

    def run_model(self):
        """Recommend games whose descriptions resemble the profile games.

        Every game in ``df_desc`` is scored by its mean cosine similarity to
        the profile games; games already in the profile are excluded.

        Returns:
            pandas.Series named ``'app_id'`` holding at most ``self.limit``
            ids, best match first. It is empty when the profile has no
            vectorized game.
        """
        if self.feature_arr is None or self.feature_arr.shape[0] == 0:
            # int64 matches the integer app ids used in this notebook's
            # examples -- TODO confirm against the real 'id' column dtype.
            return pd.Series([], dtype='int64', name='app_id')

        similarities = cosine_similarity(self.feature_arr, tfidf_matrix_desc)
        # Rows of tfidf_matrix_desc follow df_desc's row order, so df_desc's
        # index labels each score (assumes df_desc was not changed after the
        # vectorizer was fitted).
        scores = pd.DataFrame(
            similarities.mean(axis=0),
            index=df_desc.index,
            columns=['similarity_score'],
        )
        ranked = scores.sort_values(by='similarity_score', ascending=False)
        liked_indices = df_desc.index[df_desc['id'].isin(self.profile)]
        top_indices = ranked.drop(liked_indices).head(self.limit).index
        return pd.Series(df_desc.loc[top_indices, 'id'].values, name='app_id')




In [60]:
# Build the profile of an existing reviewer and show the tags of their games.
user = User(0, 76561199095369542)
user.hot_start()
print(user.tags)

# Liking one more game adds its tags to the profile.
user.like_game(240)
# tolist() already gives one tag string per game; a list has no .split(),
# so the list itself is printed.
print(user.tags.tolist())

# One row per game, used to translate an app id into its name.
df_games = df_reviews.drop_duplicates(subset='app_id')

# Optional check: list the recommended games together with their names.
# res = user.run_model()
# test = []
# for i in res:
#     test.append({'id': i, 'name': df_games.loc[df_games['app_id'] == i]['app_name'].tolist()[0]})
# print(test)



0      open_world rpg story_rich atmospheric mature f...
147    zombies survival_horror horror online_co_op op...
Name: tags, dtype: object
['open_world rpg story_rich atmospheric mature fantasy adventure singleplayer nudity choices_matter great_soundtrack third_person medieval action multiple_endings action_rpg magic dark_fantasy dark sandbox', 'shooter fps action multiplayer team_based first_person competitive tactical pvp e_sports military war strategy moddable singleplayer difficult atmospheric survival simulation sandbox', 'zombies survival_horror horror online_co_op open_world parkour first_person survival open_world_survival_craft gore fps action pve pvp stealth post_apocalyptic story_rich hack_and_slash action_rpg multiplayer']


__UI MODULE__

In [50]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import random

class SteamGameRecommenderUI:
    """Notebook UI: log in with a Steam user id and page through recommendations.

    Relies on the module-level ``User`` class and on ``df_games``, a frame
    with ``app_id`` and ``app_name`` columns used to look up game names.
    """

    PAGE_SIZE = 10  # games shown per page
    SEARCH_SAMPLE_SIZE = 5  # games listed by the placeholder search

    def __init__(self):
        """Create the widgets and wire the button callbacks."""
        self.user_id = None
        self.current_page = 0
        self.user = None
        self.recommended_games_ids = []  # list of recommended app ids
        self.login_input = widgets.Text(description='User ID:')
        self.login_button = widgets.Button(description='Login')
        self.next_page_button = widgets.Button(description='Next Page')
        self.previous_page_button = widgets.Button(description='Previous Page')
        self.search_input = widgets.Text(description='Search Game:')
        self.search_button = widgets.Button(description='Search')
        self.output_area = widgets.Output()

        self.login_button.on_click(self.on_login_clicked)
        self.next_page_button.on_click(self.on_next_page_clicked)
        self.previous_page_button.on_click(self.on_previous_page_clicked)
        self.search_button.on_click(self.on_search_clicked)

    def _load_games_db(self, recommendations):
        """Store the recommended app ids.

        The ids are copied into a plain list so that slicing is positional
        and ``random.sample`` accepts them whatever iterable was passed in.
        """
        self.recommended_games_ids = list(recommendations)

    def _last_page(self):
        """Return the index of the last page (0 when there is nothing to show)."""
        return max(0, (len(self.recommended_games_ids) - 1) // self.PAGE_SIZE)

    def _game_name(self, game_id):
        """Return the name of ``game_id`` from ``df_games``, or a fallback label."""
        names = df_games.loc[df_games['app_id'] == game_id, 'app_name'].tolist()
        return names[0] if names else 'Unknown game'

    def display(self):
        """Show the login controls and the output area."""
        display(self.login_input, self.login_button, self.output_area)

    def on_login_clicked(self, b):
        """Build the user's profile, compute recommendations and show page one."""
        raw_id = self.login_input.value
        try:
            self.user_id = int(raw_id)
        except ValueError:
            # Report bad input in the UI instead of raising inside the callback.
            with self.output_area:
                clear_output()
                print(f"Invalid user ID: {raw_id!r}. Please enter a numeric Steam ID.")
            return
        self.user = User(0, self.user_id)
        self.user.hot_start()
        recommendations = self.user.run_model()  # recommended app ids
        self._load_games_db(recommendations)
        self.current_page = 0  # a new login always starts on the first page
        self.display_recommended_games()

    def display_recommended_games(self):
        """Render the current page of recommendations with Like/Dislike buttons."""
        with self.output_area:
            clear_output()
            rows = []
            for game_id in self.get_recommended_games():
                game_label = widgets.Label(f'{game_id}: {self._game_name(game_id)}')
                like_button = widgets.Button(description='Like', layout=widgets.Layout(width='auto'))
                dislike_button = widgets.Button(description='Dislike', layout=widgets.Layout(width='auto'))
                # game_id is bound as a default argument so each button keeps
                # its own id rather than the loop's last value.
                like_button.on_click(lambda b, game_id=game_id: self.on_like_clicked(b, game_id))
                dislike_button.on_click(lambda b, game_id=game_id: self.on_dislike_clicked(b, game_id))
                buttons_hbox = widgets.HBox([like_button, dislike_button], layout=widgets.Layout(justify_content='flex-end'))
                rows.append(widgets.HBox([game_label, buttons_hbox], layout=widgets.Layout(justify_content='space-between')))
            if not rows:
                print('No recommendations to show.')
            # Build the container once instead of reassigning .children per row.
            games_vbox = widgets.VBox(rows)
            navigation_box = widgets.HBox([self.previous_page_button, self.next_page_button], layout=widgets.Layout(justify_content='space-between'))
            display(widgets.VBox([games_vbox, navigation_box]))

    def get_recommended_games(self):
        """Return the app ids belonging to the current page."""
        start = self.current_page * self.PAGE_SIZE
        return self.recommended_games_ids[start:start + self.PAGE_SIZE]

    def on_next_page_clicked(self, b):
        """Advance one page, stopping at the last page."""
        if self.current_page < self._last_page():
            self.current_page += 1
        self.display_recommended_games()

    def on_previous_page_clicked(self, b):
        """Go back one page, stopping at the first page."""
        if self.current_page > 0:
            self.current_page -= 1
        self.display_recommended_games()

    def on_search_clicked(self, b):
        """Placeholder search.

        The query is only echoed; the listed games are a random sample of the
        current recommendations, not matches for the query.
        """
        search_query = self.search_input.value
        sample_size = min(self.SEARCH_SAMPLE_SIZE, len(self.recommended_games_ids))
        with self.output_area:
            clear_output()
            print(f"Search results for '{search_query}':")
            for game_id in random.sample(self.recommended_games_ids, sample_size):
                print(f'{game_id}: {self._game_name(game_id)}')

    def on_like_clicked(self, b, game_id):
        """Handle a Like click; currently only prints the id."""
        print(f"Liked game ID: {game_id}")
        # TODO: add the code that updates the database here.

    def on_dislike_clicked(self, b, game_id):
        """Handle a Dislike click; currently only prints the id."""
        print(f"Disliked game ID: {game_id}")
        # TODO: add the code that updates the database here.

# Usage example
ui = SteamGameRecommenderUI()
ui.display()


Text(value='', description='User ID:')

Button(description='Login', style=ButtonStyle())

Output()