__COMP9727 PROJECT:A STEAM GAME RECOMMENDER SYSTEM__


# Introduction
This model is a recommender system for Steam games. It is based on the dataset from Steam, which is a digital distribution platform for video games. The dataset contains 316 games and 21 million users. The recommender system is based on collaborative filtering and content based model. The system integrates the recommender module and a user interaction module.

__PIPELINE__:
1. Data preprocessing, Analysis and Visualization
2. Extracting features from the game dataset, build feature vectors for games. The item vector is a hierarchical structure. First, for the description, use text vectorization technology (such as TF-IDF or Word2Vec) to process the description text of the game to generate a dense numerical vector for each game. For Tags, we still use TF-IDF. But since the game tag describes the game more significantly, we manually increase the TF-IDF weight of the tag.

__Inporting Libraries__

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

__Data Preprocessing__

In [2]:
file_path_reviews = "new_steam_reviews.csv"
file_path_desc = "new_desc.csv"
file_path_tags = "new_tags.csv"

df_reviews = pd.read_csv(file_path_reviews)
df_desc = pd.read_csv(file_path_desc)
df_tags = pd.read_csv(file_path_tags)

In [3]:
print(df_desc)
print(df_tags)

         id                                  short_description
0        70  This mod is to run Half-Life 2: Update (itself...
1       240  Counter-Strike: Source blends Counter-Strike's...
2       420  The second episodic sequel to one of the most ...
3       620  Get Portal™2 In Motion™, Adventure, Puzzle, Sh...
4      2870  In the distant future, the X universe faces a ...
..      ...                                                ...
270  355790  Embark upon a stealthy adventure with new enem...
271  574050  "DRAGON QUEST HEROES II is a hack-and-slash, f...
272  572410  Steel Division: Normandy 44 is a Tactical Real...
273  421020  Dirt 4 is all about embracing danger. It’s abo...
274  546560  Half-Life: Alyx is Valve’s VR return to the Ha...

[275 rows x 2 columns]
     app_id                                               tags
0        70                                        ['Shooter']
1       240                                        ['Shooter']
2       420                    

__Clean the data__

In [4]:
import ast
import re
def tags_preprocess(tag):
    tag = ast.literal_eval(tag)
    tag = [x.lower() for x in tag]
    tag = [x.replace(' ', '_') for x in tag]
    tag = [x.replace('-', '_') for x in tag]
    tag = ' '.join(tag)
    return tag
def desc_preprocess(desc):
    desc = '' if pd.isna(desc) else str(desc)
    desc = desc.lower()
    desc = re.sub(r'[^\w\s]', '', desc)  # Remove punctuation
    words = desc.split()  # Tokenize by splitting on whitespace
    return ' '.join(words)

In [5]:
# df_desc = df_desc.dropna(subset=['short_description'])
df_tags['tags'] = df_tags['tags'].apply(lambda x: tags_preprocess(x))
df_desc['short_description'] = df_desc['short_description'].apply(desc_preprocess)

__Extracting Features from the description dataset and the tags dataset__

In [6]:
print(df_desc.loc[161])

id                                                              578080
short_description    pubg battlegrounds is a battle royale shooter ...
Name: 161, dtype: object


In [7]:
import ast
tfidf_vectorizer_desc = TfidfVectorizer(stop_words='english')
tfidf_matrix_desc = tfidf_vectorizer_desc.fit_transform(df_desc['short_description'].tolist())
print(tfidf_matrix_desc)

  (0, 2342)	0.34266551688316627
  (0, 2557)	0.20954880668834158
  (0, 3875)	0.31922482778360933
  (0, 1695)	0.5793861041476892
  (0, 3115)	0.2557120159913009
  (0, 2345)	0.5793861041476892
  (1, 1561)	0.19705558180487137
  (1, 2403)	0.19705558180487137
  (1, 2657)	0.15540662450806467
  (1, 1975)	0.1835756183978875
  (1, 3375)	0.19705558180487137
  (1, 2451)	0.08627417555433464
  (1, 1644)	0.15096728250238467
  (1, 281)	0.15096728250238467
  (1, 3442)	0.1835756183978875
  (1, 1394)	0.12590762604844555
  (1, 3619)	0.29410301067932054
  (1, 139)	0.15540662450806467
  (1, 106)	0.20572695279440442
  (1, 3609)	0.19705558180487137
  (1, 341)	0.16053146874664415
  (1, 824)	0.19705558180487137
  (1, 450)	0.19705558180487137
  (1, 3377)	0.5220342964608841
  (1, 823)	0.39411116360974274
  :	:
  (274, 1178)	0.14277292503568564
  (274, 3906)	0.14277292503568564
  (274, 2854)	0.14277292503568564
  (274, 1886)	0.14277292503568564
  (274, 3904)	0.14277292503568564
  (274, 198)	0.2855458500713713
  (27

Create user class. The user class takes user model as input. A user could be an existed user in our reviews data set either a new user, needing cold start which will be implemented later.

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import pandas as pd

class User:
    def __init__(self, user_mod, user_id):
        self.user_mod = user_mod
        self.user_id = user_id
        self.profile = []  # This array contains game ids.
        self.feature_arr = None  # Feature array (TF-IDF) depends on the profile array.
        self.recommendations = []
        self.liked_tags = Counter()
        self.limit = 100
        self.total_sum = 0

    def hot_start(self):
        df_user_reviews = df_reviews[(df_reviews['author.steamid'] == self.user_id)]
        for app_id in df_user_reviews['app_id'].values:
            self.like_game(app_id)


    def cold_start(self):
        # Implement cold start logic if needed
        pass

    def update_profile(self, app_id):
        self.profile.append(app_id)
        # Get tags for the new game and update tag counts
        new_game_tags = df_tags[df_tags['app_id'] == app_id]['tags'].iloc[0].split()
        self.liked_tags.update(new_game_tags)
        self.generate_new_profile_vector()

    def generate_new_profile_vector(self):
        desc = df_desc.loc[df_desc['id'].isin(self.profile)]
        self.feature_arr = tfidf_vectorizer_desc.transform(desc['short_description'])


    def like_game(self, app_id):
        # if not app_id in self.profile:
        self.update_profile(app_id)

    def run_model(self):
        cosine_similarities = cosine_similarity(self.feature_arr, tfidf_matrix_desc)
        adjusted_similarity_scores = []

        for game_idx, game_id in enumerate(df_desc['id']):
            #if game_id not in self.profile:  # Skip games already liked
            custom_weight = self.calculate_custom_weight(game_id)
            adjusted_score = (cosine_similarities[0][game_idx]+1) * custom_weight
            adjusted_similarity_scores.append((game_id, adjusted_score))

        # Create a DataFrame from the adjusted scores
        similarity_scores_df = pd.DataFrame(adjusted_similarity_scores, columns=['game_id', 'similarity_score'])
        sorted_similarity_scores_df = similarity_scores_df.sort_values(by='similarity_score', ascending=False)
        print(sorted_similarity_scores_df)
        top_recommendations_indices = sorted_similarity_scores_df[~sorted_similarity_scores_df['game_id'].isin(self.profile)].head(self.limit).index


        final_recommendations = pd.DataFrame({
            'app_id': df_desc.loc[top_recommendations_indices]['id']
        })

        # Store the recommendations
        self.recommendations = final_recommendations
        return self.recommendations['app_id'].values

    def calculate_custom_weight(self, game_id):
        game_tags_list = df_tags[df_tags['app_id'] == game_id]['tags'].iloc[0].split()
        weight_sum = 0
        total_sum = sum(dict(self.liked_tags).values())
        for tag in game_tags_list:
            if tag in dict(self.liked_tags):
                weight_sum += self.liked_tags[tag]
        return (weight_sum/total_sum) if total_sum > 0 else 0




In [17]:
user = User(0,76561198030238209)
user.hot_start()
user.run_model()



     game_id  similarity_score
96     48700          1.636364
187   105600          0.909091
31    834910          0.860994
193   440900          0.740848
168   346110          0.738760
..       ...               ...
183   712100          0.000000
243   225540          0.000000
25    583950          0.000000
179   356190          0.000000
202   304390          0.000000

[275 rows x 2 columns]


array([  48700,  105600,  834910,  440900,  346110,  433340,  466560,
        518790,  629910,  339800,  590380,  701160,  758190,  773951,
        283640,  552500, 1017900,  240720,  420290,  477160,  512900,
        113200,  252490,  206190,  428550,  609320,  250900,  508440,
        233860,  526160,  666140,  428690,    2870,  772540,  322330,
        413150,  236510,  541210,  646570,    7510,  294100,  581320,
        275850,  272270,  585420,  212680,  420530,  107410,  555220,
        239140,  258180,  644930,  526870,  337340,  704850,  698780,
        311690, 1158310,  253230,  281990,  394360,  236850,   39210,
        723390,  524580,  250760,  213670,  435150,  214950,  205100,
        572410,  548430,  447820,  875210,  362890,  588650,  683320,
        285190,  247240,  367520,  493340,  760060,  638970,  332200,
        400940,  200900,  214560,  322110,  527230,  583470,  753420,
        688130,  393520,  504230,  495560,  219740,  857980,  863550,
        690830,  377

In [10]:
df_games= df_reviews.drop_duplicates(subset='app_id')

__UI MODULE__

In [20]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from IPython.display import HTML
class SteamGameRecommenderUI:
    def __init__(self):
        self.temp_liked_games = set()
        self.user_id = None
        self.current_page = 0
        self.user = None
        self.recommended_games_ids = []
        self.login_input = widgets.Text(description='User ID:')
        self.login_button = widgets.Button(description='Login')
        self.next_page_button = widgets.Button(description='Next Page')
        self.previous_page_button = widgets.Button(description='Previous Page')
        self.search_input = widgets.Text(description='Search Game:')
        self.search_button = widgets.Button(description='Search')
        self.output_area = widgets.Output()
        self.refresh_button = widgets.Button(description='Refresh')
        self.refresh_button.on_click(self.on_refresh_clicked)

        self.login_button.on_click(self.on_login_clicked)
        self.next_page_button.on_click(self.on_next_page_clicked)
        self.previous_page_button.on_click(self.on_previous_page_clicked)
        self.search_button.on_click(self.on_search_clicked)

    def _load_games_db(self, recommendations):
        self.recommended_games_ids = recommendations

    def display(self):
        display(self.login_input, self.login_button, self.output_area)

    def on_login_clicked(self, b):
        self.user_id = int(self.login_input.value)
        self.user = User(0, self.user_id)
        self.user.hot_start()
        recommendations = self.user.run_model()
        self._load_games_db(recommendations)
        self.display_recommended_games()

    def display_recommended_games(self):
        with self.output_area:
            clear_output()
            games_vbox = widgets.VBox()
            for game_id in self.get_recommended_games():
                game_name = df_games.loc[df_games['app_id'] == game_id]['app_name'].tolist()[0]
                game_label = widgets.Label(f'{game_id}: {game_name}')
                like_button = widgets.Button(description='Like', layout=widgets.Layout(width='auto'))
                view_button = widgets.Button(description='View', layout=widgets.Layout(width='auto'))

                if game_id in self.temp_liked_games:
                    like_button.description = 'Liked'
                    like_button.disabled = True
                else:
                    like_button.description = 'Like'
                    like_button.disabled = False

                like_button.on_click(lambda b, game_id=game_id: self.on_like_clicked(b, game_id))
                view_button.on_click(lambda b, game_id=game_id: self.display_game_details(b, game_id))
                buttons_hbox = widgets.HBox([like_button, view_button], layout=widgets.Layout(justify_content='flex-end'))
                game_hbox = widgets.HBox([game_label, buttons_hbox], layout=widgets.Layout(justify_content='space-between'))
                games_vbox.children += (game_hbox,)

            # 将 refresh_button 添加到导航栏中
            navigation_box = widgets.HBox([self.previous_page_button, self.refresh_button, self.next_page_button], layout=widgets.Layout(justify_content='space-between'))
            display(widgets.VBox([games_vbox, navigation_box]))


    def get_recommended_games(self):
        start = self.current_page * 10
        end = start + 10
        return self.recommended_games_ids[start:end]

    def on_next_page_clicked(self, b):
        self.current_page += 1
        self.display_recommended_games()

    def on_previous_page_clicked(self, b):
        if self.current_page > 0:
            self.current_page -= 1
        self.display_recommended_games()

    def on_back_clicked(self, b):
        self.display_recommended_games()

    def reset_current_page(self):
        self.current_page = 0

    def on_search_clicked(self, b):
        pass

    def display_game_details(self, b, game_id):
        with self.output_area:
            clear_output()
            game_tags_str = df_tags[df_tags['app_id'] == game_id]['tags'].iloc[0]
            game_tags = game_tags_str.split(' ')
            game_desc = df_desc[df_desc['id'] == game_id]['short_description'].iloc[0]
            tags_hbox = widgets.HBox(layout=widgets.Layout(flex_flow='row wrap', align_items='flex-start'))
            for tag in game_tags:
                tag = tag.replace('_', ' ').strip()
                tag_button = widgets.Button(description=tag, disabled=True,
                                            button_style='',
                                            layout=widgets.Layout(width='auto', margin='2px'))
                tags_hbox.children += (tag_button,)
            desc_html = widgets.HTML(
                value=f"<strong>Description:</strong> {game_desc}"
            )

            back_button = widgets.Button(description='Back')
            back_button.on_click(lambda b: self.on_back_clicked(b))
            display(widgets.VBox([tags_hbox, desc_html, back_button]))


    def on_like_clicked(self, b, game_id):
        self.temp_liked_games.add(game_id)
        b.description = 'Liked'
        b.disabled = True

    # 添加新的刷新方法
    def on_refresh_clicked(self, b):
        for game_id in self.temp_liked_games:
            self.user.like_game(game_id)
        self.user.run_model()
        self.temp_liked_games.clear()
        recommendations = self.user.run_model()
        self._load_games_db(recommendations)
        self.reset_current_page()
        self.display_recommended_games()

# Usage
ui = SteamGameRecommenderUI()
ui.display()
76561198119772292


Text(value='', description='User ID:')

Button(description='Login', style=ButtonStyle())

Output()

76561198119772292