__COMP9727 PROJECT:A STEAM GAME RECOMMENDER SYSTEM__


# Introduction
This model is a recommender system for Steam games. It is based on the dataset from Steam, which is a digital distribution platform for video games. The dataset contains 316 games and 21 million users. The recommender system is based on collaborative filtering and content based model. The system integrates the recommender module and a user interaction module.

__PIPELINE__:
1. Data preprocessing, Analysis and Visualization
2. Extracting features from the game dataset, build feature vectors for games. The item vector is a hierarchical structure. First, for the description, use text vectorization technology (such as TF-IDF or Word2Vec) to process the description text of the game to generate a dense numerical vector for each game. For Tags, we still use TF-IDF. But since the game tag describes the game more significantly, we manually increase the TF-IDF weight of the tag.

__Inporting Libraries__

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

__Data Preprocessing__

In [2]:
file_path_reviews = "steam_reviews.csv"
file_path_desc = "desc.csv"
file_path_tags = "tags.csv"

df_reviews = pd.read_csv(file_path_reviews)
df_desc = pd.read_csv(file_path_desc)
df_tags = pd.read_csv(file_path_tags)

__Clean the data__

In [3]:
import ast
import re
def tags_preprocess(tag):
    tag = ast.literal_eval(tag)
    tag = [x.lower() for x in tag]
    tag = [x.replace(' ', '_') for x in tag]
    tag = [x.replace('-', '_') for x in tag]
    tag = ' '.join(tag)
    return tag
def desc_preprocess(desc):
    desc = '' if pd.isna(desc) else str(desc)
    desc = desc.lower()
    desc = re.sub(r'[^\w\s]', '', desc)  # Remove punctuation
    words = desc.split()  # Tokenize by splitting on whitespace
    return ' '.join(words)

In [4]:
# df_desc = df_desc.dropna(subset=['short_description'])
df_tags['tags'] = df_tags['tags'].apply(lambda x: tags_preprocess(x))
df_desc['short_description'] = df_desc['short_description'].apply(desc_preprocess)

__Extracting Features from the description dataset and the tags dataset__

In [5]:
print(df_desc.loc[161])

id                                                              666140
short_description    start a new life in the enchanting town of por...
Name: 161, dtype: object


In [6]:
import ast
tfidf_vectorizer_desc = TfidfVectorizer(stop_words='english')
tfidf_matrix_desc = tfidf_vectorizer_desc.fit_transform(df_desc['short_description'].tolist())
print(tfidf_matrix_desc)

  (0, 944)	0.27707007158803415
  (0, 1488)	0.27707007158803415
  (0, 2527)	0.2585299893139412
  (0, 1561)	0.27707007158803415
  (0, 2179)	0.23517224803621373
  (0, 1692)	0.23517224803621373
  (0, 2924)	0.2386355452946131
  (0, 1743)	0.18493768577197395
  (0, 475)	0.27707007158803415
  (0, 1200)	0.24537559159788733
  (0, 406)	0.27707007158803415
  (0, 1126)	0.18493768577197395
  (0, 2758)	0.18493768577197395
  (0, 1124)	0.10053639446178186
  (0, 2037)	0.27707007158803415
  (0, 2542)	0.23517224803621373
  (0, 1814)	0.27707007158803415
  (1, 535)	0.1923484174817216
  (1, 1025)	0.2235605171169861
  (1, 1809)	0.29737536420845806
  (1, 2627)	0.263358129952708
  (1, 1075)	0.2774765577905971
  (1, 2874)	0.1734214546066264
  (1, 2077)	0.29737536420845806
  (1, 2397)	0.263358129952708
  :	:
  (313, 1038)	0.18556620844540345
  (313, 382)	0.18556620844540345
  (313, 25)	0.18556620844540345
  (313, 584)	0.18556620844540345
  (313, 263)	0.18556620844540345
  (313, 2108)	0.18556620844540345
  (313, 1

Create user class. The user class takes user model as input. A user could be an existed user in our reviews data set either a new user, needing cold start which will be implemented later.

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import pandas as pd

class User:
    def __init__(self, user_mod, user_id):
        self.user_mod = user_mod
        self.user_id = user_id
        self.profile = []  # This array contains game ids.
        self.feature_arr = None  # Feature array (TF-IDF) depends on the profile array.
        self.recommendations = []
        self.liked_games_ids = []  # Array to store IDs of games the user has liked.
        self.liked_tags = Counter()
        self.limit = 60
        self.total_sum = 0

    def hot_start(self):
        df_user_reviews = df_reviews[(df_reviews['author.steamid'] == self.user_id) & (df_reviews['recommended'] == True)]
        for app_id in df_user_reviews['app_id'].values:
            self.like_game(app_id)


    def cold_start(self):
        # Implement cold start logic if needed
        pass

    def update_profile(self, app_id):
        self.profile.append(app_id)
        # Get tags for the new game and update tag counts
        new_game_tags = df_tags[df_tags['app_id'] == app_id]['tags'].iloc[0].split()
        self.liked_tags.update(new_game_tags)
        self.generate_new_profile_vector()

    def generate_new_profile_vector(self):
        desc = df_desc.loc[df_desc['id'].isin(self.profile)]
        self.feature_arr = tfidf_vectorizer_desc.transform(desc['short_description'])

    def like_game(self, app_id):
        self.update_profile(app_id)
        # Add the game ID to the set of liked games
        self.liked_games_ids.append(app_id)

    def run_model(self):
        cosine_similarities = cosine_similarity(self.feature_arr, tfidf_matrix_desc)
        adjusted_similarity_scores = []

        for game_idx, game_id in enumerate(df_desc['id']):
            if game_id not in self.liked_games_ids:  # Skip games already liked
                custom_weight = self.calculate_custom_weight(game_id)
                adjusted_score = cosine_similarities[0][game_idx] * custom_weight
                adjusted_similarity_scores.append((game_id, adjusted_score))

        # Create a DataFrame from the adjusted scores
        similarity_scores_df = pd.DataFrame(adjusted_similarity_scores, columns=['game_id', 'similarity_score'])
        sorted_similarity_scores_df = similarity_scores_df.sort_values(by='similarity_score', ascending=False)

        # Get the top recommendations
        top_recommendations_indices = sorted_similarity_scores_df.head(self.limit).index

        final_recommendations = pd.DataFrame({
            'app_id': df_desc.loc[top_recommendations_indices]['id']
        })

        # Store the recommendations
        self.recommendations = final_recommendations
        return self.recommendations['app_id'].values

    def calculate_custom_weight(self, game_id):
        game_tags_list = df_tags[df_tags['app_id'] == game_id]['tags'].iloc[0].split()
        weight_sum = 0
        total_sum = sum(dict(self.liked_tags).values())
        for tag in game_tags_list:
            if tag in dict(self.liked_tags):
                weight_sum += self.liked_tags[tag]
        print(weight_sum/total_sum)
        return weight_sum/total_sum




In [34]:
user = User(0,76561198030238209)
user.hot_start()
user.run_model()
user.like_game(883710)
user.run_model()
user.like_game(883710)
user.run_model()
user.like_game(883710)
user.run_model()
user.like_game(883710)
user.run_model()
user.like_game(883710)
user.run_model()
user.like_game(883710)
user.run_model()


0.225
0.2
0.325
0.5
0.475
0.35
0.275
0.1
0.25
0.3
0.3
0.3
0.25
0.25
0.15
0.25
0.125
0.15
0.3
0.05
0.475
0.175
0.325
0.225
0.4
0.3
0.225
0.375
0.375
0.375
0.2
0.25
0.25
0.15
0.175
0.225
0.3
0.375
0.275
0.2
0.3
0.075
0.125
0.325
0.2
0.275
0.3
0.375
0.3
0.275
0.325
0.3
0.4
0.3
0.25
0.2
0.3
0.275
0.15
0.2
0.2
0.275
0.275
0.2
0.175
0.375
0.3
0.35
0.4
0.2
0.275
0.4
0.35
0.35
0.1
0.2
0.5
0.275
0.275
0.175
0.3
0.425
0.275
0.275
0.225
0.25
0.2
0.125
0.3
0.225
0.35
0.325
0.4
0.3
0.3
0.15
0.075
0.425
0.4
0.25
0.275
0.25
0.125
0.2
0.175
0.15
0.25
0.175
0.425
0.4
0.375
0.35
0.35
0.2
0.325
0.25
0.15
0.3
0.225
0.2
0.3
0.2
0.15
0.35
0.325
0.225
0.15
0.225
0.2
0.15
0.3
0.4
0.15
0.275
0.225
0.225
0.35
0.25
0.2
0.3
0.375
0.275
0.175
0.175
0.175
0.25
0.425
0.3
0.275
0.45
0.325
0.275
0.35
0.075
0.25
0.325
0.35
0.175
0.25
0.3
0.25
0.125
0.15
0.2
0.2
0.275
0.275
0.325
0.3
0.125
0.45
0.225
0.225
0.25
0.175
0.15
0.175
0.175
0.225
0.25
0.35
0.125
0.15
0.225
0.475
0.275
0.2
0.175
0.3
0.2
0.275
0.125
0.45
0.325
0

array([ 390340,  841370,  883710,  250900,  205100,     420,  364360,
        304390,  637650,   39210,  825630,  239140,  552520,  613830,
        560130,  589360,  219740,  242760,  447040,  105600,  421020,
       1145360,  225540,  239030,  632350,   47890,  219150,  397540,
        381210,  544750,  582660,  671440,  583950,  427520,  646910,
        723390,  322330, 1222700,  581320,  341800,  417290,  412830,
        683320,  311690,  631510,  590380,  253230,   72850,  236850,
        578080,  510510,  531510,  817130,  782330,  548430,  546560,
        606280,  731490,  250320,  666140], dtype=int64)

In [30]:
df_games= df_reviews.drop_duplicates(subset='app_id')

__UI MODULE__

In [39]:
import ipywidgets as widgets
from IPython.display import display, clear_output
class SteamGameRecommenderUI:
    def __init__(self):
        self.user_id = None
        self.current_page = 0
        self.user = None
        self.recommended_games_ids = []
        self.login_input = widgets.Text(description='User ID:')
        self.login_button = widgets.Button(description='Login')
        self.next_page_button = widgets.Button(description='Next Page')
        self.previous_page_button = widgets.Button(description='Previous Page')
        self.search_input = widgets.Text(description='Search Game:')
        self.search_button = widgets.Button(description='Search')
        self.output_area = widgets.Output()

        self.login_button.on_click(self.on_login_clicked)
        self.next_page_button.on_click(self.on_next_page_clicked)
        self.previous_page_button.on_click(self.on_previous_page_clicked)
        self.search_button.on_click(self.on_search_clicked)

    def _load_games_db(self, recommendations):
        self.recommended_games_ids = recommendations

    def display(self):
        display(self.login_input, self.login_button, self.output_area)

    def on_login_clicked(self, b):
        self.user_id = int(self.login_input.value)
        self.user = User(0, self.user_id)
        self.user.hot_start()
        recommendations = self.user.run_model()
        self._load_games_db(recommendations)
        self.display_recommended_games()

    def display_recommended_games(self):
        with self.output_area:
            clear_output()
            games_vbox = widgets.VBox()
            for game_id in self.get_recommended_games():
                game_name = df_games.loc[df_games['app_id'] == game_id]['app_name'].tolist()[0]
                game_label = widgets.Label(f'{game_id}: {game_name}')
                like_button = widgets.Button(description='Like', layout=widgets.Layout(width='auto'))
                dislike_button = widgets.Button(description='Dislike', layout=widgets.Layout(width='auto'))
                like_button.on_click(lambda b, game_id=game_id: self.on_like_clicked(b, game_id))
                dislike_button.on_click(lambda b, game_id=game_id: self.on_dislike_clicked(b, game_id))
                buttons_hbox = widgets.HBox([like_button, dislike_button], layout=widgets.Layout(justify_content='flex-end'))
                game_hbox = widgets.HBox([game_label, buttons_hbox], layout=widgets.Layout(justify_content='space-between'))
                games_vbox.children += (game_hbox,)
            navigation_box = widgets.HBox([self.previous_page_button, self.next_page_button], layout=widgets.Layout(justify_content='space-between'))
            display(widgets.VBox([games_vbox, navigation_box]))
            display(self.user.liked_tags)

    def get_recommended_games(self):
        start = self.current_page * 10
        end = start + 10
        return self.recommended_games_ids[start:end]

    def on_next_page_clicked(self, b):
        self.current_page += 1
        self.display_recommended_games()

    def on_previous_page_clicked(self, b):
        if self.current_page > 0:
            self.current_page -= 1
        self.display_recommended_games()
    def reset_current_page(self):
        self.current_page = 0

    def on_search_clicked(self, b):
        # search_query = self.search_input.value
        # with self.output_area:
        #     clear_output()
        #     print(f"Search results for '{search_query}':")
        #     for game_id in random.sample(self.recommended_games_ids, 5):
        #         game_name = df_games.loc[df_games['app_id'] == game_id]['app_name'].tolist()[0]
        #         print(f'{game_id}: {game_name}')
        pass

    def on_like_clicked(self, b, game_id):
        self.user.like_game(game_id)
        recommendations = self.user.run_model()  # 获取推荐的游戏ID列表
        self._load_games_db(recommendations)  # 加载游戏数据库
        self.reset_current_page()
        self.display_recommended_games()
        print(f"Liked game ID: {game_id}")

    def on_dislike_clicked(self, b, game_id):
        print(f"Disliked game ID: {game_id}")

ui = SteamGameRecommenderUI()
ui.display()
76561198030238209

Text(value='', description='User ID:')

Button(description='Login', style=ButtonStyle())

Output()

76561198030238209