# LIBRARY

In [2]:
import pandas as pd
import re

# DATASET

In [3]:
# Locations of the raw CSV files
path_games = "data/raw_data/steam_games.csv"
path_users = "data/raw_data/steam-200k.csv"

# Raw datasets. Both files are read with header=None, so every line of the
# file (including the first one) is loaded as a data row and the selected
# columns are named explicitly.
dataGames_raw = pd.read_csv(
    path_games,
    header=None,
    usecols=[2, 9, 10, 11, 13, 14],
    names=["name", "tags", "details", "languages", "genre", "description"],
)
dataUsers_raw = pd.read_csv(
    path_users,
    header=None,
    usecols=[0, 1, 2, 3],
    names=["user", "name", "behavior", "hours"],
)

# DATA PROCESSING

In [4]:
def processing_user(data):
    """Turn raw user/game behavior rows into a per-(user, game) rating table.

    The pipeline has three steps: aggregate the behavior rows per
    (user, game), keep only sufficiently played and sufficiently popular
    games, and bucket the played hours into a 1-5 rating.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``user``, ``name``, ``behavior`` and ``hours``.
        It is not modified: all work is done on a copy, so calling this
        function repeatedly on the same frame yields the same result.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``user``, ``name``, ``hours`` and
        ``hours_rating`` (float rating from 1.0 to 5.0).
    """
    def format_user(data):
        """Split ``behavior`` into 0/1 ``purchase`` and ``play`` flags and
        aggregate hours and flags per (user, name)."""
        # Copy first: the column assignments below would otherwise leak into
        # the caller's frame, and a second run would subtract `purchase`
        # from `hours` again.
        data = data.copy()
        data['purchase'] = (data['behavior'] == 'purchase').astype(int)
        data['play'] = (data['behavior'] == 'play').astype(int)
        # Subtract the purchase flag from hours on purchase rows.
        # NOTE(review): presumably purchase rows carry a placeholder value of
        # 1 in `hours`, which this zeroes out - confirm against the dataset.
        data['hours'] = data['hours'] - data['purchase']
        return (
            data.groupby(by=['user', 'name'])
            .agg({'hours': 'sum', 'purchase': 'sum', 'play': 'sum'})
            .reset_index()
        )

    def filter_user(data):
        """Keep played interactions above 2 hours for games with >= 50 players.

        Steam lets users refund games they have played for less than 2 hours.
        The recommender takes this into account, so user-item interactions
        with 2 hours or less of play time are not considered.
        """
        # NOTE(review): `play` is a sum, so pairs whose play row appears more
        # than once have play > 1 and are excluded by this equality test -
        # confirm that this is intended.
        played = data[(data['play'] == 1) & (data['hours'] > 2.0)][['user', 'name', 'hours']]
        # Each row is one (user, game) pair, so the count per name is the
        # number of players of that game; keep games with at least 50.
        players_per_game = played['name'].value_counts()
        popular_names = players_per_game[players_per_game >= 50].index
        # Restrict the interactions to those games
        return played[played['name'].isin(popular_names)].reset_index(drop=True)

    def range_5_stars(data, column_name):
        """Add ``<column_name>_rating``: the column bucketed into 1.0-5.0."""
        # Right-closed intervals: (2, 4] -> 1, (4, 8] -> 2, (8, 12] -> 3,
        # (12, 24] -> 4, (24, inf) -> 5.
        bins = [2.0, 4.0, 8.0, 12.0, 24.0, float('inf')]
        labels = [1.0, 2.0, 3.0, 4.0, 5.0]
        data[column_name + "_rating"] = pd.cut(data[column_name], bins=bins, labels=labels).astype('float64')
        return data

    data_format = format_user(data)
    data_filter = filter_user(data_format)
    data_Users = range_5_stars(data_filter, "hours")

    return data_Users


# Build the processed user table, persist it, then show a summary and preview.
dataUsers = processing_user(dataUsers_raw)
dataUsers.to_csv(
    "data/process_data/steam_users.csv",
    index=False,
    encoding="UTF-8",
)
dataUsers.info()
dataUsers.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28459 entries, 0 to 28458
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   user          28459 non-null  int64  
 1   name          28459 non-null  object 
 2   hours         28459 non-null  float64
 3   hours_rating  28459 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 889.5+ KB


Unnamed: 0,user,name,hours,hours_rating
0,5250,Alien Swarm,4.9,2.0
1,5250,Cities Skylines,144.0,5.0
2,5250,Deus Ex Human Revolution,62.0,5.0
3,5250,Portal 2,13.6,4.0
4,76767,Age of Empires II HD Edition,13.1,4.0


In [14]:
def processing_game(data):
    """Clean the raw games frame.

    Removes the row carrying the first index label, then drops every row
    that has a missing value and every exact duplicate row. The input frame
    is left unchanged; a new frame that keeps the original index labels is
    returned.
    """
    first_label = data.index[0]
    return data.drop(index=first_label).dropna().drop_duplicates()


# Build the processed games table, persist it, then show a summary and preview.
dataGames = processing_game(dataGames_raw)
dataGames.to_csv(
    "data/process_data/steam_games.csv",
    index=False,
    encoding="UTF-8",
)
dataGames.info()
dataGames.head()

<class 'pandas.core.frame.DataFrame'>
Index: 37224 entries, 1 to 40833
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         37224 non-null  object
 1   tags         37224 non-null  object
 2   details      37224 non-null  object
 3   languages    37224 non-null  object
 4   genre        37224 non-null  object
 5   description  37224 non-null  object
dtypes: object(6)
memory usage: 2.0+ MB


Unnamed: 0,name,tags,details,languages,genre,description
1,DOOM,"FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","Single-player,Multi-player,Co-op,Steam Achieve...","English,French,Italian,German,Spanish - Spain,...",Action,"About This Game Developed by id software, the..."
2,PLAYERUNKNOWN'S BATTLEGROUNDS,"Survival,Shooter,Multiplayer,Battle Royale,PvP...","Multi-player,Online Multi-Player,Stats","English,Korean,Simplified Chinese,French,Germa...","Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...
3,BATTLETECH,"Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","Single-player,Multi-player,Online Multi-Player...","English,French,German,Russian","Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...
4,DayZ,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","Multi-player,Online Multi-Player,Steam Worksho...","English,French,Italian,German,Spanish - Spain,...","Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...
5,EVE Online,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","Multi-player,Online Multi-Player,MMO,Co-op,Onl...","English,German,Russian,French","Action,Free to Play,Massively Multiplayer,RPG,...",About This Game
