In [1]:
import json
from tqdm import tqdm

In [2]:
import json

def load_steam_games(file_path):
    """
    Load the steam_games.json file and return a list of game dictionaries.
    """
    games = []
    with open(file_path, 'r') as file:
        for line in file:
            # Remove any leading or trailing whitespace
            line = line.strip()
            # Replace the 'u' prefix with nothing
            line = line.replace("u'", "'")
            # Replace the 'u"' with '"'
            line = line.replace('u"', '"')
            try:
                game = eval(line)
                games.append(game)
            except Exception as e:
                print(f"Error parsing line: {line}. Error: {e}")
    return games

# Example usage
file_path = 'steam_games.json'
games = load_steam_games(file_path)

In [3]:
print(len(games))
print(games[0])

32135
{'publisher': 'Kotoshiro', 'genres': ['Action', 'Casual', 'Indie', 'Simulation', 'Strategy'], 'app_name': 'Lost Summoner Kitty', 'title': 'Lost Summoner Kitty', 'url': 'http://store.steampowered.com/app/761140/Lost_Summoner_Kitty/', 'release_date': '2018-01-04', 'tags': ['Strategy', 'Action', 'Indie', 'Casual', 'Simulation'], 'discount_price': 4.49, 'reviews_url': 'http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1', 'specs': ['Single-player'], 'price': 4.99, 'early_access': False, 'id': '761140', 'developer': 'Kotoshiro'}


In [4]:
name_genres_dict = {}
id_genres_dict = {}
id_name_dict = {}
genre_set = set()
for game in games:
    if 'genres' in game:
        genre = game['genres'][0].replace('amp;','')
    # elif 'tags' in game:
    #     genre = game['tags'][0].replace('amp;','')
    else:
        continue
    if 'app_name' in game:
        name = game['app_name']
    else:
        continue
    if 'id' in game:
        id = game['id']
    else: 
        continue
    genre_set.add(genre)
    name_genres_dict[name] = genre
    id_genres_dict[id] = genre
    id_name_dict[id] = name

In [5]:
print(len(genre_set), len(id_name_dict), len(name_genres_dict))
print(genre_set)

22 28849 28826
{'RPG', 'Design & Illustration', 'Indie', 'Web Publishing', 'Accounting', 'Software Training', 'Education', 'Action', 'Sports', 'Strategy', 'Casual', 'Audio Production', 'Photo Editing', 'Video Production', 'Massively Multiplayer', 'Free to Play', 'Early Access', 'Animation & Modeling', 'Racing', 'Utilities', 'Simulation', 'Adventure'}


In [6]:
import pandas as pd

def load_steam_interactions(file_path):
    interactions = []
    with open(file_path, 'r') as file:
        for line in tqdm(file):
            line = line.strip()
            line = line.replace("u'", "'")
            line = line.replace('u"', '"')
            try:
                interaction = eval(line)
                if ('username' in interaction) & ('product_id' in interaction) & ('hours' in interaction) & ('date' in interaction):
                    interactions.append({'user_id': interaction['username'], 'item_id': interaction['product_id'], 'rating': interaction['hours'], 'timestamp': interaction['date']})
            except Exception as e:
                print(f"Error parsing line: {line}. Error: {e}")
    return interactions

In [7]:
interactions = load_steam_interactions("./steam_new.json")

7793069it [04:16, 30370.20it/s]


In [8]:
import datetime

user_id = []
item_id = []
rating = []
timestamp = []
for interaction in tqdm(interactions):
    if interaction['rating'] < 3: continue
    else:
        user_id.append(interaction['user_id'])
        item_id.append(interaction['item_id'])
        rating.append(1)
        date_obj = datetime.datetime.strptime(interaction['timestamp'], "%Y-%m-%d")
        stamp = int(date_obj.timestamp())
        timestamp.append(stamp)

        

  0%|          | 0/7766532 [00:00<?, ?it/s]

100%|██████████| 7766532/7766532 [00:40<00:00, 190280.09it/s]


In [9]:
all_data = pd.DataFrame({
    'user_id': user_id,
    'item_id': item_id,
    'rating': rating,
    'timestamp': timestamp
})
all_data

Unnamed: 0,user_id,item_id,rating,timestamp
0,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,1,1514304000
1,hello?<,328100,1,1508083200
2,Cyderine916,35140,1,1514995200
3,DarklyThinking,35140,1,1514995200
4,Ariman1,328100,1,1501603200
...,...,...,...,...
6158426,Wildman_,252490,1,1386691200
6158427,Stony,252490,1,1386691200
6158428,Deez Knees,252490,1,1386691200
6158429,Vidaar,252490,1,1386691200


In [10]:
user_counts = all_data['user_id'].value_counts()
users_over_100 = user_counts[user_counts >= 20].index # 25442

In [11]:
item_counts = all_data['item_id'].value_counts()
items_over_100 = item_counts[item_counts >= 20].index # 81562

In [12]:
save_tuples_over100 = []
items_over_100_selected = []
for item in list(items_over_100):
    if item in id_genres_dict:
        save_tuples_over100.append((item, id_name_dict[item], id_genres_dict[item]))
        items_over_100_selected.append(item)

In [13]:
len(items_over_100_selected)

6772

In [14]:
with open('steam_over20.dat', 'w', encoding='utf-8') as file:
    for record in save_tuples_over100:
        line = f"{record[0]} %% {record[1]} %% {record[2]}\n"
        file.write(line)

In [15]:
filtered_data_100 = all_data[(all_data['user_id'].isin(users_over_100)) & (all_data['item_id'].isin(items_over_100_selected))]
filtered_data_100.to_csv("ratings_over20.csv", index=False) # 2499706