In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer
import sqlite3
from fuzzywuzzy import fuzz
import time
from tqdm import tqdm

# Notebook-wide SQLite connection to the local 'steamdata' database file;
# later cells read the game catalogue through it.
DB = sqlite3.connect(database='steamdata')

In [2]:
# The CSV has no header row, so column names are supplied explicitly.
userdb = pd.read_csv(
    'steam-200k.csv',
    names=['uid', 'name', 'action', 'playtime', '0'],
    header=None,
)

In [3]:
# Pull the whole `app` table into memory as the game catalogue.
gamesdb = pd.read_sql_query(sql='SELECT * FROM app', con=DB)

In [4]:
# Sanity check: catalogue rows whose name scores above 90 against 'Spore'
# (case-insensitive fuzz.ratio).
gamesdb.loc[
    lambda games: games['name'].apply(
        lambda title: fuzz.ratio(title.lower(), 'Spore'.lower())
    ) > 90
]

Unnamed: 0,id,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
100,101,17390,SPORE,Maxis™,Electronic Arts,,29194,2870,0,"1,000,000 .. 2,000,000",1362,157,470,168,1999,1999,0,871
30095,30096,17390,SPORE,Maxis™,Electronic Arts,,29213,2871,0,"1,000,000 .. 2,000,000",1322,183,473,217,1999,1999,0,936


In [5]:
def fuzzy_search(name, min_score=0):
    """Return the appid of the catalogue entry whose name best matches `name`.

    Every row of the module-level `gamesdb` frame is scored with a
    case-insensitive `fuzz.ratio` against `name`; the first row with the
    highest score wins.

    Parameters
    ----------
    name : str
        Game title to look up.
    min_score : int, optional
        Lowest acceptable score for the best match. The default of 0 accepts
        any best match, which is the original behaviour.

    Returns
    -------
    The `appid` value of the best-matching row, or None when `gamesdb` has no
    rows or the best score is below `min_score`.
    """
    # Lower-case the query once instead of once per catalogue row.
    target = name.lower()
    scores = gamesdb['name'].apply(lambda x: fuzz.ratio(x.lower(), target))

    # argmax on an empty sequence raises ValueError, which the previous
    # `except IndexError` did not catch; handle the empty catalogue explicitly.
    if scores.empty:
        return None

    best = int(np.argmax(scores.to_numpy()))
    if scores.iloc[best] < min_score:
        return None
    return gamesdb.iloc[best].appid

In [6]:
# Spot-check the matcher on a single lower-case title.
fuzzy_search(name='dead island epidemic')

383150

In [7]:
# Number of distinct game names in the user data.
userdb['name'].unique().size

5155

In [8]:
# Map every distinct game name in the user data to its best-matching appid.
# This fuzzy-matches each name against the full catalogue, so it is slow.
name_id = {
    title: fuzzy_search(title)
    for title in tqdm(userdb['name'].unique())
}

100%|██████████| 5155/5155 [08:50<00:00,  9.71it/s]


In [9]:
# Attach the matched appid to every row, then drop rows with no match.
# - `.map(name_id)` does the dict lookup without a per-row lambda.
# - `dropna` is limited to the 'appid' column so that a missing value in an
#   unrelated column does not silently discard a row.
# - The frame is rebuilt in one expression rather than mutated in place and
#   then rebound.
userdb = (
    userdb
    .assign(appid=userdb['name'].map(name_id))
    .dropna(subset=['appid'])
    .astype({'appid': 'int'})
)

In [10]:
# Keep only the rows whose action is 'play'.
userdb_noweights = userdb.loc[userdb['action'].eq('play')]
userdb_noweights

Unnamed: 0,uid,name,action,playtime,0,appid
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0,72850
3,151603712,Fallout 4,play,87.0,0,377160
5,151603712,Spore,play,14.9,0,17390
7,151603712,Fallout New Vegas,play,12.1,0,22380
9,151603712,Left 4 Dead 2,play,8.9,0,550
...,...,...,...,...,...,...
199991,128470551,Fallen Earth,play,2.4,0,1314730
199993,128470551,Magic Duels,play,2.2,0,316010
199995,128470551,Titan Souls,play,1.5,0,297130
199997,128470551,Grand Theft Auto Vice City,play,1.5,0,12110


In [11]:
# Inspect the appid column of the play-only rows.
userdb_noweights.loc[:, 'appid']

1           72850
3          377160
5           17390
7           22380
9             550
           ...   
199991    1314730
199993     316010
199995     297130
199997      12110
199999      38720
Name: appid, Length: 70489, dtype: int64

In [12]:
# Number of distinct appids among the play-only rows.
userdb_noweights['appid'].unique().size

3337

In [13]:
# User x game matrix: one row per uid, one column per appid, each cell the
# summed playtime for that pair. Pairs with no rows come out as NaN.
sparseusers = pd.pivot_table(
    userdb_noweights,
    values='playtime',
    index='uid',
    columns='appid',
    aggfunc='sum',
)

In [14]:
# Replace NaN with 0 and convert to a plain ndarray.
# The second positional parameter of np.nan_to_num is `copy`, not `nan`, so
# the previous `np.nan_to_num(sparseusers, 0)` was read as copy=False and only
# produced zeros because `nan` defaults to 0.0. Pass the fill value by keyword.
sparseusers = np.nan_to_num(sparseusers, nan=0.0)



In [15]:
# Scale each user's row to unit L1 norm, then compute the pairwise
# user-to-user cosine similarity matrix.
usernorm = Normalizer(norm='l1').fit(sparseusers).transform(sparseusers)
usersims = cosine_similarity(X=usernorm)

In [16]:
# Write the distinct game names from the play-only rows, one per line.
np.savetxt(
    fname='games_list.txt',
    X=userdb_noweights['name'].unique(),
    fmt='%s',
)