In [1]:
import pandas as pd
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import math

In [2]:
data = pd.read_csv('summaryAnalyze/game_summary.csv')
data.head()

Unnamed: 0,game_name,summary
0,The Elder Scrolls V Skyrim,The next chapter in the highly anticipated Eld...
1,Fallout 4,"Bethesda Game Studios, the award-winning creat..."
2,Spore,-1
3,Fallout New Vegas,"In this first-person Western RPG, the player t..."
4,Left 4 Dead 2,Left 4 Dead 2 is a cooperative first-person sh...


In [3]:
data_processed = data[data.summary != '-1']['summary']
all_data_processed = data[data.summary != '-1']
data_updated = data_processed.tolist()

In [4]:
all_data_processed = all_data_processed['game_name'].tolist()
all_data_processed

['The Elder Scrolls V Skyrim',
 'Fallout 4',
 'Fallout New Vegas',
 'Left 4 Dead 2',
 'HuniePop',
 'Path of Exile',
 'Poly Bridge',
 'Left 4 Dead',
 'Team Fortress 2',
 'Tomb Raider',
 'The Banner Saga',
 'Dead Island Epidemic',
 'BioShock Infinite',
 'Dragon Age Origins - Ultimate Edition',
 'SEGA Genesis & Mega Drive Classics',
 'Grand Theft Auto IV',
 'Realm of the Mad God',
 'Marvel Heroes 2015',
 'Eldevin',
 'Dota 2',
 'BioShock',
 'Robocraft',
 "Garry's Mod",
 'Jazzpunk',
 'Alan Wake',
 'BioShock 2',
 'Fallen Earth',
 "Fallout New Vegas Courier's Stash",
 'Fallout New Vegas Dead Money',
 'Fallout New Vegas Honest Hearts',
 'Grand Theft Auto Episodes from Liberty City',
 'Hitman Absolution',
 'The Elder Scrolls V Skyrim - Dawnguard',
 'The Elder Scrolls V Skyrim - Dragonborn',
 'The Elder Scrolls V Skyrim - Hearthfire',
 'Ultra Street Fighter IV',
 'FINAL FANTASY XIII',
 "Sid Meier's Civilization V",
 'L.A. Noire',
 'Company of Heroes Tales of Valor',
 '7 Days to Die',
 'Divekick'

In [5]:
tfidf_vectorizer=TfidfVectorizer(lowercase=True, stop_words='english', use_idf=True)

In [6]:
# just send in all your docs here
fitted_vectorizer=tfidf_vectorizer.fit(data_updated)
tfidf_vectorizer_vectors=fitted_vectorizer.transform(data_updated)

In [50]:
results = {}
for idx, row in data.iterrows():
   similar_indices = tfidf_vectorizer_vectors[idx].argsort()[:-100:-1] 
   similar_items = [(tfidf_vectorizer_vectors[idx][i], data['game_name'][i]) for i in similar_indices] 
   results[row['id']] = similar_items[1:]

AttributeError: argsort not found

In [7]:
# get the first vector out (for the first document)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)


Unnamed: 0,tfidf
skyrim,0.264183
scrolls,0.259900
elder,0.256069
virtual,0.201381
want,0.174213
...,...
fazbear,0.000000
faye,0.000000
faydwer,0.000000
favours,0.000000


In [8]:
tf_idf_dict = dict()
for x in range(len(all_data_processed)):
    tf_idf_dict[all_data_processed[x]] = tfidf_vectorizer_vectors[x]
tf_idf_dict

{'The Elder Scrolls V Skyrim': <1x20891 sparse matrix of type '<class 'numpy.float64'>'
 	with 70 stored elements in Compressed Sparse Row format>,
 'Fallout 4': <1x20891 sparse matrix of type '<class 'numpy.float64'>'
 	with 16 stored elements in Compressed Sparse Row format>,
 'Fallout New Vegas': <1x20891 sparse matrix of type '<class 'numpy.float64'>'
 	with 34 stored elements in Compressed Sparse Row format>,
 'Left 4 Dead 2': <1x20891 sparse matrix of type '<class 'numpy.float64'>'
 	with 48 stored elements in Compressed Sparse Row format>,
 'HuniePop': <1x20891 sparse matrix of type '<class 'numpy.float64'>'
 	with 33 stored elements in Compressed Sparse Row format>,
 'Path of Exile': <1x20891 sparse matrix of type '<class 'numpy.float64'>'
 	with 93 stored elements in Compressed Sparse Row format>,
 'Poly Bridge': <1x20891 sparse matrix of type '<class 'numpy.float64'>'
 	with 33 stored elements in Compressed Sparse Row format>,
 'Left 4 Dead': <1x20891 sparse matrix of type '<

In [9]:
tf_idf_dict_pandas = pd.DataFrame(list(tf_idf_dict.items()), columns=['Game Name', 'tf_idf vector'])
tf_idf_dict_pandas

Unnamed: 0,Game Name,tf_idf vector
0,The Elder Scrolls V Skyrim,"(0, 20730)\t0.08226313359032639\n (0, 20585..."
1,Fallout 4,"(0, 20585)\t0.1766533605040369\n (0, 20484)..."
2,Fallout New Vegas,"(0, 20473)\t0.1774632027598725\n (0, 20361)..."
3,Left 4 Dead 2,"(0, 20839)\t0.1005894584704967\n (0, 20286)..."
4,HuniePop,"(0, 20554)\t0.20294699632970287\n (0, 20542..."
...,...,...
4204,The Bug Butcher,"(0, 20433)\t0.07695517854501853\n (0, 20303..."
4205,Warriors & Castles,"(0, 20233)\t0.17794732411828398\n (0, 19523..."
4206,Romance of the Three Kingdoms Maker,"(0, 19223)\t0.14251235047885408\n (0, 18782..."
4207,Space Colony,"(0, 18741)\t0.08907896312244576\n (0, 18656..."


In [10]:
tfidf_vectorizer_vectors.shape

(4209, 20891)

In [11]:
cos = cosine_similarity(tfidf_vectorizer_vectors, tfidf_vectorizer_vectors)
cos

array([[1.        , 0.32801552, 0.00495592, ..., 0.02083611, 0.0192287 ,
        0.00594738],
       [0.32801552, 1.        , 0.        , ..., 0.04118646, 0.0757503 ,
        0.01258519],
       [0.00495592, 0.        , 1.        , ..., 0.        , 0.        ,
        0.01755842],
       ...,
       [0.02083611, 0.04118646, 0.        , ..., 1.        , 0.04982168,
        0.00269423],
       [0.0192287 , 0.0757503 , 0.        , ..., 0.04982168, 1.        ,
        0.10254166],
       [0.00594738, 0.01258519, 0.01755842, ..., 0.00269423, 0.10254166,
        1.        ]])

In [48]:
type(cos)

numpy.ndarray

In [12]:
cos.shape

(4209, 4209)

Idea:<br> No need to use the cosine similiraty matrix that was created. <br> Instead for each player we want to recommend: <br>
1. Take each game vector tf-idf * log(Playing time)
2. Sum all the games to one vector
3. Do cosine similiraty with all other games.
4. Return recommendations.

In [13]:
game_purchase = pd.read_csv('steamData/game_purchase.dat',delimiter='\t')
game_purchase

Unnamed: 0,User_ID,Game_ID,Purchase
0,1,1,1.0
1,1,2,1.0
2,1,3,1.0
3,1,4,1.0
4,1,5,1.0
...,...,...,...
129506,12393,29,1.0
129507,12393,1828,1.0
129508,12393,2049,1.0
129509,12393,1073,1.0


In [14]:
game_play = pd.read_csv('steamData/game_play.dat',delimiter='\t')
game_play

Unnamed: 0,User_ID,Game_ID,Hours
0,1,1,273.0
1,1,2,87.0
2,1,3,14.9
3,1,4,12.1
4,1,5,8.9
...,...,...,...
70484,12393,29,2.4
70485,12393,1828,2.2
70486,12393,2049,1.5
70487,12393,1073,1.5


In [15]:
item_info = pd.read_csv('steamData/item_info.dat',delimiter='\t')
item_info

Unnamed: 0,Game_ID,Game Name
0,1,The Elder Scrolls V Skyrim
1,2,Fallout 4
2,3,Spore
3,4,Fallout New Vegas
4,5,Left 4 Dead 2
...,...,...
5150,5151,Warriors & Castles
5151,5152,Romance of the Three Kingdoms Maker
5152,5153,Space Colony
5153,5154,Life is Hard


In [16]:
tf_idf_dict_pandas_merge = tf_idf_dict_pandas.merge(item_info, how='inner')
tf_idf_dict_pandas_merge

Unnamed: 0,Game Name,tf_idf vector,Game_ID
0,The Elder Scrolls V Skyrim,"(0, 20730)\t0.08226313359032639\n (0, 20585...",1
1,Fallout 4,"(0, 20585)\t0.1766533605040369\n (0, 20484)...",2
2,Fallout New Vegas,"(0, 20473)\t0.1774632027598725\n (0, 20361)...",4
3,Left 4 Dead 2,"(0, 20839)\t0.1005894584704967\n (0, 20286)...",5
4,HuniePop,"(0, 20554)\t0.20294699632970287\n (0, 20542...",6
...,...,...,...
4204,The Bug Butcher,"(0, 20433)\t0.07695517854501853\n (0, 20303...",5150
4205,Warriors & Castles,"(0, 20233)\t0.17794732411828398\n (0, 19523...",5151
4206,Romance of the Three Kingdoms Maker,"(0, 19223)\t0.14251235047885408\n (0, 18782...",5152
4207,Space Colony,"(0, 18741)\t0.08907896312244576\n (0, 18656...",5153


In [17]:
User_ID = 1
specific_user_purchase = game_purchase[game_purchase.User_ID == User_ID]
specific_user_game_play = game_play[game_play.User_ID == User_ID]
specific_user_purchase

Unnamed: 0,User_ID,Game_ID,Purchase
0,1,1,1.0
1,1,2,1.0
2,1,3,1.0
3,1,4,1.0
4,1,5,1.0
5,1,6,1.0
6,1,7,1.0
7,1,8,1.0
8,1,9,1.0
9,1,10,1.0


In [18]:
specific_user_id_name_vector = specific_user_purchase.merge(tf_idf_dict_pandas_merge, on='Game_ID')
specific_user_id_name_vector

Unnamed: 0,User_ID,Game_ID,Purchase,Game Name,tf_idf vector
0,1,1,1.0,The Elder Scrolls V Skyrim,"(0, 20730)\t0.08226313359032639\n (0, 20585..."
1,1,2,1.0,Fallout 4,"(0, 20585)\t0.1766533605040369\n (0, 20484)..."
2,1,4,1.0,Fallout New Vegas,"(0, 20473)\t0.1774632027598725\n (0, 20361)..."
3,1,5,1.0,Left 4 Dead 2,"(0, 20839)\t0.1005894584704967\n (0, 20286)..."
4,1,6,1.0,HuniePop,"(0, 20554)\t0.20294699632970287\n (0, 20542..."
5,1,7,1.0,Path of Exile,"(0, 19963)\t0.03824841846933033\n (0, 19726..."
6,1,8,1.0,Poly Bridge,"(0, 20581)\t0.17449319781073647\n (0, 20402..."
7,1,9,1.0,Left 4 Dead,"(0, 20839)\t0.2911024499925714\n (0, 20815)..."
8,1,10,1.0,Team Fortress 2,"(0, 20308)\t0.12340410820988834\n (0, 19828..."
9,1,11,1.0,Tomb Raider,"(0, 20286)\t0.0950816869833791\n (0, 20147)..."


In [19]:
specific_user_id_name_vector_hours = specific_user_id_name_vector.merge(specific_user_game_play, how='inner')
specific_user_id_name_vector_hours

Unnamed: 0,User_ID,Game_ID,Purchase,Game Name,tf_idf vector,Hours
0,1,1,1.0,The Elder Scrolls V Skyrim,"(0, 20730)\t0.08226313359032639\n (0, 20585...",273.0
1,1,2,1.0,Fallout 4,"(0, 20585)\t0.1766533605040369\n (0, 20484)...",87.0
2,1,4,1.0,Fallout New Vegas,"(0, 20473)\t0.1774632027598725\n (0, 20361)...",12.1
3,1,5,1.0,Left 4 Dead 2,"(0, 20839)\t0.1005894584704967\n (0, 20286)...",8.9
4,1,6,1.0,HuniePop,"(0, 20554)\t0.20294699632970287\n (0, 20542...",8.5
5,1,7,1.0,Path of Exile,"(0, 19963)\t0.03824841846933033\n (0, 19726...",8.1
6,1,8,1.0,Poly Bridge,"(0, 20581)\t0.17449319781073647\n (0, 20402...",7.5
7,1,9,1.0,Left 4 Dead,"(0, 20839)\t0.2911024499925714\n (0, 20815)...",3.3
8,1,10,1.0,Team Fortress 2,"(0, 20308)\t0.12340410820988834\n (0, 19828...",2.8
9,1,11,1.0,Tomb Raider,"(0, 20286)\t0.0950816869833791\n (0, 20147)...",2.5


In [20]:
test_vector = specific_user_id_name_vector_hours['tf_idf vector'][0]
test_vector

<1x20891 sparse matrix of type '<class 'numpy.float64'>'
	with 70 stored elements in Compressed Sparse Row format>

In [21]:
# Convert this to function for creating data frame to multiply the game time
#place tf-idf values in a pandas data frame 
df = pd.DataFrame(test_vector.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
skyrim,0.264183
scrolls,0.259900
elder,0.256069
virtual,0.201381
want,0.174213
...,...
fazbear,0.000000
faye,0.000000
faydwer,0.000000
favours,0.000000


In [22]:
def convert_to_pandas(encoded_vector):
    df = pd.DataFrame(encoded_vector.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
    return df.sort_values(by=["tfidf"],ascending=False)
    

In [23]:
convert_to_pandas(test_vector)

Unnamed: 0,tfidf
skyrim,0.264183
scrolls,0.259900
elder,0.256069
virtual,0.201381
want,0.174213
...,...
fazbear,0.000000
faye,0.000000
faydwer,0.000000
favours,0.000000


In [24]:
user_vector_data = specific_user_id_name_vector_hours[['tf_idf vector', 'Hours']]
user_vector_data

Unnamed: 0,tf_idf vector,Hours
0,"(0, 20730)\t0.08226313359032639\n (0, 20585...",273.0
1,"(0, 20585)\t0.1766533605040369\n (0, 20484)...",87.0
2,"(0, 20473)\t0.1774632027598725\n (0, 20361)...",12.1
3,"(0, 20839)\t0.1005894584704967\n (0, 20286)...",8.9
4,"(0, 20554)\t0.20294699632970287\n (0, 20542...",8.5
5,"(0, 19963)\t0.03824841846933033\n (0, 19726...",8.1
6,"(0, 20581)\t0.17449319781073647\n (0, 20402...",7.5
7,"(0, 20839)\t0.2911024499925714\n (0, 20815)...",3.3
8,"(0, 20308)\t0.12340410820988834\n (0, 19828...",2.8
9,"(0, 20286)\t0.0950816869833791\n (0, 20147)...",2.5


In [25]:
user_vector_data_tf_idf_vectors = user_vector_data['tf_idf vector']
user_vector_data_tf_idf_vectors

0       (0, 20730)\t0.08226313359032639\n  (0, 20585...
1       (0, 20585)\t0.1766533605040369\n  (0, 20484)...
2       (0, 20473)\t0.1774632027598725\n  (0, 20361)...
3       (0, 20839)\t0.1005894584704967\n  (0, 20286)...
4       (0, 20554)\t0.20294699632970287\n  (0, 20542...
5       (0, 19963)\t0.03824841846933033\n  (0, 19726...
6       (0, 20581)\t0.17449319781073647\n  (0, 20402...
7       (0, 20839)\t0.2911024499925714\n  (0, 20815)...
8       (0, 20308)\t0.12340410820988834\n  (0, 19828...
9       (0, 20286)\t0.0950816869833791\n  (0, 20147)...
10      (0, 19979)\t0.2030633935121336\n  (0, 19955)...
11      (0, 19409)\t0.11057632142316463\n  (0, 18783...
12      (0, 20002)\t0.1549131750328819\n  (0, 19059)...
13      (0, 20865)\t0.08434757815068226\n  (0, 20733...
14      (0, 18039)\t0.1928224416563392\n  (0, 17622)...
15      (0, 20029)\t0.20753281262532175\n  (0, 18805...
16      (0, 18975)\t0.1438388950990718\n  (0, 18370)...
17      (0, 20539)\t0.13627870716481308\n  (0, 2

In [26]:
user_vector_data_tf_play_hours = user_vector_data['Hours']
user_vector_data_tf_play_hours

0     273.0
1      87.0
2      12.1
3       8.9
4       8.5
5       8.1
6       7.5
7       3.3
8       2.8
9       2.5
10      2.0
11      1.4
12      1.3
13      1.3
14      0.8
15      0.6
16      0.5
17      0.5
18      0.5
19      0.5
20      0.5
21      0.4
22      0.1
23      0.1
Name: Hours, dtype: float64

In [27]:
user_vector = float()
if len(user_vector_data_tf_play_hours) == len(user_vector_data_tf_idf_vectors):
    for index in range(len(user_vector_data_tf_play_hours)):
        game_vector = convert_to_pandas(user_vector_data_tf_idf_vectors[index])
        new_vector = game_vector * math.log(10, user_vector_data_tf_play_hours[index])
        user_vector += new_vector

In [28]:
user_vector

Unnamed: 0,tfidf
skyrim,0.252559
scrolls,0.248465
elder,0.244803
virtual,0.082663
want,0.071511
...,...
fazbear,0.000000
faye,0.000000
faydwer,0.000000
favours,0.000000


In [29]:
tfidf_vectorizer_vectors

<4209x20891 sparse matrix of type '<class 'numpy.float64'>'
	with 173884 stored elements in Compressed Sparse Row format>

In [30]:
t = user_vector.to_numpy()
t

array([[0.25255916],
       [0.24846501],
       [0.24480266],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [31]:
user_vector_updated = np.rot90(t, 3)
user_vector_updated = np.flip(user_vector_updated)
user_vector_updated.shape

(1, 20891)

In [35]:
tfidf_vectorizer_vectors

<4209x20891 sparse matrix of type '<class 'numpy.float64'>'
	with 173884 stored elements in Compressed Sparse Row format>

In [33]:
user_vector_updated.shape

(1, 20891)