In [37]:
import pandas as pd
import numpy as np

In [38]:
steam_data = pd.read_csv('./data/steam-200k.csv', header=None, names=['user_id', 'game', 'behavior', 'hours', '0'])
steam_data = steam_data.drop(columns=['0'])
steam_data = steam_data[steam_data['behavior'] != 'purchase']

In [39]:
steam_data

Unnamed: 0,user_id,game,behavior,hours
1,151603712,The Elder Scrolls V Skyrim,play,273.0
3,151603712,Fallout 4,play,87.0
5,151603712,Spore,play,14.9
7,151603712,Fallout New Vegas,play,12.1
9,151603712,Left 4 Dead 2,play,8.9
...,...,...,...,...
199991,128470551,Fallen Earth,play,2.4
199993,128470551,Magic Duels,play,2.2
199995,128470551,Titan Souls,play,1.5
199997,128470551,Grand Theft Auto Vice City,play,1.5


Nb d'apparitions de chaque jeux dans le dataset steam

In [40]:
steam_data.pivot_table(index = ['game'], aggfunc ='size').sort_values(ascending=False).head(20)

game
Dota 2                             4841
Team Fortress 2                    2323
Counter-Strike Global Offensive    1377
Unturned                           1069
Left 4 Dead 2                       801
Counter-Strike Source               715
The Elder Scrolls V Skyrim          677
Garry's Mod                         666
Counter-Strike                      568
Sid Meier's Civilization V          554
Terraria                            460
Portal 2                            453
Warframe                            424
Portal                              417
Robocraft                           407
PAYDAY 2                            390
Borderlands 2                       386
Half-Life 2                         356
Heroes & Generals                   335
War Thunder                         303
dtype: int64

In [41]:
games_tags = pd.read_csv('./data/t-games-tags.csv')
games_categories = pd.read_csv('./data/t-games-categories.csv')
len(games_tags['tags'].unique())

406

In [42]:
games_tags

Unnamed: 0,app_id,tags,tag_frequencies
0,655370,Indie,109
1,655370,Action,103
2,655370,Pixel Graphics,100
3,655370,2D,97
4,655370,Retro,93
...,...,...,...
777396,2540690,Relaxing,55
777397,2540690,Time Manipulation,39
777398,2540690,Score Attack,25
777399,2540690,Singleplayer,24


In [43]:
np.mean(games_tags.groupby('app_id').count())

12.755151932795169

In [44]:
games_tags.groupby('tags')['tag_frequencies'].sum().sort_values(ascending=False).head(20)

tags
Action            3168269
Adventure         2788974
Singleplayer      2462137
Indie             2436414
Casual            2394110
2D                1772061
Strategy          1551248
Simulation        1517721
RPG               1507638
Atmospheric       1292251
Exploration       1288214
3D                1244670
Puzzle            1126973
Multiplayer       1108033
Early Access      1093119
Colorful          1089550
Story Rich        1089301
First-Person      1019443
Pixel Graphics    1015184
Cute               959851
Name: tag_frequencies, dtype: int64

* hot sur tous les tags, nombre de vote en tant que poids

In [45]:
games_categories.groupby('categories')['categories'].count().sort_values(ascending=False)

categories
Single-player                 58112
Steam Achievements            30650
Steam Cloud                   16086
Full controller support       13634
Multi-player                  12103
Steam Trading Cards            9412
Partial Controller Support     8490
PvP                            7715
Co-op                          6193
Steam Leaderboards             5809
Online PvP                     5470
Remote Play Together           5225
Shared/Split Screen            4726
Online Co-op                   3429
Shared/Split Screen PvP        3414
Stats                          3085
Shared/Split Screen Co-op      2764
Remote Play on TV              1999
Cross-Platform Multiplayer     1860
Includes level editor          1704
Steam Workshop                 1703
In-App Purchases               1426
Captions available             1067
Remote Play on Tablet           844
MMO                             826
Remote Play on Phone            697
LAN PvP                         514
LAN Co-op        

In [46]:
games = pd.read_csv('./data/games.csv')
games

Unnamed: 0,name,release_date,price,positive,negative,app_id,min_owners,max_owners,hltb_single
0,Train Bandit,"Oct 12, 2017",0.99,53,5,655370,0,20000,
1,Henosis™,"Jul 23, 2020",5.99,3,0,1355720,0,20000,
2,Two Weeks in Painland,"Feb 3, 2020",0.00,50,8,1139950,0,20000,
3,Wartune Reborn,"Feb 26, 2021",0.00,87,49,1469160,50000,100000,
4,TD Worlds,"Jan 9, 2022",10.99,21,7,1659180,0,20000,
...,...,...,...,...,...,...,...,...,...
60947,Two Cubes,"Aug 14, 2023",0.99,54,0,2511290,0,20000,
60948,Wisp Child,"Oct 20, 2023",14.99,5,0,2424000,0,20000,
60949,FireKrackers,"Sep 17, 2023",4.99,1,0,2237270,0,20000,
60950,nekowater,"Nov 21, 2023",2.99,2,1,2650840,0,20000,


In [47]:
games_names_to_analyze = steam_data['game'].unique().tolist()
len(games_names_to_analyze)

3600

In [48]:
games_filtered = games[games['name'].isin(games_names_to_analyze)]
games_filtered.reset_index(drop=True, inplace=True)
len(games_filtered)

1292

In [49]:
cols = games_filtered.columns.tolist() + games_tags['tags'].unique().tolist()
games_full_infos = pd.DataFrame(columns=cols)

for index, row in games_filtered.iterrows():
    game_f = pd.DataFrame(columns=cols)
    game_f.loc[0] = row
    
    for tag in games_tags['tags'].unique().tolist():
        game_f.loc[0][tag] = 0
    
    for _, game_t in games_tags[game_f.loc[0]['app_id'] == games_tags["app_id"]].iterrows():
        game_f[game_t['tags']] = game_t['tag_frequencies']
    
    games_full_infos = pd.concat([games_full_infos, game_f])
    
    print("\r", index, "out of", len(games_filtered)-1, end="\r")

games_full_infos.reset_index(drop=True, inplace=True)
games_full_infos.to_csv('./data/games_full_infos.csv', index=False)

 1291 out of 1291

In [50]:
games_full_infos = pd.read_csv('./data/games_full_infos.csv')
games_full_infos = games_full_infos.drop(columns=['release_date', 'price', 'positive', 'negative', 'app_id', 'min_owners', 'max_owners', 'hltb_single'])
games_full_infos.head()

Unnamed: 0,name,Indie,Action,Pixel Graphics,2D,Retro,Arcade,Score Attack,Minimalist,Comedy,...,Sniper,Asymmetric VR,Silent Protagonist,Submarine,Golf,Cycling,Wrestling,Lemmings,Wholesome,Cozy
0,Vanguard Princess,170,150,0,99,0,70,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Deadfall Adventures,27,122,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Numba Deluxe,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Chaos Engine,0,70,0,7,24,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Just Get Through,45,32,19,16,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
games_by_user = steam_data.groupby('user_id').agg({'game': lambda x: list(x)}).reset_index()

In [52]:
filtered_games_by_user = pd.DataFrame(columns=['user_id', 'game'])

for index, row in games_by_user.iterrows():
    if len(row['game']) <= 4:
        continue
    
    game = []
    for i, g in enumerate(row['game']):
        if g in games_full_infos['name'].values:
            game.append(g)
    if len(game) <= 4:
        continue
    filtered_games_by_user = pd.concat([filtered_games_by_user, pd.DataFrame({'user_id': row['user_id'], 'game': [game]})])
    print("\r", index, "out of", len(games_by_user)-1, end="\r")

filtered_games_by_user.reset_index(drop=True, inplace=True)
filtered_games_by_user.head()

 11218 out of 11349

Unnamed: 0,user_id,game
0,298950,"[Fallout 4, Pillars of Eternity, Starbound, Ru..."
1,975449,"[Dota 2, Rogue Legacy, The Stanley Parable, To..."
2,1936551,"[This War of Mine, Prison Architect, The Banne..."
3,1950243,"[FINAL FANTASY VII, Company of Heroes 2, Dunge..."
4,2083767,"[Path of Exile, Clicker Heroes, Tap Tap Infini..."


In [53]:
X_train = []
Y_train = []

for index, row in filtered_games_by_user.iterrows():
    for k in range(3, len(row['game'])):
        tags_array = []
        for game in row['game'][k-3:k]:
            tags = games_full_infos[games_full_infos['name'] == game].drop(columns=['name']).values.tolist()[0]
            tags_array.append(tags)
            
        
        temp=0
        output = []
        
        for i in range(len(tags_array[0])):
            for j in range(len(tags_array)):
                temp += tags_array[j][i]
            output.append(temp/len(tags_array))
            temp = 0
        
        
        
        X_train.append(output)
        Y_train.append(row['game'][k])
        print("\r", index, "out of", len(filtered_games_by_user)-1, end="\r")

 1030 out of 1030

In [54]:
X_test = []
Y_test = []

for index, row in filtered_games_by_user.iterrows():
    tags_array = []
    for game in row['game'][0:3]:
        tags = games_full_infos[games_full_infos['name'] == game].drop(columns=['name']).values.tolist()[0]
        tags_array.append(tags)
    
    temp=0
    output = []
    
    for i in range(len(tags_array[0])):
        for j in range(len(tags_array)):
            temp += tags_array[j][i]
        output.append(temp)
        temp = 0
    
    X_test.append(output)
    Y_test.append(row['game'][4])
    print("\r", index, "out of", len(filtered_games_by_user)-1, end="\r")

 1030 out of 1030602 out of 1030 713 out of 1030 819 out of 1030 929 out of 1030 1027 out of 1030

In [55]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, Y_train)

predictions = rfc.predict_proba(X_test)

In [56]:
verified_predictions = []
games_pred_names = np.unique(Y_train)

for n in range(len(predictions)):
    preds = []
    indexes = []
    for i in range(5):
        max = 0
        index = 0
        for j in range(len(predictions[n])):
            if predictions[n][j] > max and j not in indexes and games_pred_names[j] not in filtered_games_by_user['game'][n][:3]:
                max = predictions[n][j]
                index = j
        indexes.append(index)
        preds.append(games_pred_names[index])
    verified_predictions.append(preds)
verified_predictions

[['BioShock Infinite', 'Rust', 'Age of Wonders III', 'Unturned', 'GRID'],
 ['PAYDAY 2',
  'Tomb Raider',
  'Unturned',
  'Democracy 3',
  'Guacamelee! Gold Edition'],
 ['Craft The World', 'DayZ', 'BioShock Infinite', "Don't Starve", 'Dota 2'],
 ['Arma 3',
  "Don't Starve",
  'Magic Duels',
  'Cook, Serve, Delicious!',
  'Risk of Rain'],
 ['Trove', 'Neverwinter', 'Dota 2', 'Star Conflict', 'Warframe'],
 ['Grand Theft Auto V',
  'Starbound',
  'Dead Space',
  'Surgeon Simulator',
  'Saints Row IV'],
 ['Grand Theft Auto V', 'Unturned', 'DayZ', 'Saints Row IV', 'Robocraft'],
 ['PAYDAY 2', 'Reign Of Kings', 'Dying Light', 'DayZ', 'Unturned'],
 ['Dota 2', 'Unturned', 'DayZ', 'Insurgency', 'Starbound'],
 ['DayZ', 'Prison Architect', 'Dying Light', 'Space Engineers', 'PAYDAY 2'],
 ['Craft The World',
  'Cities XL Platinum',
  'Democracy 3',
  'Tomb Raider',
  'Company of Heroes 2'],
 ['BioShock Infinite',
  'Path of Exile',
  'DRAGON BALL XENOVERSE',
  'DiRT Rally',
  'Dota 2'],
 ['BioShock In

In [57]:
# CUSTOM SCORE FUNCTION
res = []
for i, preds in enumerate(verified_predictions):
    y_true = filtered_games_by_user.loc[i]['game']
    for pred in preds:
        if pred in y_true:
            res.append("FOUND " + pred)
            break

In [58]:
print(round(len(res)/len(predictions)*100, 2), '%')

78.66 %


In [59]:
res

['FOUND BioShock Infinite',
 'FOUND Tomb Raider',
 'FOUND Arma 3',
 'FOUND Dota 2',
 'FOUND Grand Theft Auto V',
 'FOUND Reign Of Kings',
 'FOUND DayZ',
 'FOUND Craft The World',
 'FOUND Dota 2',
 'FOUND Dota 2',
 'FOUND Path of Exile',
 'FOUND Game Dev Tycoon',
 'FOUND Zack Zero',
 'FOUND Warframe',
 'FOUND Prison Architect',
 'FOUND Space Engineers',
 'FOUND Trove',
 'FOUND Tomb Raider',
 'FOUND This War of Mine',
 'FOUND Dungeon Defenders II',
 'FOUND Tomb Raider',
 'FOUND Dota 2',
 'FOUND Grand Theft Auto V',
 'FOUND War Thunder',
 'FOUND PAYDAY 2',
 'FOUND Banished',
 'FOUND Fallout 4',
 "FOUND Don't Starve",
 'FOUND Space Engineers',
 'FOUND PAYDAY 2',
 'FOUND The Stanley Parable',
 'FOUND Kerbal Space Program',
 'FOUND PAYDAY 2',
 'FOUND Fallout 4',
 'FOUND Ironcast',
 'FOUND DayZ',
 'FOUND Time Clickers',
 'FOUND DayZ',
 'FOUND Path of Exile',
 'FOUND PAYDAY 2',
 'FOUND War Thunder',
 'FOUND Surgeon Simulator',
 'FOUND Lethal League',
 'FOUND Saints Row IV',
 'FOUND PAYDAY 2',


In [60]:
from joblib import dump

dump(rfc, 'filename.joblib') 
dump(games_pred_names, 'games_pred_names.joblib')

['games_pred_names.joblib']