In [92]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from random import randrange, sample
from tqdm import tqdm

df = pd.read_csv('steam-200k.csv')
df.columns = ['id','game','state','hours_played','0']
df['id'].nunique()
df = df.drop('0',axis=1)
df.isnull().values.any()
bought_games = df.loc[df['state']=='purchase']
played_games = df.loc[df['state']=='play']
played_games_orig = played_games.drop(['state'],axis=1)


We want to see how well we can recommend games using frequent itemset

In [93]:
bought_grouped = bought_games.drop('hours_played', axis=1)
bought_grouped = bought_grouped.drop('state', axis=1)
bought_grouped = bought_grouped.groupby('id')['game'].apply(list).reset_index(name='games')
bought_grouped.columns = ['id','games_bought']
bought_grouped = bought_grouped.set_index('id')
bought_grouped


Unnamed: 0_level_0,games_bought
id,Unnamed: 1_level_1
5250,"[Cities Skylines, Deus Ex Human Revolution, Po..."
76767,"[Counter-Strike, Call of Duty World at War, To..."
86540,"[The Elder Scrolls V Skyrim, Audiosurf, XCOM E..."
103360,"[Counter-Strike, Counter-Strike Condition Zero..."
144736,"[Counter-Strike, Day of Defeat, Deathmatch Cla..."
...,...
309554670,[Mitos.is The Game]
309626088,[Age of Empires II HD Edition]
309812026,"[Counter-Strike Nexon Zombies, Robocraft]"
309824202,[Dota 2]


In [94]:
grouped = played_games_orig.drop('hours_played', axis=1)
grouped = grouped.groupby('id')['game'].apply(list).reset_index(name='games')
grouped

Unnamed: 0,id,games
0,5250,"[Cities Skylines, Deus Ex Human Revolution, Po..."
1,76767,"[Counter-Strike, Call of Duty World at War, To..."
2,86540,"[The Elder Scrolls V Skyrim, Audiosurf, XCOM E..."
3,144736,[Counter-Strike]
4,181212,"[Counter-Strike, Half-Life 2 Lost Coast]"
...,...,...
11345,309434439,[Dota 2]
11346,309554670,[Mitos.is The Game]
11347,309626088,[Age of Empires II HD Edition]
11348,309824202,[Dota 2]


In [95]:
joined = grouped.join(bought_grouped,on='id',lsuffix='_played', rsuffix='_bought')
joined


Unnamed: 0,id,games,games_bought
0,5250,"[Cities Skylines, Deus Ex Human Revolution, Po...","[Cities Skylines, Deus Ex Human Revolution, Po..."
1,76767,"[Counter-Strike, Call of Duty World at War, To...","[Counter-Strike, Call of Duty World at War, To..."
2,86540,"[The Elder Scrolls V Skyrim, Audiosurf, XCOM E...","[The Elder Scrolls V Skyrim, Audiosurf, XCOM E..."
3,144736,[Counter-Strike],"[Counter-Strike, Day of Defeat, Deathmatch Cla..."
4,181212,"[Counter-Strike, Half-Life 2 Lost Coast]","[Counter-Strike, Half-Life 2 Lost Coast, Count..."
...,...,...,...
11345,309434439,[Dota 2],[Dota 2]
11346,309554670,[Mitos.is The Game],[Mitos.is The Game]
11347,309626088,[Age of Empires II HD Edition],[Age of Empires II HD Edition]
11348,309824202,[Dota 2],[Dota 2]


In [96]:

te = TransactionEncoder()
te_ary = te.fit(grouped['games']).transform(grouped['games'])
df = pd.DataFrame(te_ary, columns=te.columns_)
df
fis = fpgrowth(df, min_support=0.02, use_colnames=True)
fis = fis.sort_values(by=['support'], ascending=False)
fis

Unnamed: 0,support,itemsets
0,0.426520,(Dota 2)
1,0.204670,(Team Fortress 2)
4,0.121322,(Counter-Strike Global Offensive)
15,0.094185,(Unturned)
10,0.070573,(Left 4 Dead 2)
...,...,...
72,0.020264,"(Warframe, Dota 2)"
26,0.020264,(No More Room in Hell)
37,0.020088,"(Counter-Strike Source, Dota 2)"
38,0.020000,"(Counter-Strike, Counter-Strike Global Offensive)"


In [97]:
fis = fpgrowth(df, min_support=0.005, use_colnames=True)
fis = fis.sort_values(by=['support'], ascending=False)
fis

Unnamed: 0,support,itemsets
0,0.426520,(Dota 2)
1,0.204670,(Team Fortress 2)
6,0.121322,(Counter-Strike Global Offensive)
39,0.094185,(Unturned)
20,0.070573,(Left 4 Dead 2)
...,...,...
2128,0.005022,"(Team Fortress 2, The Binding of Isaac, Sid Me..."
2933,0.005022,"(Don't Starve Together Beta, The Elder Scrolls..."
2940,0.005022,"(Terraria, Team Fortress 2, Don't Starve Toget..."
2941,0.005022,"(Team Fortress 2, Don't Starve Together Beta, ..."


In [98]:
ar = association_rules(fis, metric="confidence", min_threshold=0.9)
ar.sort_values(by=['lift'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
35,"(Call of Duty Black Ops - Multiplayer, Call of...","(Call of Duty Black Ops, Call of Duty Modern W...",0.007841,0.008458,0.007313,0.932584,110.258661,0.007246,14.707871
36,"(Call of Duty Black Ops - Multiplayer, Call of...","(Call of Duty Modern Warfare 2, Call of Duty B...",0.008106,0.008899,0.007313,0.902174,101.382910,0.007241,10.131258
254,"(Half-Life 2, Half-Life 2 Lost Coast, Half-Lif...",(Half-Life 2 Episode Two),0.005639,0.012423,0.005110,0.906250,72.949911,0.005040,10.534156
252,"(Call of Duty Black Ops II - Zombies, Counter-...",(Call of Duty Black Ops II - Multiplayer),0.005286,0.013304,0.005110,0.966667,72.660044,0.005040,29.600881
3,(Call of Duty Black Ops II - Zombies),(Call of Duty Black Ops II - Multiplayer),0.010132,0.013304,0.009515,0.939130,70.590268,0.009381,16.210006
...,...,...,...,...,...,...,...,...,...
10,"(Garry's Mod, Counter-Strike Source, Dota 2)",(Team Fortress 2),0.008899,0.204670,0.008018,0.900990,4.402169,0.006196,8.032837
12,"(Counter-Strike Global Offensive, Left 4 Dead ...",(Team Fortress 2),0.008899,0.204670,0.008018,0.900990,4.402169,0.006196,8.032837
161,"(Alien Swarm, Left 4 Dead 2, Counter-Strike So...",(Team Fortress 2),0.006167,0.204670,0.005551,0.900000,4.397331,0.004288,7.953304
165,"(Unturned, Counter-Strike Global Offensive, Wa...",(Team Fortress 2),0.006167,0.204670,0.005551,0.900000,4.397331,0.004288,7.953304


In [104]:
def recommendation_ar(played_games, fis, ar, bought_games):
    item_set_tmp = frozenset()
    score = 0.0
    for i in range(ar.shape[0]):
        if( set(ar.iloc[i][0]).issubset(set(played_games))):
            #print("antecedent : " + str(list(ar.iloc[i][0])) + " | consequent : " + str(list(ar.iloc[i][1])) + " | confidence : " + str(ar.iloc[i][5]))
            item_set_tmp = frozenset.union(item_set_tmp,ar.iloc[i][1])
            score = score + ar.iloc[i][5]
    #if(one):
    #    for i in range(fis.shape[0]):
    #       if(set(librairy).issubset(set(fis.iloc[i][1]))):
    #            item_set_tmp = frozenset.union(item_set_tmp,fis.iloc[i][1])
    item_set = list(frozenset.difference(item_set_tmp, bought_games))
    return item_set,score

In [103]:

for i in range (1):
    played_games_i = joined.iloc[i][1]
    bought_games_i = joined.iloc[i][2]
    print(played_games_i)
    print(bought_games_i)
    print(recommendation_ar(played_games_i, fis, ar, bought_games_i))
    print('\n')

['Cities Skylines', 'Deus Ex Human Revolution', 'Portal 2', 'Alien Swarm', 'Team Fortress 2', 'Dota 2']
['Cities Skylines', 'Deus Ex Human Revolution', 'Portal 2', 'Alien Swarm', 'Team Fortress 2', 'Dota 2', 'Counter-Strike', 'Counter-Strike Source', 'Day of Defeat', 'Deathmatch Classic', 'Half-Life', 'Half-Life 2', 'Half-Life 2 Deathmatch', 'Half-Life 2 Episode One', 'Half-Life 2 Episode Two', 'Half-Life 2 Lost Coast', 'Half-Life Blue Shift', 'Half-Life Opposing Force', 'Portal', 'Ricochet', 'Team Fortress Classic']
(['Unturned', 'Killing Floor', 'Just Cause 2', 'XCOM Enemy Unknown', 'Magicka', 'Dishonored', 'Left 4 Dead', 'Torchlight II', 'PAYDAY The Heist', 'Left 4 Dead 2', 'The Elder Scrolls V Skyrim', 'Tomb Raider', "Sid Meier's Civilization V", 'Borderlands 2', 'Dead Island', 'Castle Crashers', 'BioShock Infinite', 'Terraria', 'Grand Theft Auto IV', 'Fallout New Vegas', "Mirror's Edge", 'Chivalry Medieval Warfare', 'The Binding of Isaac', 'Metro 2033', 'Trine 2', 'Far Cry 3', 'Co

In [101]:
ar = association_rules(fis, metric="confidence", min_threshold=0.2)
ar.sort_values(by=[ 'lift','confidence'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7933,"(Call of Duty Modern Warfare 2, Call of Duty M...","(Call of Duty Modern Warfare 2 - Multiplayer, ...",0.006608,0.006784,0.005639,0.853333,125.783550,0.005594,6.771926
7934,"(Call of Duty Modern Warfare 2 - Multiplayer, ...","(Call of Duty Modern Warfare 2, Call of Duty M...",0.006784,0.006608,0.005639,0.831169,125.783550,0.005594,5.883938
7932,"(Call of Duty Modern Warfare 2, Call of Duty M...","(Call of Duty Modern Warfare 2 - Multiplayer, ...",0.007048,0.006784,0.005639,0.800000,117.922078,0.005591,4.966079
7935,"(Call of Duty Modern Warfare 2 - Multiplayer, ...","(Call of Duty Modern Warfare 2, Call of Duty M...",0.006784,0.007048,0.005639,0.831169,117.922078,0.005591,5.881328
2974,"(Call of Duty Black Ops - Multiplayer, Call of...","(Call of Duty Black Ops, Call of Duty Modern W...",0.007841,0.008458,0.007313,0.932584,110.258661,0.007246,14.707871
...,...,...,...,...,...,...,...,...,...
237,(Counter-Strike),(Dota 2),0.050044,0.426520,0.014890,0.297535,0.697588,-0.006455,0.816383
349,(Sid Meier's Civilization V),(Dota 2),0.048811,0.426520,0.013216,0.270758,0.634808,-0.007603,0.786406
3752,(Call of Duty Modern Warfare 2 - Multiplayer),(Dota 2),0.025639,0.426520,0.006872,0.268041,0.628438,-0.004063,0.783487
7368,"(Call of Duty Modern Warfare 2, Call of Duty M...",(Dota 2),0.022731,0.426520,0.005727,0.251938,0.590683,-0.003968,0.766621


In [58]:
librairy = ['Cities Skylines']
print(librairy)
print(recommendation_ar(librairy, fis, ar, []))

for i in range (10):
    print(i)
    played_games_i = joined.iloc[i][1]
    bought_games_i = joined.iloc[i][2]
    print(played_games_i)
    print(bought_games_i)
    print(recommendation_ar(played_games_i, fis, ar, bought_games_i))
    print('\n')

['Cities Skylines']
(["Sid Meier's Civilization V"], 0.4956521739130435)
0
['Cities Skylines', 'Deus Ex Human Revolution', 'Portal 2', 'Alien Swarm', 'Team Fortress 2', 'Dota 2']
['Cities Skylines', 'Deus Ex Human Revolution', 'Portal 2', 'Alien Swarm', 'Team Fortress 2', 'Dota 2', 'Counter-Strike', 'Counter-Strike Source', 'Day of Defeat', 'Deathmatch Classic', 'Half-Life', 'Half-Life 2', 'Half-Life 2 Deathmatch', 'Half-Life 2 Episode One', 'Half-Life 2 Episode Two', 'Half-Life 2 Lost Coast', 'Half-Life Blue Shift', 'Half-Life Opposing Force', 'Portal', 'Ricochet', 'Team Fortress Classic']
(['Unturned', 'Killing Floor', 'Just Cause 2', 'XCOM Enemy Unknown', 'Magicka', 'Dishonored', 'Left 4 Dead', 'Torchlight II', 'PAYDAY The Heist', 'Left 4 Dead 2', 'The Elder Scrolls V Skyrim', 'Tomb Raider', "Sid Meier's Civilization V", 'Borderlands 2', 'Dead Island', 'Castle Crashers', 'BioShock Infinite', 'Terraria', 'Grand Theft Auto IV', 'Fallout New Vegas', "Mirror's Edge", 'Chivalry Medieval 

We can first see that for supports of 10 to 2% almost all possible recommendations are for popular games. Which is not surprising since playtime isn't taken into account and there are a lot of different game. To see all possible games we need to lower the support a lot.
Even for lower support, the games recommended aren't very accurate : the second user doesn't get recommended games from Call of Duty nor the Total War franchise.
Even after lowering the support, we need to find a threshold for the confidence that give the right recommendations but also give at least one recommendation for any list of games.
As we can see we get more recommendations when the thershold is lower but too low and some recommendations don't make anysenses. Too high no recommendations are made for some lists of games

Let's see if taking into account the time spent playing a game can help make better recommendations.
If a player hasn't put more hours into a game than the average player, then we'll consider that he didn't really like the game.

In [59]:
avg = played_games_orig.groupby('game')['hours_played'].apply(np.average).reset_index(name='avg_hours_played')
avg

Unnamed: 0,game,avg_hours_played
0,007 Legends,0.700000
1,0RBITALIS,0.400000
2,1... 2... 3... KICK IT! (Drop That Beat Like a...,4.000000
3,10 Second Ninja,2.950000
4,10000000,3.600000
...,...,...
3595,rymdkapsel,1.100000
3596,sZone-Online,0.977586
3597,the static speaks my name,0.250000
3598,theHunter,2.493548


In [60]:
#Dictionnary of average time played
avg_dict = avg.set_index('game').to_dict()['avg_hours_played']
avg_dict

{'007 Legends': 0.7,
 '0RBITALIS': 0.39999999999999997,
 '1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)': 4.0,
 '10 Second Ninja': 2.95,
 '10,000,000': 3.6,
 '100% Orange Juice': 8.700000000000001,
 '1000 Amps': 0.1,
 '12 Labours of Hercules': 4.957142857142856,
 '12 Labours of Hercules II The Cretan Bull': 4.2875,
 '12 Labours of Hercules III Girl Power': 6.533333333333334,
 '140': 1.26,
 '15 Days': 0.5,
 '16bit Trader': 4.45,
 '1701 A.D. Sunken Dragon': 0.6,
 '18 Wheels of Steel American Long Haul': 5.8,
 '1953 NATO vs Warsaw Pact': 5.9,
 '1Quest': 2.7,
 '3 Stars of Destiny': 4.7,
 '3089 -- Futuristic Action RPG': 0.3,
 '3D Mini Golf': 1.0,
 '3DMark': 13.377777777777778,
 '3DMark 11': 3.0,
 '3DMark Vantage': 21.0,
 '4 Elements': 14.85,
 '404Sight': 0.5285714285714286,
 '500 Years Act 1': 3.0,
 '60 Seconds!': 1.86,
 '7 Days to Die': 41.586734693877546,
 '8BitBoy': 3.4,
 '8BitMMO': 0.4555555555555555,
 '9 Clues 2 The Ward': 5.1,
 '9 Clues The Secret of Serpent Creek': 2.8,

In [61]:
#To get the avg time played for a game
avg_dict['Dota 2']

202.78549886387108

In [62]:
#Select only the rows where players played more than average
played_games_sup_avg = played_games_orig.loc[played_games_orig['hours_played'] > played_games_orig['game'].astype(str).map(avg_dict)]
played_games_sup_avg.sort_values(by='id')

Unnamed: 0,id,game,hours_played
65423,5250,Cities Skylines,144.0
65425,5250,Deus Ex Human Revolution,62.0
55918,76767,Total War ATTILA,207.0
55920,76767,Call of Duty Modern Warfare 2 - Multiplayer,165.0
55922,76767,Call of Duty Modern Warfare 2,65.0
...,...,...,...
150248,307631446,MEDIEVAL Total War - Gold Edition,14.2
177078,307688442,Hurtworld,65.0
96358,307701164,Emily is Away,0.9
117507,309255941,Mitos.is The Game,5.7


In [63]:
g = played_games_sup_avg.drop('hours_played', axis=1)
g = g.groupby('id')['game'].apply(list).reset_index(name='games')
print(g)

             id                                              games
0          5250        [Cities Skylines, Deus Ex Human Revolution]
1         76767  [Counter-Strike, Call of Duty World at War, To...
2         86540  [The Elder Scrolls V Skyrim, Audiosurf, Killer...
3        229911  [Counter-Strike Condition Zero, Call of Duty M...
4        298950  [Team Fortress 2, Counter-Strike Global Offens...
...         ...                                                ...
4237  307631446                [MEDIEVAL Total War - Gold Edition]
4238  307688442                                        [Hurtworld]
4239  307701164                                    [Emily is Away]
4240  309255941                                [Mitos.is The Game]
4241  309554670                                [Mitos.is The Game]

[4242 rows x 2 columns]


In [65]:
j = g.join(bought_grouped,on='id',lsuffix='_played', rsuffix='_bought')
j

Unnamed: 0,id,games,games_bought
0,5250,"[Cities Skylines, Deus Ex Human Revolution]","[Cities Skylines, Deus Ex Human Revolution, Po..."
1,76767,"[Counter-Strike, Call of Duty World at War, To...","[Counter-Strike, Call of Duty World at War, To..."
2,86540,"[The Elder Scrolls V Skyrim, Audiosurf, Killer...","[The Elder Scrolls V Skyrim, Audiosurf, XCOM E..."
3,229911,"[Counter-Strike Condition Zero, Call of Duty M...","[Counter-Strike Condition Zero, Call of Duty M..."
4,298950,"[Team Fortress 2, Counter-Strike Global Offens...","[Team Fortress 2, Counter-Strike Global Offens..."
...,...,...,...
4237,307631446,[MEDIEVAL Total War - Gold Edition],[MEDIEVAL Total War - Gold Edition]
4238,307688442,[Hurtworld],"[Hurtworld, Metal War Online Retribution, Warf..."
4239,307701164,[Emily is Away],[Emily is Away]
4240,309255941,[Mitos.is The Game],[Mitos.is The Game]


In [69]:

te2 = TransactionEncoder()
te_ary2 = te2.fit(g['games']).transform(g['games'])
df2 = pd.DataFrame(te_ary2, columns=te2.columns_)
fis2 = fpgrowth(df2, min_support=0.001, use_colnames=True)
fis2 = fis2.sort_values(by=['support'], ascending=False)
fis2

Unnamed: 0,support,itemsets
123,0.202263,(Dota 2)
11,0.093352,(Counter-Strike Global Offensive)
12,0.081094,(Team Fortress 2)
478,0.059642,(Unturned)
8,0.049033,(The Elder Scrolls V Skyrim)
...,...,...
1590,0.001179,"(Portal 2, Magicka, Alien Swarm)"
2422,0.001179,"(PAYDAY 2, Middle-earth Shadow of Mordor)"
2421,0.001179,"(ARK Survival Evolved, Middle-earth Shadow of ..."
984,0.001179,"(Far Cry 3, Call of Duty Modern Warfare 2 - Mu..."


We can now see that with the time played taken into account, we need a lower support to have as many itemsets.
This is normal as game don't appear as frequently in the list of game of players.

In [70]:
ar2 = association_rules(fis2, metric="confidence", min_threshold=0.2)
ar2 = ar2.sort_values(by=['confidence'], ascending=False)
ar2 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1378,"(Empire Total War, Total War ATTILA)","(Total War ROME II - Emperor Edition, Total Wa...",0.001414,0.005186,0.001414,1.0,192.818182,0.001407,inf
1408,"(Tomb Raider, The Witcher Enhanced Edition)",(The Witcher 2 Assassins of Kings Enhanced Edi...,0.001414,0.011551,0.001414,1.0,86.571429,0.001398,inf
1859,(Thief Gold),(The Elder Scrolls V Skyrim),0.001179,0.049033,0.001179,1.0,20.394231,0.001121,inf
564,"(Left 4 Dead, Alien Swarm)",(Left 4 Dead 2),0.001886,0.036304,0.001886,1.0,27.545455,0.001817,inf
2060,"(Total War SHOGUN 2, Total War ROME II - Emper...",(Empire Total War),0.001179,0.014851,0.001179,1.0,67.333333,0.001161,inf
...,...,...,...,...,...,...,...,...,...
1411,(The Witcher Enhanced Edition),"(Tomb Raider, The Witcher 2 Assassins of Kings...",0.007072,0.002357,0.001414,0.2,84.840000,0.001398,1.247053
1407,(The Witcher Enhanced Edition),"(Portal, The Witcher 2 Assassins of Kings Enha...",0.007072,0.001886,0.001414,0.2,106.050000,0.001401,1.247643
1403,(The Witcher Enhanced Edition),"(The Elder Scrolls V Skyrim, The Witcher 2 Ass...",0.007072,0.004243,0.001414,0.2,47.133333,0.001384,1.244696
1399,(The Witcher Enhanced Edition),(Magicka),0.007072,0.015323,0.001414,0.2,13.052308,0.001306,1.230846


For the same threshold as above, we can get a lot less possible recommendations (less  rows ). 

In [72]:
librairy = ['Cities Skylines']
print(librairy)
print(recommendation_ar(librairy, fis2, ar2, []))


for i in range (10):
    played_games_i = j.iloc[i][1]
    bought_games_i = j.iloc[i][2]
    print(played_games_i)
    #print(bought_games_i)
    print(recommendation_ar(played_games_i, fis2, ar2, bought_games_i))
    print('\n')
    print('\n')

['Cities Skylines']
(['Prison Architect'], 0.2)
['Cities Skylines', 'Deus Ex Human Revolution']
(['Fallout New Vegas', 'Dishonored', 'Metro 2033', 'The Elder Scrolls V Skyrim', 'Prison Architect'], 1.7799999999999998)




['Counter-Strike', 'Call of Duty World at War', 'Total War ATTILA', 'Call of Duty Modern Warfare 2 - Multiplayer', 'Call of Duty Modern Warfare 2', 'Call of Duty Black Ops']
(['Empire Total War', 'Team Fortress 2', 'Total War ROME II - Emperor Edition', 'Total War SHOGUN 2', 'PAYDAY 2'], 9.151715737698268)




['The Elder Scrolls V Skyrim', 'Audiosurf', 'Killer is Dead']
(['Team Fortress 2'], 0.875)




['Counter-Strike Condition Zero', 'Call of Duty Modern Warfare 2']
(['Call of Duty Modern Warfare 3'], 1.1924907456372291)




['Team Fortress 2', 'Counter-Strike Global Offensive', 'The Elder Scrolls V Skyrim', 'Far Cry 3', 'Fallout New Vegas', 'The Witcher 3 Wild Hunt', 'Terraria', 'Fallout 4', 'Borderlands 2', 'DARK SOULS II Scholar of the First Sin', 'Endless Legen

However, even with less possible recommendations, the games that are recommended are very good : for the second user, other COD and Total war games are recommended.
We do have a slight issue : 
For the second user Team Fortress 2 was recommended even though the user didn't play other games made by Valve(like Half Life or Portal or Counter Strike). 
We can actually see that some very popular games still appear in recommendations even if the user didn't like anything related to those games.
This is because the confidence threshold is low enough to allow popular games to be recommended because they are popular.
To fix this issue, we can increase the threshold, but doing so would remove some recommendations. Players who like only Counter-Strike Global Offensive, won't be recommended anything for exemple.

Let's evaluate now our model

In [74]:
geval = j
print(geval)
geval = geval.sample(frac=1, random_state=42)
train_nb = int(geval.shape[0] / 100 * 80)
print(train_nb)
X_train = geval[:train_nb]
X_test = geval[train_nb:]
print(X_train)
X_test

             id                                              games  \
0          5250        [Cities Skylines, Deus Ex Human Revolution]   
1         76767  [Counter-Strike, Call of Duty World at War, To...   
2         86540  [The Elder Scrolls V Skyrim, Audiosurf, Killer...   
3        229911  [Counter-Strike Condition Zero, Call of Duty M...   
4        298950  [Team Fortress 2, Counter-Strike Global Offens...   
...         ...                                                ...   
4237  307631446                [MEDIEVAL Total War - Gold Edition]   
4238  307688442                                        [Hurtworld]   
4239  307701164                                    [Emily is Away]   
4240  309255941                                [Mitos.is The Game]   
4241  309554670                                [Mitos.is The Game]   

                                           games_bought  
0     [Cities Skylines, Deus Ex Human Revolution, Po...  
1     [Counter-Strike, Call of Duty World a

Unnamed: 0,id,games,games_bought
3540,209804474,[No More Room in Hell],"[Counter-Strike Global Offensive, No More Room..."
2724,161816812,"[Counter-Strike Global Offensive, Dota 2, Garr...","[Counter-Strike Global Offensive, Dota 2, Garr..."
3044,181549899,[Dead Island],"[Dead Island, Garry's Mod]"
3094,183801719,[Dota 2],[Dota 2]
1460,95586525,"[Call of Duty Modern Warfare 3 - Multiplayer, ...","[Call of Duty Modern Warfare 3 - Multiplayer, ..."
...,...,...,...
3444,203712998,[Football Manager 2015],[Football Manager 2015]
466,44240866,"[Call of Duty Modern Warfare 2 - Multiplayer, ...","[Call of Duty Modern Warfare 2 - Multiplayer, ..."
3092,183574382,[Dino D-Day],"[PAYDAY The Heist, Dino D-Day, Really Big Sky,..."
3772,233532285,[Grand Theft Auto V],[Grand Theft Auto V]


In [75]:
te3 = TransactionEncoder()
te_ary3 = te3.fit(X_train['games']).transform(X_train['games'])
dftrain = pd.DataFrame(te_ary3, columns=te3.columns_)
fistrain = fpgrowth(dftrain, min_support=0.001, use_colnames=True)
fistrain = fistrain.sort_values(by=['support'], ascending=False)
fistrain

Unnamed: 0,support,itemsets
2,0.206012,(Dota 2)
3,0.091954,(Counter-Strike Global Offensive)
56,0.078691,(Team Fortress 2)
119,0.058650,(Unturned)
7,0.048630,(The Elder Scrolls V Skyrim)
...,...,...
1369,0.001179,"(Hotline Miami, Prison Architect)"
1370,0.001179,"(Deus Ex Human Revolution, Prison Architect)"
1371,0.001179,"(Prison Architect, Saints Row The Third)"
2754,0.001179,"(Fallout 3 - Game of the Year Edition, Mass Ef..."


In [76]:
artrain = association_rules(fistrain, metric="confidence", min_threshold=0.2)
artrain = artrain.sort_values(by=['antecedent support'], ascending=False)
artrain

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Counter-Strike Global Offensive),(Dota 2),0.091954,0.206012,0.020041,0.217949,1.057940,0.001098,1.015263
1,(Left 4 Dead 2),(Team Fortress 2),0.038904,0.078691,0.012084,0.310606,3.947140,0.009022,1.336404
2,(Garry's Mod),(Counter-Strike Global Offensive),0.035367,0.091954,0.010315,0.291667,3.171875,0.007063,1.281947
4,(Garry's Mod),(Team Fortress 2),0.035367,0.078691,0.009726,0.275000,3.494663,0.006943,1.270770
14,(Garry's Mod),(Unturned),0.035367,0.058650,0.007663,0.216667,3.694221,0.005589,1.201723
...,...,...,...,...,...,...,...,...,...
2158,"(Team Fortress 2, Batman Arkham City GOTY)",(Left 4 Dead 2),0.001179,0.038904,0.001179,1.000000,25.704545,0.001133,inf
4101,"(Half-Life 2, Dishonored, Tomb Raider)",(The Elder Scrolls V Skyrim),0.001179,0.048630,0.001179,1.000000,20.563636,0.001122,inf
4102,"(Half-Life 2, Dishonored, The Elder Scrolls V ...",(Tomb Raider),0.001179,0.017094,0.001179,1.000000,58.500000,0.001159,inf
3350,"(Left 4 Dead, Left 4 Dead 2, LIMBO)",(Team Fortress 2),0.001179,0.078691,0.001179,1.000000,12.707865,0.001086,inf


In [81]:

X_test_formatted = X_test

def remove_n_random_items(lst, n):
    if(len(lst)>1):
        to_delete = set(sample(range(len(lst)), n))

        return [
            item for index, item in enumerate(lst)
            if not index in to_delete
        ]


X_test_formatted['incomplet_games'] = X_test_formatted['games'].apply(lambda x : remove_n_random_items(x,1))
X_test_formatted = X_test_formatted.dropna()
print(X_test_formatted)



             id                                              games  \
2724  161816812  [Counter-Strike Global Offensive, Dota 2, Garr...   
1460   95586525  [Call of Duty Modern Warfare 3 - Multiplayer, ...   
2818  167013261                            [Portal, Dragon's Lair]   
1893  117022366  [Euro Truck Simulator 2, Call of Duty 4 Modern...   
2000  122620165                [Dota 2, Sir, You Are Being Hunted]   
...         ...                                                ...   
769    60217594  [Counter-Strike, Counter-Strike Global Offensi...   
1685  106834867  [Team Fortress 2, Portal 2, Half-Life, Half-Li...   
130    14906179  [Counter-Strike Source, Half-Life 2 Deathmatch...   
3171  188681523  [Counter-Strike Global Offensive, PAYDAY 2, In...   
466    44240866  [Call of Duty Modern Warfare 2 - Multiplayer, ...   

                                           games_bought  \
2724  [Counter-Strike Global Offensive, Dota 2, Garr...   
1460  [Call of Duty Modern Warfare 3 - Mu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_formatted['incomplet_games'] = X_test_formatted['games'].apply(lambda x : remove_n_random_items(x,1))


In [90]:
true_pos = 0
false_pos = 0

for i in tqdm(range(X_test_formatted.shape[0])):
    item_removed = set(X_test_formatted.iloc[i][1]).difference(set(X_test_formatted.iloc[i][3]))
    recommended_games,score = recommendation_ar(X_test_formatted.iloc[i][3],fistrain,artrain, set(X_test_formatted.iloc[i][2]).difference(set(item_removed)))
    if(item_removed.issubset(set(recommended_games))):
        true_pos = true_pos + 1
    else :
        false_pos = false_pos + 1

precision = true_pos/(true_pos+false_pos)
print(true_pos)
print(false_pos)
print(precision)

100%|██████████| 447/447 [02:26<00:00,  3.05it/s]

101
346
0.22595078299776286





Pretty good, but if we recommend the list of all possible games each time, we pretty much get a perfect score so maybe there's a better evaluation metric to use ?

Idea : We don't actually care that the recommended games list is long or not, we should try to rank the games in the recommendation list so that the games first in the list are the most relevant to the user

What changed :
Taken into account the bought games : now we recommend only games that weren't bought


Tentative of improvement : added a score based on the sum of confidence of rules used to create the recommended games 

Issue : not very useful as having a big recommendation list gives a higher score.


Let's try to add a ranking to the list of recommended games