In [84]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from random import randrange, sample
from tqdm import tqdm

df = pd.read_csv('steam-200k.csv')
df.columns = ['id','game','state','hours_played','0']
df['id'].nunique()
df = df.drop('0',axis=1)
df.isnull().values.any()
bought_games = df.loc[df['state']=='purchase']
played_games = df.loc[df['state']=='play']
played_games_orig = played_games.drop(['state'],axis=1)


We want to see how well we can recommend games using frequent itemset

In [2]:
grouped = played_games_orig.drop('hours_played', axis=1)
grouped = grouped.groupby('id')['game'].apply(list).reset_index(name='games')
grouped

Unnamed: 0,id,games
0,5250,"[Cities Skylines, Deus Ex Human Revolution, Po..."
1,76767,"[Counter-Strike, Call of Duty World at War, To..."
2,86540,"[The Elder Scrolls V Skyrim, Audiosurf, XCOM E..."
3,144736,[Counter-Strike]
4,181212,"[Counter-Strike, Half-Life 2 Lost Coast]"
...,...,...
11345,309434439,[Dota 2]
11346,309554670,[Mitos.is The Game]
11347,309626088,[Age of Empires II HD Edition]
11348,309824202,[Dota 2]


In [3]:

te = TransactionEncoder()
te_ary = te.fit(grouped['games']).transform(grouped['games'])
df = pd.DataFrame(te_ary, columns=te.columns_)
df
fis = fpgrowth(df, min_support=0.005, use_colnames=True)
fis = fis.sort_values(by=['support'], ascending=False)
fis

Unnamed: 0,support,itemsets
0,0.426520,(Dota 2)
1,0.204670,(Team Fortress 2)
6,0.121322,(Counter-Strike Global Offensive)
39,0.094185,(Unturned)
20,0.070573,(Left 4 Dead 2)
...,...,...
2128,0.005022,"(Team Fortress 2, Sid Meier's Civilization V, ..."
2933,0.005022,"(The Elder Scrolls V Skyrim, Don't Starve Toge..."
2940,0.005022,"(Don't Starve Together Beta, Team Fortress 2, ..."
2941,0.005022,"(Garry's Mod, Team Fortress 2, Don't Starve To..."


In [4]:
ar = association_rules(fis, metric="confidence", min_threshold=0.2)
ar.sort_values(by=['confidence', 'lift'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4747,"(Counter-Strike Condition Zero Deleted Scenes,...",(Counter-Strike),0.006432,0.050044,0.006432,1.000000,19.982394,0.006110,inf
1867,(Counter-Strike Condition Zero Deleted Scenes),(Counter-Strike),0.008282,0.050044,0.008194,0.989362,19.769816,0.007779,89.295859
7369,"(Dota 2, Call of Duty Modern Warfare 2)",(Call of Duty Modern Warfare 2 - Multiplayer),0.005815,0.025639,0.005727,0.984848,38.412475,0.005578,64.307841
10869,"(Counter-Strike Global Offensive, Team Fortres...",(Call of Duty Modern Warfare 2 - Multiplayer),0.005374,0.025639,0.005286,0.983607,38.364036,0.005149,59.436035
12547,"(Insurgency, Counter-Strike Source)",(Counter-Strike Global Offensive),0.005198,0.121322,0.005110,0.983051,8.102852,0.004479,51.842026
...,...,...,...,...,...,...,...,...,...
3693,(PAYDAY 2),"(Counter-Strike Global Offensive, Team Fortres...",0.034361,0.017004,0.006872,0.200000,11.761658,0.006288,1.228744
3641,(PAYDAY 2),"(Left 4 Dead 2, The Elder Scrolls V Skyrim)",0.034361,0.020441,0.006872,0.200000,9.784483,0.006170,1.224449
3728,(PAYDAY 2),"(Dota 2, Team Fortress 2, Unturned)",0.034361,0.022555,0.006872,0.200000,8.867188,0.006097,1.221806
4792,"(Dota 2, Unturned)","(Left 4 Dead 2, Garry's Mod, Team Fortress 2)",0.032159,0.019295,0.006432,0.200000,10.365297,0.005811,1.225881


In [5]:
def recommendation_ar(librairy, fis, ar):
    item_set_tmp = frozenset()

    for i in range(ar.shape[0]):
        if( set(ar.iloc[i][0]).issubset(set(librairy))):
            #print("antecedent : " + str(list(ar.iloc[i][0])) + " | consequent : " + str(list(ar.iloc[i][1])) + " | confidence : " + str(ar.iloc[i][5]))
            item_set_tmp = frozenset.union(item_set_tmp,ar.iloc[i][1])
    #if(one):
    #    for i in range(fis.shape[0]):
    #       if(set(librairy).issubset(set(fis.iloc[i][1]))):
    #            item_set_tmp = frozenset.union(item_set_tmp,fis.iloc[i][1])
    item_set = list(frozenset.difference(item_set_tmp, librairy))
    return item_set

In [6]:
librairy = ['Cities Skylines']
print(librairy)
print(recommendation_ar(librairy, fis, ar))

for i in range (10):
    librairy = grouped.iloc[i][1]
    print(librairy)
    print(recommendation_ar(librairy, fis, ar))
    print('\n')

['Cities Skylines']
["Sid Meier's Civilization V"]
['Cities Skylines', 'Deus Ex Human Revolution', 'Portal 2', 'Alien Swarm', 'Team Fortress 2', 'Dota 2']
['Killing Floor', 'Left 4 Dead', 'BioShock Infinite', 'Tomb Raider', 'Chivalry Medieval Warfare', 'Terraria', 'Left 4 Dead 2', "Sid Meier's Civilization V", 'PAYDAY 2', 'PAYDAY The Heist', 'Borderlands 2', 'Counter-Strike Source', 'Half-Life 2', 'Far Cry 3', 'Saints Row The Third', 'Torchlight II', "Garry's Mod", 'The Elder Scrolls V Skyrim', 'The Binding of Isaac', 'Unturned', 'Trine 2', 'Warframe', 'Portal', 'Grand Theft Auto IV', 'Castle Crashers', 'XCOM Enemy Unknown', 'Just Cause 2', 'Magicka', 'Fallout New Vegas', 'Counter-Strike Global Offensive', 'Dead Island', "Mirror's Edge", 'BioShock', 'Dishonored', 'Metro 2033']


['Counter-Strike', 'Call of Duty World at War', 'Total War ATTILA', 'Call of Duty Modern Warfare 2 - Multiplayer', 'Call of Duty Modern Warfare 2', 'Counter-Strike Source', 'Banished', 'Call of Duty Black Ops',

On peut remarquer que pour des supports allant de 10% à 5% on obtient presque uniquement les jeux les plus joués. Ce qui n'est pas étonnant comme on n'a pas filtré selon les temps de jeux. Obtenir une recommendation par association semble donc être futile.
On va voir s'il est possible de faire .
On va déjà faire rentrer le temps de jeu en paramètre : Si un joueur n'a pas plus d'heure de jeu que la moyenne pour un certain jeu, on ne le prend pas en compte.

In [7]:
avg = played_games_orig.groupby('game')['hours_played'].apply(np.average).reset_index(name='avg_hours_played')
avg

Unnamed: 0,game,avg_hours_played
0,007 Legends,0.700000
1,0RBITALIS,0.400000
2,1... 2... 3... KICK IT! (Drop That Beat Like a...,4.000000
3,10 Second Ninja,2.950000
4,10000000,3.600000
...,...,...
3595,rymdkapsel,1.100000
3596,sZone-Online,0.977586
3597,the static speaks my name,0.250000
3598,theHunter,2.493548


In [8]:
#Dictionnary of average time played
avg_dict = avg.set_index('game').to_dict()['avg_hours_played']
avg_dict

{'007 Legends': 0.7,
 '0RBITALIS': 0.39999999999999997,
 '1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby)': 4.0,
 '10 Second Ninja': 2.95,
 '10,000,000': 3.6,
 '100% Orange Juice': 8.700000000000001,
 '1000 Amps': 0.1,
 '12 Labours of Hercules': 4.957142857142856,
 '12 Labours of Hercules II The Cretan Bull': 4.2875,
 '12 Labours of Hercules III Girl Power': 6.533333333333334,
 '140': 1.26,
 '15 Days': 0.5,
 '16bit Trader': 4.45,
 '1701 A.D. Sunken Dragon': 0.6,
 '18 Wheels of Steel American Long Haul': 5.8,
 '1953 NATO vs Warsaw Pact': 5.9,
 '1Quest': 2.7,
 '3 Stars of Destiny': 4.7,
 '3089 -- Futuristic Action RPG': 0.3,
 '3D Mini Golf': 1.0,
 '3DMark': 13.377777777777778,
 '3DMark 11': 3.0,
 '3DMark Vantage': 21.0,
 '4 Elements': 14.85,
 '404Sight': 0.5285714285714286,
 '500 Years Act 1': 3.0,
 '60 Seconds!': 1.86,
 '7 Days to Die': 41.586734693877546,
 '8BitBoy': 3.4,
 '8BitMMO': 0.4555555555555555,
 '9 Clues 2 The Ward': 5.1,
 '9 Clues The Secret of Serpent Creek': 2.8,

In [9]:
#To get the avg time played for a game
avg_dict['Dota 2']

202.78549886387108

In [49]:
#Select only the rows where players played more than average
played_games_sup_avg = played_games_orig.loc[played_games_orig['hours_played'] > played_games_orig['game'].astype(str).map(avg_dict)]
played_games_sup_avg.sort_values(by='id')

Unnamed: 0,id,game,hours_played
65423,5250,Cities Skylines,144.0
65425,5250,Deus Ex Human Revolution,62.0
55918,76767,Total War ATTILA,207.0
55920,76767,Call of Duty Modern Warfare 2 - Multiplayer,165.0
55922,76767,Call of Duty Modern Warfare 2,65.0
...,...,...,...
150248,307631446,MEDIEVAL Total War - Gold Edition,14.2
177078,307688442,Hurtworld,65.0
96358,307701164,Emily is Away,0.9
117507,309255941,Mitos.is The Game,5.7


In [54]:
g = played_games_sup_avg.drop('hours_played', axis=1)
g = g.groupby('id')['game'].apply(list).reset_index(name='games')
print(g)
te2 = TransactionEncoder()
te_ary2 = te2.fit(g['games']).transform(g['games'])
df2 = pd.DataFrame(te_ary2, columns=te2.columns_)
fis2 = fpgrowth(df2, min_support=0.0005, use_colnames=True)
fis2 = fis2.sort_values(by=['support'], ascending=False)
fis2

             id                                              games
0          5250        [Cities Skylines, Deus Ex Human Revolution]
1         76767  [Counter-Strike, Call of Duty World at War, To...
2         86540  [The Elder Scrolls V Skyrim, Audiosurf, Killer...
3        229911  [Counter-Strike Condition Zero, Call of Duty M...
4        298950  [Team Fortress 2, Counter-Strike Global Offens...
...         ...                                                ...
4237  307631446                [MEDIEVAL Total War - Gold Edition]
4238  307688442                                        [Hurtworld]
4239  307701164                                    [Emily is Away]
4240  309255941                                [Mitos.is The Game]
4241  309554670                                [Mitos.is The Game]

[4242 rows x 2 columns]


Unnamed: 0,support,itemsets
140,0.202263,(Dota 2)
12,0.093352,(Counter-Strike Global Offensive)
13,0.081094,(Team Fortress 2)
617,0.059642,(Unturned)
8,0.049033,(The Elder Scrolls V Skyrim)
...,...,...
8520,0.000707,"(BioShock 2, BioShock Infinite)"
8521,0.000707,"(Saints Row IV, BioShock 2)"
8522,0.000707,"(Saints Row The Third, BioShock 2)"
8524,0.000707,"(Team Fortress 2, Borderlands 2, BioShock 2)"


In [18]:
ar2 = association_rules(fis2, metric="confidence", min_threshold=0.2)
ar2 = ar2.sort_values(by=['confidence'], ascending=False)
ar2 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
39235,"(The Elder Scrolls V Skyrim, The Witcher 3 Wil...","(Divinity Original Sin, Pillars of Eternity, F...",0.000707,0.000707,0.000707,1.0,1414.000000,0.000707,inf
77382,"(Left 4 Dead 2, Starbound, Orcs Must Die! 2)",(Terraria),0.000707,0.031825,0.000707,1.0,31.422222,0.000685,inf
55279,"(BioShock Infinite, Dishonored, The Witcher 3 ...","(Fallout 4, The Witcher 2 Assassins of Kings E...",0.000707,0.003300,0.000707,1.0,303.000000,0.000705,inf
55280,"(Fallout 4, Dishonored, The Witcher 3 Wild Hun...",(The Witcher 2 Assassins of Kings Enhanced Edi...,0.000707,0.002122,0.000707,1.0,471.333333,0.000706,inf
22746,"(Wargame AirLand Battle, Empire Total War)",(Europa Universalis IV),0.000707,0.003065,0.000707,1.0,326.307692,0.000705,inf
...,...,...,...,...,...,...,...,...,...
23908,"(Call of Duty Modern Warfare 2 - Multiplayer, ...","(Team Fortress 2, Aliens vs. Predator)",0.003536,0.001179,0.000707,0.2,169.680000,0.000703,1.248527
23927,(Far Cry 3 Blood Dragon),"(Sleeping Dogs, Psychonauts, BioShock)",0.003536,0.000943,0.000707,0.2,212.100000,0.000704,1.248821
23939,(Far Cry 3 Blood Dragon),"(Sleeping Dogs, Psychonauts, Deus Ex Human Rev...",0.003536,0.000707,0.000707,0.2,282.800000,0.000705,1.249116
23969,(Far Cry 3 Blood Dragon),"(Sleeping Dogs, BioShock, Psychonauts, Deus Ex...",0.003536,0.000707,0.000707,0.2,282.800000,0.000705,1.249116


In [19]:
librairy = ['Cities Skylines']
print(librairy)
print(recommendation_ar(librairy, fis2, ar2))

for i in range (10):
    librairy = g.iloc[i][1]
    print(librairy)
    print(recommendation_ar(librairy, fis2, ar2))
    print('\n')

['Cities Skylines']
['Prison Architect']
['Cities Skylines', 'Deus Ex Human Revolution']
['Half-Life 2', 'Prison Architect', 'The Elder Scrolls V Skyrim', 'Team Fortress 2', 'Dishonored', 'Metro 2033', 'Fallout New Vegas']


['Counter-Strike', 'Call of Duty World at War', 'Total War ATTILA', 'Call of Duty Modern Warfare 2 - Multiplayer', 'Call of Duty Modern Warfare 2', 'Call of Duty Black Ops']
['Dota 2', 'Empire Total War', 'DayZ', 'The Elder Scrolls V Skyrim', 'Alien Swarm', 'Portal 2', 'Napoleon Total War', 'Call of Duty Modern Warfare 3', 'Call of Duty Modern Warfare 3 - Multiplayer', 'Rising Storm/Red Orchestra 2 Multiplayer', 'Mount & Blade Warband', 'Aliens vs. Predator', 'Portal', 'Fallout 4', 'Call of Duty Black Ops - Multiplayer', 'Crusader Kings II', 'Company of Heroes 2', 'Total War ROME II - Emperor Edition', 'Total War SHOGUN 2', 'PAYDAY 2', 'Metro 2033', 'Counter-Strike Global Offensive', 'RAGE', 'Team Fortress 2']


['The Elder Scrolls V Skyrim', 'Audiosurf', 'Killer i

Let's evaluate our model

In [1]:
geval = g
print(geval)
geval = geval.sample(frac=1, random_state=42)
train_nb = int(geval.shape[0] / 100 * 85)
print(train_nb)
X_train = geval[:train_nb]
X_test = geval[train_nb:]
print(X_train)
X_test

NameError: name 'g' is not defined

In [87]:
te3 = TransactionEncoder()
te_ary3 = te3.fit(X_train['games']).transform(X_train['games'])
dftrain = pd.DataFrame(te_ary3, columns=te3.columns_)
fistrain = fpgrowth(dftrain, min_support=0.0005, use_colnames=True)
fistrain = fistrain.sort_values(by=['support'], ascending=False)
fistrain

Unnamed: 0,support,itemsets
3,0.206012,(Dota 2)
4,0.091954,(Counter-Strike Global Offensive)
61,0.078691,(Team Fortress 2)
140,0.058650,(Unturned)
8,0.048630,(The Elder Scrolls V Skyrim)
...,...,...
1495593,0.000589,"(Intergalactic Bubbles, BEEP, Skyborn, Squishy..."
1495594,0.000589,"(Intergalactic Bubbles, BEEP, Skyborn, Squishy..."
1495595,0.000589,"(Intergalactic Bubbles, Afterfall InSanity Ext..."
1495596,0.000589,"(Intergalactic Bubbles, BEEP, Skyborn, Squishy..."


In [88]:
artrain = association_rules(fistrain, metric="confidence", min_threshold=0.2)
artrain = artrain.sort_values(by=['antecedent support'], ascending=False)
artrain

: 

: 

In [73]:

X_test_formatted = X_test

def remove_n_random_items(lst, n):
    if(len(lst)>1):
        to_delete = set(sample(range(len(lst)), n))

        return [
            item for index, item in enumerate(lst)
            if not index in to_delete
        ]


X_test_formatted['incomplet_games'] = X_test_formatted['games'].apply(lambda x : remove_n_random_items(x,1))
X_test_formatted = X_test_formatted.dropna()
print(X_test_formatted)



             id                                              games  \
4069  273437868  [FEZ, Child of Light, Another World, You Have ...   
1787  111362598  [Dota 2, Counter-Strike Global Offensive, Arma...   
262    29852387              [Counter-Strike, Assassin's Creed II]   
1634  104159428  [Arma 2 Operation Arrowhead, Orcs Must Die! 2,...   
2198  132180557  [Call of Duty World at War, RollerCoaster Tyco...   
...         ...                                                ...   
769    60217594  [Counter-Strike, Counter-Strike Global Offensi...   
1685  106834867  [Team Fortress 2, Portal 2, Half-Life, Half-Li...   
130    14906179  [Counter-Strike Source, Half-Life 2 Deathmatch...   
3171  188681523  [Counter-Strike Global Offensive, PAYDAY 2, In...   
466    44240866  [Call of Duty Modern Warfare 2 - Multiplayer, ...   

                                        incomplet_games  
4069               [FEZ, Child of Light, Another World]  
1787  [Dota 2, Counter-Strike Global Offens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_formatted['incomplet_games'] = X_test_formatted['games'].apply(lambda x : remove_n_random_items(x,1))


In [85]:
true_pos = 0
false_pos = 0

for i in tqdm(range(X_test_formatted.shape[0])):
    item_removed = set(X_test_formatted.iloc[i][1]).difference(set(X_test_formatted.iloc[i][2]))
    recommended_games = recommendation_ar(X_test_formatted.iloc[i][2],fistrain,artrain)
    if(item_removed.issubset(set(recommended_games))):
        true_pos = true_pos + 1
    else :
        false_pos = false_pos +1

precision = true_pos/(true_pos+false_pos)
print(true_pos)
print(false_pos)
print(precision)

100%|██████████| 114/114 [09:23<00:00,  4.95s/it]

36
78
0.3157894736842105



