In [94]:
import polars as pl
import mlxtend.frequent_patterns as fp
import mlxtend.preprocessing as pp
import pickle
import pandas as pd
from efficient_apriori import apriori as apriori_efficient
import numpy as np
import pyarrow as pa

In [95]:
data = pl.read_csv('data/bgg-26m-reviews.csv')
print(data.head())

shape: (5, 6)
┌─────┬─────────────┬────────┬─────────────────────────────────┬─────┬───────┐
│     ┆ user        ┆ rating ┆ comment                         ┆ ID  ┆ name  │
│ --- ┆ ---         ┆ ---    ┆ ---                             ┆ --- ┆ ---   │
│ i64 ┆ str         ┆ f64    ┆ str                             ┆ i64 ┆ str   │
╞═════╪═════════════╪════════╪═════════════════════════════════╪═════╪═══════╡
│ 0   ┆ sidehacker  ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 1   ┆ Varthlokkur ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 2   ┆ dougthonus  ┆ 10.0   ┆ Currently, this sits on my lis… ┆ 13  ┆ CATAN │
│ 3   ┆ cypar7      ┆ 10.0   ┆ I know it says how many plays,… ┆ 13  ┆ CATAN │
│ 4   ┆ ssmooth     ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
└─────┴─────────────┴────────┴─────────────────────────────────┴─────┴───────┘


## Filtering

In [96]:
data = data.filter(pl.col('rating') >= 8)
print(data.head())

shape: (5, 6)
┌─────┬─────────────┬────────┬─────────────────────────────────┬─────┬───────┐
│     ┆ user        ┆ rating ┆ comment                         ┆ ID  ┆ name  │
│ --- ┆ ---         ┆ ---    ┆ ---                             ┆ --- ┆ ---   │
│ i64 ┆ str         ┆ f64    ┆ str                             ┆ i64 ┆ str   │
╞═════╪═════════════╪════════╪═════════════════════════════════╪═════╪═══════╡
│ 0   ┆ sidehacker  ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 1   ┆ Varthlokkur ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 2   ┆ dougthonus  ┆ 10.0   ┆ Currently, this sits on my lis… ┆ 13  ┆ CATAN │
│ 3   ┆ cypar7      ┆ 10.0   ┆ I know it says how many plays,… ┆ 13  ┆ CATAN │
│ 4   ┆ ssmooth     ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
└─────┴─────────────┴────────┴─────────────────────────────────┴─────┴───────┘


## Grouping into baskets

In [97]:
baskets = (
    data
    .group_by(['user'])
    .agg(
        pl.col('name').alias('games')
    )
)

### Training and test split

In [98]:
baskets_train = baskets.head(int(0.8 * len(baskets)))
baskets_test = baskets.tail(len(baskets) - int(0.8 * len(baskets)))
baskets_test

user,games
str,list[str]
"""Kiperius""","[""Splendor"", ""King of Tokyo"", … ""Black Rose Wars: Rebirth""]"
"""Efoss98""","[""Paleo"", ""Era: Medieval Age""]"
"""aurora1986""","[""Small World"", ""Ticket to Ride"", … ""Barragoon""]"
"""misselainious""","[""Wingspan"", ""Le Havre""]"
"""bigcpwnzj00""","[""Agricola"", ""Splendor"", … ""Fealty""]"
…,…
"""damourr""","[""Carcassonne"", ""The Princes of Florence"", ""Kanaloa""]"
"""giant_karlik""","[""Forbidden Island"", ""Star Wars: The Card Game"", ""Trash Pandas""]"
"""passstab""","[""Go""]"
"""yessit""","[""Love Letter"", ""Codenames"", … ""Quests of Valeria""]"


In [99]:
baskets_test = baskets_test.filter(pl.col('games').list.len() >= 3)
baskets_test = baskets_test.to_pandas()

baskets_test['test'] = baskets_test['games'].apply(lambda x: x[0:int(np.ceil(0.3*len(x)))])
baskets_test['games'] = baskets_test['games'].apply(lambda x: x[int(np.ceil(0.3*len(x))):])

In [100]:
baskets_test_training = baskets_test[['user', 'games']]

In [101]:
baskets_train = baskets_train.to_pandas()
pd.concat([baskets_train, baskets_test_training], ignore_index=True)

Unnamed: 0,user,games
0,servbot,"[Agricola, Pandemic, Puerto Rico, Ticket to Ri..."
1,Geemantra,"[Dune: Imperium, Mansions of Madness: Second E..."
2,vanlag1968,"[Final Girl, The Artemis Project, Alienation]"
3,reverend_lovefist,"[Ticket to Ride: Europe, Love Letter, Codename..."
4,thesponsduke,"[Splendor, Ticket to Ride: Europe, Dominion, S..."
...,...,...
503880,Zudrak,"[Dungeon!, APBA Pro Baseball]"
503881,damourr,"[The Princes of Florence, Kanaloa]"
503882,giant_karlik,"[Star Wars: The Card Game, Trash Pandas]"
503883,yessit,"[Arkham Horror, Mansions of Madness: Second Ed..."


## Encoding

In [102]:
te = pp.TransactionEncoder()
te_ary = te.fit(baskets_train['games']).transform(baskets_train['games'])

In [103]:
baskets_train_encoded = pd.DataFrame(te_ary, columns=te.columns_)

print(f"Shape of the basket dataframe: {baskets_train_encoded.shape}")
print(f"Number of unique items: {len(te.columns_)}")
baskets_train_encoded.head()

Shape of the basket dataframe: (429463, 26959)
Number of unique items: 26959


Unnamed: 0,"""I Would Kill Hitler""","""La Garde recule!""","""Oh My God! There's An Axe In My Head."" The Game of International Diplomacy","""Scratch One Flat Top!""","""Shobai"" All Right","""Tarleton's Quarter!""",#mylife,'65: Squad-Level Combat in the Jungles of Vietnam,'85 Afghanistan: Graveyard of Empires,"'CA' Tactical Naval Warfare in the Pacific, 1941-45",...,異世界ギルドマスターズ (Isekai Guild Masters),白と黒でトリテ (Trick-Taking in Black and White),目撃者たちの夜 (Witness Night),翡翠の商人 (Jade Merchant),落水邸物語 (Origin of Falling Water),蒼天之死 (Death of Heaven),隅田川 (Sumida River),魔女の一撃宅配便 (Witch's Shot Delivery Service),魔界札 (Makaifuda),스플렌더: Pokémon (Splendor: Pokémon)
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [104]:
# Checkpoint
pickle.dump(baskets_train_encoded, open('data/freq_itemsets_baskets_train_encoded.pkl', 'wb'))

In [105]:
baskets_train_encoded = pickle.load(open('data/freq_itemsets_baskets_train_encoded.pkl', 'rb'))

## Getting frequent itemsets

In [106]:
baskets_train_as_tuples = [tuple(row) for row in baskets_train['games']]
itemsets, rules = apriori_efficient(baskets_train_as_tuples, min_support=0.005, min_confidence=0.7)

In [107]:
pickle.dump((itemsets, rules), open('data/freq_itemsets_approach2_0.005_0.8.pkl', 'wb'))

In [108]:
itemsets, rules = pickle.load(open('data/freq_itemsets_approach2_0.005_0.8.pkl', 'rb'))

In [109]:
len_transactions = len(baskets_train_as_tuples)
itemsets_flattened = []
supports = []
for i in itemsets.keys():
    itemsets_flattened.extend([item[0] for item in itemsets[i].items()])
    supports.extend([item[1]/len_transactions for item in itemsets[i].items()])

In [110]:
itemsets_dict = {
    'support': supports,
    'itemsets': itemsets_flattened
}
itemsets_df = pd.DataFrame(itemsets_dict)
itemsets_df.head()

Unnamed: 0,support,itemsets
0,0.086005,"(Agricola,)"
1,0.119107,"(Pandemic,)"
2,0.085069,"(Puerto Rico,)"
3,0.070896,"(Ticket to Ride: Europe,)"
4,0.075152,"(Power Grid,)"


## Association rules

In [111]:
rules = fp.association_rules(itemsets_df, metric="confidence", min_threshold=0.6)
rules.tail()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
15705,"(Ark Nova, Scythe, Wingspan, Lost Ruins of Arn...",(Terraforming Mars),0.006422,0.147759,0.005288,0.823423,5.572744,1.0,0.004339,4.826449,0.825859,0.035515,0.792808,0.429605
15706,"(Ark Nova, Scythe, Wingspan, Terraforming Mars...",(Lost Ruins of Arnak),0.007842,0.062629,0.005288,0.674287,10.766312,1.0,0.004797,2.877907,0.914288,0.081125,0.652525,0.37936
15707,"(Ark Nova, Scythe, Lost Ruins of Arnak, Terraf...",(Wingspan),0.007433,0.125615,0.005288,0.711466,5.663863,1.0,0.004354,3.030442,0.829608,0.04139,0.670015,0.376782
15708,"(Ark Nova, Wingspan, Lost Ruins of Arnak, Terr...",(Scythe),0.007835,0.113744,0.005288,0.674889,5.93338,1.0,0.004397,2.726006,0.838028,0.045472,0.633163,0.360689
15709,"(Scythe, Wingspan, Lost Ruins of Arnak, Terraf...",(Ark Nova),0.006955,0.074388,0.005288,0.760295,10.220628,1.0,0.004771,3.861456,0.908477,0.069528,0.74103,0.415691


## Recommendation function

In [112]:
def recommender_association(rules_df, product_list, N=1):
    candidate_rules = rules_df[rules_df['antecedents'].apply(lambda x: x.issubset(product_list))]
    candidate_rules = candidate_rules.sort_values("confidence")
    recommendation_list = []
    for i in range(len(candidate_rules)):
        for item in candidate_rules.iloc[i]['consequents']:
            if item not in product_list and item not in recommendation_list:
                recommendation_list.append(item)
            if len(recommendation_list) >= N:
                break
        if len(recommendation_list) >= N:
            break

    return recommendation_list

## Evaluation

In [113]:
## TODO: Evaluation