In [1]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831126 sha256=dd1355c3d5d48f7f684b4ebafed925ae468e7c7859fdc5b6296ccd54c1290399
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning for the rest of the session.
warnings.simplefilter(action="ignore")

# One seed for NumPy's global RNG; the same value is passed to the model later on.
seed = 2354
np.random.seed(seed)

# Display settings: never truncate columns, show up to 200 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [3]:
# Single place to change when the dataset lives somewhere else.
DATA_DIR = "/kaggle/input/game-recommendations-on-steam"

# Game catalogue, user table, per-game metadata (JSON lines) and the user-game interaction log.
games_df = pd.read_csv(f"{DATA_DIR}/games.csv")
users_df = pd.read_csv(f"{DATA_DIR}/users.csv")
games_meta_df = pd.read_json(f"{DATA_DIR}/games_metadata.json", lines=True)
iteractions_df = pd.read_csv(f"{DATA_DIR}/recommendations.csv")

In [4]:
# Peek at the first rows of the per-game metadata (app_id, description, tags).
games_meta_df.head()

Unnamed: 0,app_id,description,tags
0,13500,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,,[Action]
2,113020,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."


# Предобработка

In [5]:
# Number of interaction rows per app_id, most frequent apps first.
iteractions_df["app_id"].value_counts()

app_id
440        319492
252490     270684
1091500    226414
730        219737
570        216914
            ...  
1814870         1
1136110         1
1771540         1
2094300         1
498940          1
Name: count, Length: 37610, dtype: int64

In [6]:
# Summary statistics of the per-app interaction counts; used to pick the popularity cut-offs below.
iteractions_df["app_id"].value_counts().describe()

count     37610.000000
mean       1094.251369
std        7689.340463
min           1.000000
25%          13.000000
50%          39.000000
75%         179.750000
max      319492.000000
Name: count, dtype: float64

In [8]:
# Interaction count per app.
temp = iteractions_df["app_id"].value_counts()
# Keep apps with between 40 and 5000 interactions (both bounds inclusive).
filt_index = temp[temp.between(40, 5000)].index

In [10]:
# Drop interactions whose app is outside the selected popularity band.
iteractions_df = iteractions_df.loc[iteractions_df["app_id"].isin(filt_index)]

In [11]:
# Parse the date column. assign() builds a new frame instead of writing into the
# filtered slice, which avoids the chained-assignment pitfall.
iteractions_df = iteractions_df.assign(date=pd.to_datetime(iteractions_df["date"]))

In [12]:
# Latest interaction date. Series.max() is vectorised and skips NaT, unlike the
# builtin max(), which iterates the Series element by element.
max_date = iteractions_df["date"].max()
max_date

Timestamp('2022-12-31 00:00:00')

In [13]:
# Time-based hold-out: the last 30 days (inclusive of the boundary) form the global test set.
global_train = iteractions_df.loc[iteractions_df["date"] < max_date - pd.Timedelta(days=30)]
global_test = iteractions_df.loc[iteractions_df["date"] >= max_date - pd.Timedelta(days=30)]

In [14]:
# Date at the 80% quantile of the training interactions, snapped to an existing date.
lfm_train_threshold = global_train["date"].quantile(0.8, interpolation="nearest")
lfm_train_threshold

Timestamp('2022-01-07 00:00:00')

In [15]:
# Earlier part trains LightFM; the later part (threshold date included) is what it predicts for.
lfm_train = global_train.loc[global_train["date"] < lfm_train_threshold]
lfm_pred = global_train.loc[global_train["date"] >= lfm_train_threshold]

In [16]:
# Keep only users that also appear in the LightFM training part.
lfm_pred = lfm_pred.loc[lfm_pred["user_id"].isin(lfm_train["user_id"].unique())]

In [17]:
from lightfm.data import Dataset
from lightfm import LightFM

In [18]:
# Register every training user and app so LightFM can assign them internal indices.
dataset = Dataset()
dataset.fit(
    users=lfm_train["user_id"].unique(),
    items=lfm_train["app_id"].unique(),
)

In [19]:
# Feed (user_id, app_id, is_recommended) triples to LightFM; the third element is the weight.
iteractions_matrix, weights_matrix = dataset.build_interactions(
    lfm_train[["user_id", "app_id", "is_recommended"]].itertuples(index=False, name=None)
)
# CSR copy of the weights matrix, used for training below.
weights_matrix_csr = weights_matrix.tocsr()

In [20]:
# Per the LightFM docs, Dataset.mapping() returns
# (user id map, user feature map, item id map, item feature map);
# only the two id maps (positions 0 and 2) are kept.
lightfm_mapping = dataset.mapping()
lightfm_mapping = dict(
    users_mapping=lightfm_mapping[0],
    apps_mapping=lightfm_mapping[2],
)

In [21]:
# Reverse lookups: internal index -> original id.
lightfm_mapping["users_inv_mapping"] = dict(
    zip(lightfm_mapping["users_mapping"].values(), lightfm_mapping["users_mapping"].keys())
)
lightfm_mapping["apps_inv_mapping"] = dict(
    zip(lightfm_mapping["apps_mapping"].values(), lightfm_mapping["apps_mapping"].keys())
)

In [22]:
# Number of users, then number of apps known to the model (one per line).
print(
    len(lightfm_mapping["users_inv_mapping"]),
    len(lightfm_mapping["apps_inv_mapping"]),
    sep="\n",
)

2855118
15659


In [23]:
# WARP-loss LightFM model with 32 latent components, seeded with the notebook-wide seed.
lfm_model = LightFM(
    no_components=32,
    learning_rate=0.1,
    loss="warp",
    max_sampled=5,
    random_state=seed,
)

In [24]:
from tqdm import tqdm

In [25]:
num_epochs = 10

# One fit_partial call per epoch so tqdm can show per-epoch progress.
for _ in tqdm(range(num_epochs)):
    lfm_model.fit_partial(weights_matrix_csr, epochs=1)

100%|██████████| 10/10 [01:50<00:00, 11.07s/it]


In [26]:
# One row per distinct user in the prediction window, in order of first appearance.
candidates = pd.DataFrame(data={"user_id": pd.unique(lfm_pred["user_id"])})

In [27]:
# Quick look at the first candidate users (only the user_id column exists at this point).
candidates.head()

Unnamed: 0,user_id
0,11391224
1,11327355
2,3156894
3,5357060
4,13099571


In [28]:
def generate_lightfm_recs_mapper(model, item_ids, known_items, N, 
                                 user_mapping, item_inv_mapping,
                                 user_features=None, item_features=None, 
                                 num_threads=1):
    """Build a function that turns an external user id into its top-N item ids.

    Parameters
    ----------
    model : object exposing ``predict(user_id, item_ids, user_features=...,
        item_features=..., num_threads=...)`` that returns one score per
        entry of ``item_ids``.
    item_ids : sequence of internal item indices to score for every user.
    known_items : mapping from *internal* user index (the value found in
        ``user_mapping``) to a collection of *external* item ids that must be
        excluded from that user's recommendations. May be empty.
    N : number of recommendations to return per user.
    user_mapping : mapping external user id -> internal user index.
    item_inv_mapping : mapping internal item index -> external item id.
    user_features, item_features, num_threads : forwarded to ``model.predict``.

    Returns
    -------
    callable
        ``f(user) -> list`` of at most ``N`` external item ids, ordered from the
        highest to the lowest score. Raises ``KeyError`` for a user that is
        missing from ``user_mapping``.
    """
    # Convert once here instead of on every call of the mapper.
    item_ids = np.asarray(item_ids, dtype=np.int32)

    def _recs_mapper(user):
        user_id = user_mapping[user]
        recs = np.asarray(
            model.predict(user_id, item_ids, user_features=user_features,
                          item_features=item_features, num_threads=num_threads)
        )

        # Over-fetch by the number of known items so that N remain after filtering,
        # but never ask for more items than were scored.
        additional_N = len(known_items[user_id]) if user_id in known_items else 0
        total_N = min(N + additional_N, len(recs))
        if total_N <= 0:
            return []

        # kth = -1 .. -total_N puts each of the last total_N slots in its sorted
        # position, so the tail is the exact top-total_N in ascending order.
        top_cols = np.argpartition(recs, -np.arange(1, total_N + 1))[-total_N:][::-1]

        # top_cols are positions in the score vector; translate them back through
        # item_ids before looking up the external id.
        final_recs = [item_inv_mapping[item] for item in item_ids[top_cols].tolist()]
        if additional_N > 0:
            filter_items = known_items[user_id]
            final_recs = [item for item in final_recs if item not in filter_items]
        return final_recs[:N]
    return _recs_mapper

In [29]:
# Recommendations per user, and the internal indices of every app to score.
top_N = 10
all_cols = [*lightfm_mapping["apps_mapping"].values()]

In [30]:
# Recommender over all apps; no known items are filtered out (empty dict).
mapper = generate_lightfm_recs_mapper(
    model=lfm_model,
    item_ids=all_cols,
    known_items={},
    N=top_N,
    user_mapping=lightfm_mapping["users_mapping"],
    item_inv_mapping=lightfm_mapping["apps_inv_mapping"],
)

In [31]:
# (rows, columns): how many users will be scored.
candidates.shape

(400006, 1)

In [32]:
# Run the mapper once per candidate user and store the resulting list of
# recommended app ids in a new "app_id" column (one list per row).
candidates["app_id"] = candidates["user_id"].map(mapper)

In [33]:
# Check the result: each user now carries a list of recommended app ids.
candidates.head()

Unnamed: 0,user_id,app_id
0,11391224,"[692890, 1209490, 1271700, 594330, 985830, 342..."
1,11327355,"[1147690, 468920, 1123770, 946050, 1058830, 13..."
2,3156894,"[223510, 41500, 22180, 46510, 217920, 204030, ..."
3,5357060,"[356570, 496300, 396750, 474750, 275390, 90027..."
4,13099571,"[718590, 1203630, 460810, 421170, 710920, 8383..."
