In [1]:
# Install the notebook's dependencies into the environment of the running kernel.
# %pip is used instead of !pip: the magic always targets the interpreter that
# backs this kernel, whereas !pip runs whichever pip is first on the shell PATH.
%pip install recbole
%pip install ray
# NumPy and SciPy are capped below 2.0 / 1.13 — presumably for compatibility
# with the installed RecBole release; TODO confirm and pin exact versions.
# NOTE(review): if NumPy/SciPy were already imported, restart the kernel after
# this cell so the downgraded versions are the ones actually loaded.
%pip install "numpy<2.0" "scipy<1.13"

Collecting recbole
  Downloading recbole-1.2.1-py3-none-any.whl.metadata (1.4 kB)
Collecting colorlog==4.7.2 (from recbole)
  Downloading colorlog-4.7.2-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting colorama==0.4.4 (from recbole)
  Downloading colorama-0.4.4-py2.py3-none-any.whl.metadata (14 kB)
Collecting thop>=0.1.1.post2207130030 (from recbole)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting texttable>=0.9.0 (from recbole)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
INFO: pip is looking at multiple versions of recbole to determine which version is compatible with other requirements. This could take a while.
Collecting recbole
  Downloading recbole-1.2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading recbole-1.2.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Do

In [1]:
# kmeans_pytorch is installed into the kernel's own environment (%pip, not !pip)
# and pinned to the version this notebook was last run with.
# NOTE(review): nothing in the visible cells imports it directly — presumably it
# is needed by a RecBole model import; confirm before removing.
%pip install kmeans_pytorch==0.3

Collecting kmeans_pytorch
  Using cached kmeans_pytorch-0.3-py3-none-any.whl.metadata (1.6 kB)
Using cached kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Installing collected packages: kmeans_pytorch
Successfully installed kmeans_pytorch-0.3


In [3]:
import os
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
# Single source of truth for every random number generator seeded below.
SEED = 2020


def set_seeds(seed=SEED):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    The seed is also exported as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed; defaults to the notebook-wide ``SEED``.
    """
    seed_text = str(seed)
    os.environ["PYTHONHASHSEED"] = seed_text
    # Same order as before: the standard library generator first, then NumPy.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


set_seeds()
print(f"Ziarno losowości ustawione na: {SEED}")

Ziarno losowości ustawione na: 2020


In [5]:
# Download the Amazon-Book split files published with the LightGCN repository.
!mkdir -p /content/data/amazon-book

BASE="https://raw.githubusercontent.com/kuandeng/LightGCN/master/Data/amazon-book"

# -nc (no-clobber) makes the cell safe to re-run: a file that is already present
# is skipped instead of being downloaded again and saved as "train.txt.1" etc.
# Delete a file manually if a previous download was interrupted.
!wget -nc $BASE/train.txt      -P /content/data/amazon-book
!wget -nc $BASE/test.txt       -P /content/data/amazon-book
!wget -nc $BASE/user_list.txt  -P /content/data/amazon-book
!wget -nc $BASE/item_list.txt  -P /content/data/amazon-book


--2025-11-29 15:15:54--  https://raw.githubusercontent.com/kuandeng/LightGCN/master/Data/amazon-book/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14125691 (13M) [text/plain]
Saving to: ‘/content/data/amazon-book/train.txt’


2025-11-29 15:15:55 (185 MB/s) - ‘/content/data/amazon-book/train.txt’ saved [14125691/14125691]

--2025-11-29 15:15:55--  https://raw.githubusercontent.com/kuandeng/LightGCN/master/Data/amazon-book/test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3848611 (3.7M) [text/plain]
Saving to: ‘/c

In [6]:
def create_amazon_subset_inter_file(data_dir, output_file, max_users=2000, min_interactions=10):
    """Write a RecBole ``.inter`` file with a subset of the ``train.txt`` interactions.

    ``train.txt`` is parsed one line per user: the first whitespace-separated
    integer is the user id and every following integer is an item id. Lines
    that carry no item ids are ignored.

    Users with fewer than ``min_interactions`` interactions are dropped, then
    only the ``max_users`` users with the most interactions are kept (this is a
    "most active users" selection, not a random sample). Ids are written
    unchanged, as ``token`` fields.

    Args:
        data_dir: Directory that contains ``train.txt``.
        output_file: Path of the ``.inter`` file to create. Its parent directory
            is created when missing; a bare file name (no directory part) is
            written to the current working directory.
        max_users: Maximum number of users to keep.
        min_interactions: Minimum number of interactions a user needs to be kept.

    Raises:
        OSError: If ``train.txt`` cannot be read or ``output_file`` cannot be written.
        ValueError: If a token in ``train.txt`` is not an integer.
    """
    train_path = os.path.join(data_dir, "train.txt")

    # 1. Read the file line by line and flatten it into (user, item) pairs.
    pairs = []
    with open(train_path, "r") as f:
        for line in f:
            parts = line.split()
            if len(parts) > 1:
                user = int(parts[0])
                pairs.extend((user, int(item)) for item in parts[1:])

    df = pd.DataFrame(pairs, columns=["user_id", "item_id"])

    # 2. Keep only users that have at least `min_interactions` interactions.
    user_counts = df["user_id"].value_counts()
    valid_users = user_counts[user_counts >= min_interactions].index
    df = df[df["user_id"].isin(valid_users)]

    # 3. Keep the `max_users` most active users among those that remain.
    top_users = df["user_id"].value_counts().head(max_users).index
    df = df[df["user_id"].isin(top_users)]

    # 4. Ids are intentionally not remapped: they are written as `token` fields.

    # 5. Create the target directory if needed. os.path.dirname() returns "" for
    #    a bare file name and os.makedirs("") raises, so only call it when there
    #    is an actual directory component.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Zapisywanie {len(df)} interakcji dla {df['user_id'].nunique()} użytkowników i {df['item_id'].nunique()} przedmiotów...")

    with open(output_file, "w") as f:
        # Header line in RecBole's "<field>:<type>" format.
        f.write("user_id:token\titem_id:token\n")
        # Iterating over plain column lists avoids DataFrame.iterrows(), which
        # builds a Series per row and is very slow on hundreds of thousands of rows.
        f.writelines(
            f"{user}\t{item}\n"
            for user, item in zip(df["user_id"].tolist(), df["item_id"].tolist())
        )

    print("Gotowe!")

In [7]:
# Build the subset used for training: the 5000 most active users among those
# with at least 5 interactions. The output lands in
# /content/dataset/<dataset>/<dataset>.inter so it lines up with the
# "data_path" / "dataset" values in the RecBole config.
create_amazon_subset_inter_file(
    "/content/data/amazon-book",
    "/content/dataset/amazon_sub_rb/amazon_sub_rb.inter",  # corrected path
    min_interactions=5,
    max_users=5000,
)

Zapisywanie 903370 interakcji dla 5000 użytkowników i 87594 przedmiotów...
Gotowe!


In [9]:
from recbole.quick_start import run_recbole

# RecBole settings for training NGCF on the Amazon-Book subset.
config = {
    "model": "NGCF",
    # The dataset name and data_path together point at
    # /content/dataset/amazon_sub_rb/amazon_sub_rb.inter.
    "dataset": "amazon_sub_rb",
    "data_path": "/content/dataset/",
    "field_separator": "\t",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "load_col": {"inter": ["user_id", "item_id"]},

    # Training parameters
    "epochs": 50,
    "train_batch_size": 2048,
    "embedding_size": 64,
    "learning_rate": 0.001,

    # Evaluation metrics, reported at K = 20
    "metrics": ["Recall", "NDCG"],
    "topk": [20],

    # Early stopping on Recall@20 with a patience of 10 validation rounds
    "valid_metric": "Recall@20",
    "valid_metric_bigger": True,
    "stopping_step": 10,

    # NOTE(review): confirm that RecBole actually reads this key.
    "tensorboard": True,

    "eval_args": {
        "split": {"RS": [0.8, 0.1, 0.1]},  # random 80/10/10 train/valid/test split
        "group_by": "user",
        "order": "RO",
        "mode": "full",  # full ranking: score each user against all items
    },

    # NGCF-specific hyperparameters. RecBole's NGCF reads "node_dropout" and
    # "message_dropout" as single floats; the previous "node_dropout_prob" /
    # "mess_dropout_prob" keys were not read by the model, so the intended
    # dropout rates were silently replaced by the model defaults.
    "node_dropout": 0.1,
    "message_dropout": 0.1,
    "reg_weight": 1e-5,
}

# Launch the RecBole run using the settings collected in `config`.
# NOTE(review): this is the long-running cell (roughly 75 s per epoch in the
# captured progress output); the saved run was stopped with KeyboardInterrupt.
run_recbole(config_dict=config)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
  SparseL = torch.sparse.FloatTensor(i, data, torch.Size(L.shape))
  return torch.sparse.FloatTensor(i, val)
  scaler = amp.GradScaler(enabled=self.enable_scaler)
Train     0: 100%|███████████████████████| 356/356 [00:43<00:00,  8.22it/s, GPU RAM: 0.83 G/14.74 G]
Evaluate   : 100%|████████████████████| 5000/5000 [00:31<00:00, 159.20it/s, GPU RAM: 0.83 G/14.74 G]
Train     1: 100%|███████████████████████| 356/356 [00:43<00:00,  8.20it/s, GPU RAM: 1.01 G/14.74 G]
Evaluate   : 100%|████████████████████| 5000/5000 [00:29<00:00, 171.46it/s, GPU RAM: 1.01 G/14.74 G]
Train  

KeyboardInterrupt: 

In [None]:
# Load the TensorBoard notebook extension and open it inline on the
# ./log_tensorboard directory (relative to the current working directory).
# NOTE(review): presumably this is where the RecBole run writes its event
# files — confirm the directory exists after training.
%load_ext tensorboard
%tensorboard --logdir log_tensorboard