Add next-basket evaluation method (#545)

PreferredAI · Nov 27, 2023 · 4af34f2 · 4af34f2
1 parent 2bdae6d
commit 4af34f2
Show file tree

Hide file tree

Showing 25 changed files with 1,276 additions and 95 deletions.
diff --git a/README.md b/README.md
@@ -191,6 +191,7 @@ The recommender models supported by Cornac are listed below. Why don't you join
 |      | [Bayesian Personalized Ranking (BPR)](cornac/models/bpr), [paper](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf) | N/A | [bpr_netflix.py](examples/bpr_netflix.py)
 |      | [Factorization Machines (FM)](cornac/models/fm), [paper](https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf) | Linux only | [fm_example.py](examples/fm_example.py)
 |      | [Global Average (GlobalAvg)](cornac/models/global_avg), [paper](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) | N/A | [biased_mf.py](examples/biased_mf.py)
+|      | [Global Personalized Top Frequent (GPTop)](cornac/models/gp_top), [paper](https://dl.acm.org/doi/pdf/10.1145/3587153) | N/A | [gp_top_tafeng.py](examples/gp_top_tafeng.py)
 |      | [Item K-Nearest-Neighbors (ItemKNN)](cornac/models/knn), [paper](https://dl.acm.org/doi/pdf/10.1145/371920.372071) | N/A | [knn_movielens.py](examples/knn_movielens.py)
 |      | [Matrix Factorization (MF)](cornac/models/mf), [paper](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) | N/A | [biased_mf.py](examples/biased_mf.py), [given_data.py](examples/given_data.py)
 |      | [Maximum Margin Matrix Factorization (MMMF)](cornac/models/mmmf), [paper](https://link.springer.com/content/pdf/10.1007/s10994-008-5073-7.pdf) | N/A | [mmmf_exp.py](examples/mmmf_exp.py)

diff --git a/cornac/data/__init__.py b/cornac/data/__init__.py
@@ -21,12 +21,14 @@
 from .sentiment import SentimentModality
 from .reader import Reader
 from .dataset import Dataset
+from .dataset import BasketDataset
 
 __all__ = ['FeatureModality',
            'TextModality',
            'ReviewModality',
            'ImageModality',
            'GraphModality',
            'SentimentModality',
+           'BasketDataset',
            'Dataset',
            'Reader']
diff --git a/cornac/data/dataset.py b/cornac/data/dataset.py
@@ -13,16 +13,13 @@
 # limitations under the License.
 # ============================================================================
 
-from collections import OrderedDict, defaultdict
-import itertools
 import warnings
+from collections import Counter, OrderedDict, defaultdict
 
 import numpy as np
-from scipy.sparse import csr_matrix, csc_matrix, dok_matrix
+from scipy.sparse import csc_matrix, csr_matrix, dok_matrix
 
-from ..utils import get_rng
-from ..utils import validate_format
-from ..utils import estimate_batches
+from ..utils import estimate_batches, get_rng, validate_format
 
 
 class Dataset(object):
@@ -565,3 +562,354 @@ def add_modalities(self, **kwargs):
         self.item_graph = kwargs.get("item_graph", None)
         self.sentiment = kwargs.get("sentiment", None)
         self.review_text = kwargs.get("review_text", None)
+
+
+class BasketDataset(Dataset):
+    """Training set contains history baskets
+
+    Parameters
+    ----------
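+    num_users: int, required
+        Number of users.
+
+    num_baskets: int, required
+        Number of baskets.
+
+    num_items: int, required
+        Number of items.
+
+    uid_map: :obj:`OrderedDict`, required
+        The dictionary containing mapping from user original ids to mapped integer indices.
+
+    bid_map: :obj:`OrderedDict`, required
+        The dictionary containing mapping from basket original ids to mapped integer indices.
+
+    iid_map: :obj:`OrderedDict`, required
+        The dictionary containing mapping from item original ids to mapped integer indices.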
+
+    uir_tuple: tuple, required
+        Tuple of 3 numpy arrays (user_indices, item_indices, rating_values).
+
+    basket_ids: numpy.array, required
+        Array of basket indices corresponding to observation in `uir_tuple`.
+
+    timestamps: numpy.array, optional, default: None
+        Numpy array of timestamps corresponding to feedback in `uir_tuple`.
+        This is only available when input data is in `UBIT` and `UBITJson` formats.
+
+    extra_data: numpy.array, optional, default: None
+        Array of json object corresponding to observations in `uir_tuple`.
+
+    seed: int, optional, default: None
+        Random seed for reproducing data sampling.
+
+    Attributes
+    ----------
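+    basket_ids: numpy.array
+        Array of basket indices corresponding to observations in `uir_tuple`.
+
+    max_basket_size, min_basket_size, avg_basket_size: number
+        Largest, smallest and mean number of observations per basket.
+
+    timestamps: numpy.array
+        Numpy array of timestamps corresponding to feedback in `uir_tuple`.
+        This is only available when input data is in `UBIT` and `UBITJson` formats.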
+    """
+
+    def __init__(
+        self,
+        num_users,
+        num_baskets,
+        num_items,
+        uid_map,
+        bid_map,
+        iid_map,
+        uir_tuple,
+        basket_ids=None,
+        timestamps=None,
+        extra_data=None,
+        seed=None,
+    ):
+        """Store basket information and compute basket-size statistics.
+
+        User/item related arguments (`num_users`, `num_items`, `uid_map`,
+        `iid_map`, `uir_tuple`, `timestamps`, `seed`) are forwarded to the
+        parent constructor. `num_baskets`, `bid_map`, `basket_ids` and
+        `extra_data` are kept on the instance as given.
+
+        The size of a basket is the number of entries of `basket_ids`
+        carrying its index. When `basket_ids` is None or empty there is no
+        basket to measure, and the three size statistics are set to 0.
+        """
+        super().__init__(
+            num_users=num_users,
+            num_items=num_items,
+            uid_map=uid_map,
+            iid_map=iid_map,
+            uir_tuple=uir_tuple,
+            timestamps=timestamps,
+            seed=seed,
+        )
+        self.num_baskets = num_baskets
+        self.bid_map = bid_map
+        self.basket_ids = basket_ids
+        self.extra_data = extra_data
+
+        if basket_ids is None or len(basket_ids) == 0:
+            # `Counter(None)` raises TypeError and `np.max([])` raises
+            # ValueError, so the no-basket case is handled explicitly.
+            self.max_basket_size = 0
+            self.min_basket_size = 0
+            self.avg_basket_size = 0.0
+        else:
+            basket_sizes = list(Counter(basket_ids).values())
+            self.max_basket_size = np.max(basket_sizes)
+            self.min_basket_size = np.min(basket_sizes)
+            self.avg_basket_size = np.mean(basket_sizes)
+
+        # Caches filled on first access of the corresponding properties.
+        self.__baskets = None
+        self.__user_basket_data = None
+        self.__chrono_user_basket_data = None
+
+    @property
+    def baskets(self):
+        """A dictionary to store indices where basket ID appears in the data.
+
+        Keys are the values found in `basket_ids`, in order of first
+        appearance; each value is the list of positions in `basket_ids`
+        holding that key. The dictionary is built on first access and the
+        same object is returned afterwards.
+        """
+        if self.__baskets is None:
+            self.__baskets = OrderedDict()
+            for idx, bid in enumerate(self.basket_ids):
+                self.__baskets.setdefault(bid, [])
+                self.__baskets[bid].append(idx)
+        return self.__baskets
+
+    @property
+    def user_basket_data(self):
+        """Data organized by user. A dictionary where keys are users,
+        values are list of baskets purchased by corresponding users.
+
+        Built from `baskets` on first access and cached afterwards.
+        """
+        if self.__user_basket_data is None:
+            # NOTE(review): `defaultdict()` is created without a default
+            # factory, so looking up a missing user still raises KeyError.
+            self.__user_basket_data = defaultdict()
+            for bid, ids in self.baskets.items():
+                # The user is read from the first observation of the basket
+                # only; presumably all observations of a basket share one
+                # user -- confirm against the data contract.
+                u = self.uir_tuple[0][ids[0]]
+                self.__user_basket_data.setdefault(u, [])
+                self.__user_basket_data[u].append(bid)
+        return self.__user_basket_data
+
+    @property
+    def chrono_user_basket_data(self):
+        """Data organized by user sorted chronologically (timestamps required).
+        A dictionary where keys are users, values are tuples of two lists
+        (baskets, timestamps) of the corresponding users, both ordered by
+        ascending basket timestamp.
+
+        The timestamp of a basket is the timestamp of its first observation.
+        The result is built on first access and cached afterwards.
+        """
+        if self.__chrono_user_basket_data is None:
+            # NOTE(review): `assert` is removed when Python runs with -O, so
+            # this check does not guard against missing timestamps there.
+            assert self.timestamps is not None  # we need timestamps
+
+            basket_timestamps = [
+                self.timestamps[ids[0]] for ids in self.baskets.values()
+            ]  # one-off
+
+            # NOTE(review): the lambda factory means a lookup of an unknown
+            # user inserts an empty ([], []) entry instead of raising.
+            self.__chrono_user_basket_data = defaultdict(lambda: ([], []))
+            for (bid, ids), t in zip(self.baskets.items(), basket_timestamps):
+                u = self.uir_tuple[0][ids[0]]
+                self.__chrono_user_basket_data[u][0].append(bid)
+                self.__chrono_user_basket_data[u][1].append(t)
+
+            # sorting based on timestamps
+            # Only values of existing keys are replaced, so the dictionary
+            # does not change size while it is being iterated.
+            for user, (baskets, timestamps) in self.__chrono_user_basket_data.items():
+                sorted_idx = np.argsort(timestamps)
+                sorted_baskets = [baskets[i] for i in sorted_idx]
+                sorted_timestamps = [timestamps[i] for i in sorted_idx]
+                self.__chrono_user_basket_data[user] = (
+                    sorted_baskets,
+                    sorted_timestamps,
+                )
+
+        return self.__chrono_user_basket_data
+
+    @classmethod
+    def build(
+        cls,
+        data,
+        fmt="UBI",
+        global_uid_map=None,
+        global_bid_map=None,
+        global_iid_map=None,
+        seed=None,
+        exclude_unknowns=False,
+    ):
+        """Constructing BasketDataset from given data of specific format.
+
+        Parameters
+        ----------
+        data: iterable, required
+            Observations in the form of tuples (user, basket, item) for UBI format,
+            (user, basket, item, timestamp) for UBIT format, or
+            (user, basket, item, timestamp, json) for UBITJson format.
+            The data is traversed once, so any iterable is accepted.
+
+        fmt: str, default: 'UBI'
+            Format of the input data. Currently, we are supporting:
+
+            'UBI': User, Basket_ID, Item
+            'UBIT': User, Basket_ID, Item, Timestamp
+            'UBITJson': User, Basket_ID, Item, Timestamp, Extra data in Json format
+
+        global_uid_map: :obj:`OrderedDict`, optional, default: None
+            The dictionary containing global mapping from original ids to mapped ids of users.
+            It is updated in place with newly seen users.
+
+        global_bid_map: :obj:`OrderedDict`, optional, default: None
+            The dictionary containing global mapping from original ids to mapped ids of baskets.
+            It is updated in place with newly seen baskets.
+
+        global_iid_map: :obj:`OrderedDict`, optional, default: None
+            The dictionary containing global mapping from original ids to mapped ids of items.
+            It is updated in place with newly seen items.
+
+        seed: int, optional, default: None
+            Random seed for reproducing data sampling.
+
+        exclude_unknowns: bool, default: False
+            If True, skip observations whose item is not already present in
+            `global_iid_map`. Users and baskets are not filtered.
+
+        Returns
+        -------
+        res: :obj:`<cornac.data.BasketDataset>`
+            BasketDataset object.
+
+        """
+        fmt = validate_format(fmt, ["UBI", "UBIT", "UBITJson"])
+
+        if global_uid_map is None:
+            global_uid_map = OrderedDict()
+        if global_bid_map is None:
+            global_bid_map = OrderedDict()
+        if global_iid_map is None:
+            global_iid_map = OrderedDict()
+
+        has_timestamps = fmt in ["UBIT", "UBITJson"]
+        has_extra_data = fmt == "UBITJson"
+
+        u_indices = []
+        b_indices = []
+        i_indices = []
+        kept_timestamps = []
+        kept_extra_data = []
+        # Timestamps and extra data are collected in the same pass as the
+        # indices, so `data` does not need to support random access.
+        for uid, bid, iid, *rest in data:
+            if exclude_unknowns and (iid not in global_iid_map):
+                continue
+
+            global_uid_map.setdefault(uid, len(global_uid_map))
+            global_bid_map.setdefault(bid, len(global_bid_map))
+            global_iid_map.setdefault(iid, len(global_iid_map))
+
+            u_indices.append(global_uid_map[uid])
+            b_indices.append(global_bid_map[bid])
+            i_indices.append(global_iid_map[iid])
+            if has_timestamps:
+                kept_timestamps.append(int(rest[0]))
+            if has_extra_data:
+                kept_extra_data.append(rest[1])
+
+        # Every kept observation gets an implicit rating of 1.0.
+        uir_tuple = (
+            np.asarray(u_indices, dtype="int"),
+            np.asarray(i_indices, dtype="int"),
+            np.ones(len(u_indices), dtype="float"),
+        )
+
+        basket_ids = np.asarray(b_indices, dtype="int")
+
+        timestamps = (
+            np.asarray(kept_timestamps, dtype="int") if has_timestamps else None
+        )
+
+        extra_data = kept_extra_data if has_extra_data else None
+
+        dataset = cls(
+            num_users=len(global_uid_map),
+            num_baskets=len(global_bid_map),
+            num_items=len(global_iid_map),
+            uid_map=global_uid_map,
+            bid_map=global_bid_map,
+            iid_map=global_iid_map,
+            uir_tuple=uir_tuple,
+            basket_ids=basket_ids,
+            timestamps=timestamps,
+            extra_data=extra_data,
+            seed=seed,
+        )
+
+        return dataset
+
+    @classmethod
+    def from_ubi(cls, data, seed=None):
+        """Constructing BasketDataset from UBI (User, Basket, Item) triples data.
+
+        Shortcut for `build` with `fmt="UBI"`.
+
+        Parameters
+        ----------
+        data: list
+            Data in the form of tuples (user, basket, item).
+
+        seed: int, optional, default: None
+            Random seed for reproducing data sampling.
+
+        Returns
+        -------
+        res: :obj:`<cornac.data.BasketDataset>`
+            BasketDataset object.
+
+        """
+        return cls.build(data, fmt="UBI", seed=seed)
+
+    @classmethod
+    def from_ubit(cls, data, seed=None):
+        """Constructing BasketDataset from UBIT format (User, Basket, Item, Timestamp)
+
+        Shortcut for `build` with `fmt="UBIT"`.
+
+        Parameters
+        ----------
+        data: list
+            Data in the form of quadruples (user, basket, item, timestamp)
+
+        seed: int, optional, default: None
+            Random seed for reproducing data sampling.
+
+        Returns
+        -------
+        res: :obj:`<cornac.data.BasketDataset>`
+            BasketDataset object.
+
+        """
+        return cls.build(data, fmt="UBIT", seed=seed)
+
+    @classmethod
+    def from_ubitjson(cls, data, seed=None):
+        """Constructing BasketDataset from UBITJson format (User, Basket, Item, Timestamp, Json)
+
+        Shortcut for `build` with `fmt="UBITJson"`.
+
+        Parameters
+        ----------
+        data: list
+            Data in the form of tuples (user, basket, item, timestamp, json)
+
+        seed: int, optional, default: None
+            Random seed for reproducing data sampling.
+
+        Returns
+        -------
+        res: :obj:`<cornac.data.BasketDataset>`
+            BasketDataset object.
+
+        """
+        return cls.build(data, fmt="UBITJson", seed=seed)
+
+    def num_batches(self, batch_size):
+        """Estimate number of batches per epoch
+
+        The estimate is `estimate_batches` applied to the number of entries
+        in `user_data` and the given `batch_size`.
+
+        NOTE(review): `user_data` is not defined in this class (presumably
+        inherited from `Dataset`), while `user_basket_data_iter` iterates
+        over `user_basket_data` -- confirm both have the same length.
+        """
+        return estimate_batches(len(self.user_data), batch_size)
+
+    def user_basket_data_iter(self, batch_size=1, shuffle=False):
+        """Create an iterator over data yielding batch of user indices and
+        batch of the basket indices belonging to those users
+
+        Parameters
+        ----------
+        batch_size: int, optional, default = 1
+
+        shuffle: bool, optional, default: False
+            If `True`, orders of users will be randomized. If `False`, default orders kept.
+
+        Returns
+        -------
+        iterator : batch of user indices, batch of basket indices of those users.
+            When all users of a batch have the same number of baskets, the
+            second element is a 2-D integer array (users x baskets).
+            Otherwise it is a 1-D object array holding one integer array
+            per user, because the rows cannot form a rectangular array.
+
+        """
+        user_indices = np.asarray(list(self.user_basket_data.keys()), dtype="int")
+        for batch_ids in self.idx_iter(
+            len(user_indices), batch_size=batch_size, shuffle=shuffle
+        ):
+            batch_users = user_indices[batch_ids]
+            rows = [self.user_basket_data[uid] for uid in batch_users]
+            if len({len(row) for row in rows}) <= 1:
+                batch_basket_ids = np.asarray(rows, dtype="int")
+            else:
+                # Users with different numbers of baskets: an integer array
+                # cannot be built from rows of unequal length, so each
+                # user's baskets are stored as a separate array.
+                batch_basket_ids = np.empty(len(rows), dtype=object)
+                for pos, row in enumerate(rows):
+                    batch_basket_ids[pos] = np.asarray(row, dtype="int")
+            yield batch_users, batch_basket_ids
+
+    def basket_iter(self, batch_size=1, shuffle=False):
+        """Create an iterator over data yielding batch of basket indices and batch of baskets
+
+        Parameters
+        ----------
+        batch_size: int, optional, default = 1
+
+        shuffle: bool, optional, default: False
+            If `True`, orders of baskets will be randomized. If `False`, default orders kept.
+
+        Returns
+        -------
+        iterator : batch of basket indices, batch of baskets (list of list).
+            Each basket is the corresponding value of `baskets`, i.e. the
+            list of positions in the data where that basket appears.
+
+        """
+        # Keys and values of `baskets` are materialised once so that batches
+        # can be selected by position.
+        basket_indices = np.array(list(self.baskets.keys()))
+        baskets = list(self.baskets.values())
+        for batch_ids in self.idx_iter(len(basket_indices), batch_size, shuffle):
+            batch_basket_indices = basket_indices[batch_ids]
+            batch_baskets = [baskets[idx] for idx in batch_ids]
+            yield batch_basket_indices, batch_baskets