Merge pull request #56 from tqtg/master

Improve MatrixTrainSet
PreferredAI · Mar 13, 2019 · b6f39b4 · b6f39b4
2 parents 0721473 + 957dea2
commit b6f39b4
Show file tree

Hide file tree

Showing 23 changed files with 3,734 additions and 3,588 deletions.
diff --git a/.codecov.yml b/.codecov.yml
@@ -11,10 +11,9 @@ coverage:
         # project coverage decrease by more than 1%:
         target: auto
         threshold: 1%
-    patch:
-      default:
-        # Be tolerant on slight code coverage diff on PRs to limit
-        # noisy red coverage status on github PRs.
-        target: auto
-        threshold: 1%
-
+#    patch:
+#      default:
+#        # Be tolerant on slight code coverage diff on PRs to limit
+#        # noisy red coverage status on github PRs.
+#        target: auto
+#        threshold: 1%
diff --git a/cornac/data/graph.py b/cornac/data/graph.py
@@ -19,12 +19,12 @@ def __init__(self, **kwargs):
         self.matrix = None
         self.map_data = []
 
-    def _build_triplet(self, ordered_ids):
-        """Build adjacency matrix in sparse triplet format using maped ids
+    def _build_triplet(self, global_id_map):
+        """Build adjacency matrix in sparse triplet format using mapped ids
         """
 
         for i, j, val in self.raw_data:
-            self.map_data.append([ordered_ids[i], ordered_ids[j], val])
+            self.map_data.append([global_id_map[i], global_id_map[j], val])
         self.map_data = np.asanyarray(self.map_data)
         self.raw_data = None
 
@@ -50,18 +50,17 @@ def get_train_triplet(self, train_row_ids, train_col_ids):
 
         return np.asarray(train_triplet)
 
-    def build(self, ordered_ids):
-        self._build_triplet(ordered_ids)
+    def build(self, global_id_map):
+        self._build_triplet(global_id_map)
         self._build_sparse_matrix(self.map_data)
 
     def batch(self, batch_ids):
-
-        """Collaborative Context Poisson Factorization.
+        """Return batch of vectors from the sparse adjacency matrix corresponding to provided batch_ids.
 
         Parameters
         ----------
         batch_ids: array, required
-            An array conting the ids of rows to be returned from the sparse adjacency matrix.        
+            An array containing the ids of rows to be returned from the sparse adjacency matrix.
         """
 
         return self.matrix[batch_ids]
diff --git a/cornac/data/image.py b/cornac/data/image.py
@@ -28,10 +28,10 @@ def data_image(self):
     def data_image(self, input_image):
         self.__data_image = input_image
 
-    def build(self, ordered_ids):
+    def build(self, global_id_map):
         """Build the model based on provided list of ordered ids
         """
-        super().build(ordered_ids)
+        Module.build(self, global_id_map)
 
     def batch_image(self, batch_ids,
                     target_size=(256, 256),

diff --git a/cornac/data/module.py b/cornac/data/module.py
@@ -37,25 +37,25 @@ def feature_dim(self):
     def feature_dim(self, input_dim):
         self.__feature_dim = input_dim
 
-    def _build_feature(self, ordered_ids):
+    def _build_feature(self, global_id_map):
         """Build data_feature matrix based on provided list of ordered ids
         """
         if self._id_feature is None:
             return
 
-        self.data_feature = np.zeros((len(ordered_ids), self.feature_dim))
-        for map_id, raw_id in enumerate(ordered_ids.keys()):
-            self.data_feature[map_id] = self._id_feature[raw_id]
+        self.data_feature = np.zeros((len(global_id_map), self.feature_dim))
+        for mapped_id, raw_id in enumerate(global_id_map.keys()):
+            self.data_feature[mapped_id] = self._id_feature[raw_id]
         if self._normalized:
             self.data_feature = self.data_feature - np.min(self.data_feature)
             self.data_feature = self.data_feature / (np.max(self.data_feature) + 1e-10)
 
         self._id_feature.clear()
 
-    def build(self, ordered_ids):
+    def build(self, global_id_map):
         """Build the model based on provided list of ordered ids
         """
-        self._build_feature(ordered_ids)
+        self._build_feature(global_id_map)
 
     def batch_feature(self, batch_ids):
         """Return a matrix (batch of feature vectors) corresponding to provided batch_ids

diff --git a/cornac/data/testset.py b/cornac/data/testset.py
@@ -28,7 +28,8 @@ def __init__(self, user_ratings, uid_map, iid_map):
         self._uid_map = uid_map
         self._iid_map = iid_map
 
-    def get_users(self):
+    @property
+    def users(self):
         """Return a list of users"""
         return self._user_ratings.keys()
 
@@ -45,12 +46,12 @@ def get_iid(self, raw_iid):
         return self._iid_map[raw_iid]
 
     @classmethod
-    def from_uir(self, triplet_data, global_uid_map, global_iid_map, global_ui_set, verbose=False):
+    def from_uir(self, data, global_uid_map, global_iid_map, global_ui_set, verbose=False):
         """Constructing TestSet from triplet data.
 
         Parameters
         ----------
-        triplet_data: array-like, shape: [n_examples, 3]
+        data: array-like, shape: [n_examples, 3]
             Data in the form of triplets (user, item, rating)
 
         global_uid_map: :obj:`defaultdict`
@@ -79,7 +80,7 @@ def from_uir(self, triplet_data, global_uid_map, global_iid_map, global_ui_set,
         unk_user_count = 0
         unk_item_count = 0
 
-        for raw_uid, raw_iid, rating in triplet_data:
+        for raw_uid, raw_iid, rating in data:
             if (raw_uid, raw_iid) in global_ui_set:  # duplicate rating
                 continue
             global_ui_set.add((raw_uid, raw_iid))

diff --git a/cornac/data/text.py b/cornac/data/text.py
@@ -15,5 +15,5 @@ class TextModule(Module):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def build(self, ordered_ids):
+    def build(self, global_id_map):
         pass
diff --git a/cornac/data/trainset.py b/cornac/data/trainset.py
@@ -35,6 +35,26 @@ def num_items(self):
         """Return the number of items"""
         return len(self._iid_map)
 
+    @property
+    def uid_list(self):
+        """Return the list of mapped user ids"""
+        return self._uid_map.values()
+
+    @property
+    def raw_uid_list(self):
+        """Return the list of raw user ids"""
+        return self._uid_map.keys()
+
+    @property
+    def iid_list(self):
+        """Return the list of mapped item ids"""
+        return self._iid_map.values()
+
+    @property
+    def raw_iid_list(self):
+        """Return the list of raw item ids"""
+        return self._iid_map.keys()
+
     def is_unk_user(self, mapped_uid):
         """Return whether or not a user is unknown given the mapped id"""
         return mapped_uid >= self.num_users
@@ -51,21 +71,6 @@ def get_iid(self, raw_iid):
         """Return the mapped id of an item given a raw id"""
         return self._iid_map[raw_iid]
 
-    def get_uid_list(self):
-        """Return the list of mapped user ids"""
-        return self._uid_map.values()
-
-    def get_raw_uid_list(self):
-        """Return the list of raw user ids"""
-        return self._uid_map.keys()
-
-    def get_iid_list(self):
-        """Return the list of mapped item ids"""
-        return self._iid_map.values()
-
-    def get_raw_iid_list(self):
-        """Return the list of raw item ids"""
-        return self._iid_map.keys()
 
     @staticmethod
     def idx_iter(idx_range, batch_size=1, shuffle=False):
@@ -133,7 +138,18 @@ def __init__(self, matrix, max_rating, min_rating, global_mean, uid_map, iid_map
     @property
     def uir_tuple(self):
         if not self.__uir_tuple:
-            self.__uir_tuple = find(self.matrix)
+            # the rating matrix is assumed to be in CSR format
+            if not self.matrix.has_sorted_indices:
+                self.matrix.sort_indices()
+
+            num_users, num_items = self.matrix.shape
+
+            # this basically calculates the 'row' attribute of a COO matrix
+            # without requiring us to get the whole COO matrix
+            user_counts = np.ediff1d(self.matrix.indptr)
+            user_ids = np.repeat(np.arange(num_users), user_counts).astype(self.matrix.indices.dtype)
+
+            self.__uir_tuple = (user_ids, self.matrix.indices, self.matrix.data)
         return self.__uir_tuple
 
     @uir_tuple.setter

diff --git a/cornac/eval_methods/base_method.py b/cornac/eval_methods/base_method.py
@@ -192,12 +192,12 @@ def _build_modules(self):
         for user_module in [self.user_text, self.user_image, self.user_graph]:
             if user_module is None: 
                 continue
-            user_module.build(ordered_ids=self.global_uid_map)
+            user_module.build(global_id_map=self.global_uid_map)
 
         for item_module in [self.item_text, self.item_image, self.item_graph]:
             if item_module is None: 
                 continue
-            item_module.build(ordered_ids=self.global_iid_map)
+            item_module.build(global_id_map=self.global_iid_map)
 
         for data_set in [self.train_set, self.test_set, self.val_set]:
             if data_set is None: continue
@@ -254,7 +254,7 @@ def evaluate(self, model, metrics, user_based):
         for mt in (rating_metrics + ranking_metrics):
             metric_user_results[mt.name] = {}
 
-        for user_id in tqdm.tqdm(self.test_set.get_users(), disable=not self.verbose):
+        for user_id in tqdm.tqdm(self.test_set.users, disable=not self.verbose):
             # ignore unknown users when self.exclude_unknown
             if self.exclude_unknowns and self.train_set.is_unk_user(user_id):
                 continue

diff --git a/cornac/models/README.md b/cornac/models/README.md
@@ -3,17 +3,17 @@
 This directory includes the implementation of all the models (listed below) supported in Cornac. 
 Additional dependencies (CPU versions) for each model are also listed accordingly.
 
-| Model | Additional Dependencies |
-| --- | :---: |
-| [Bayesian Personalized Ranking (BPR)](bpr) | N/A |
-| [Collaborative Context Poisson Factorization (C2PF)](c2pf)| N/A |
-| [Collaborative Deep Learning (CDL)](cdl) | [requirements.txt](cdl/requirements.txt) |
-| [Collaborative Ordinal Embedding (COE)](coe) | [requirements.txt](coe/requirements.txt) |
-| [Hierarchical Poisson Factorization (HPF)](hpf) | N/A |
-| [Indexable Bayesian Personalized Ranking (IBPR)](ibpr) | [requirements.txt](ibpr/requirements.txt) |
-| [Matrix Factorization (MF)](mf) | N/A |
-| [Online Indexable Bayesian Personalized Ranking (Online IBPR)](online_ibpr) | [requirements.txt](online_ibpr/requirements.txt) |
-| [Probabilistic Collaborative Representation Learning (PCRL)](pcrl) | [requirements.txt](pcrl/requirements.txt) |
-| [Probabilistic Matrix Factorization (PMF)](pmf) | N/A |
-| [Spherical K-means (SKM)](skm) | N/A |
-| [Visual Bayesian Personalized Ranking (VBPR)](vbpr) | [requirements.txt](vbpr/requirements.txt) |
+| Model and paper | Additional dependencies | Examples |
+| --- | :---: | :---: |
+| [Bayesian Personalized Ranking (BPR)](bpr), [paper](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf) | N/A | [bpr_netflix.py](../../examples/bpr_netflix.py) |
+| [Collaborative Context Poisson Factorization (C2PF)](c2pf), [paper](https://www.ijcai.org/proceedings/2018/0370.pdf) | N/A | [c2pf_example.py](../../examples/c2pf_example.py) |
+| [Collaborative Deep Learning (CDL)](cdl), [paper](https://arxiv.org/pdf/1409.2944.pdf) | [requirements.txt](cdl/requirements.txt) | |
+| [Collaborative Ordinal Embedding (COE)](coe), [paper](http://www.hadylauw.com/publications/sdm16.pdf) | [requirements.txt](coe/requirements.txt) | |
+| [Hierarchical Poisson Factorization (HPF)](hpf), [paper](http://jakehofman.com/inprint/poisson_recs.pdf) | N/A | |
+| [Indexable Bayesian Personalized Ranking (IBPR)](ibpr), [paper](http://www.hadylauw.com/publications/cikm17a.pdf) | [requirements.txt](ibpr/requirements.txt) | [ibpr_example.py](../../examples/ibpr_example.py) |
+| [Matrix Factorization (MF)](mf), [paper](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) | N/A | |
+| [Online Indexable Bayesian Personalized Ranking (Online IBPR)](online_ibpr), [paper](http://www.hadylauw.com/publications/cikm17a.pdf) | [requirements.txt](online_ibpr/requirements.txt) | |
+| [Probabilistic Collaborative Representation Learning (PCRL)](pcrl), [paper](http://www.hadylauw.com/publications/uai18.pdf) | [requirements.txt](pcrl/requirements.txt) | |
+| [Probabilistic Matrix Factorization (PMF)](pmf), [paper](https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf) | N/A | [biased_mf.py](../../examples/biased_mf.py), [given_data.py](../../examples/given_data.py) |
+| [Spherical K-means (SKM)](skm), [paper](https://www.sciencedirect.com/science/article/pii/S092523121501509X) | N/A | |
+| [Visual Bayesian Personalized Ranking (VBPR)](vbpr), [paper](https://arxiv.org/pdf/1510.01784.pdf) | [requirements.txt](vbpr/requirements.txt) | [vbpr_tradesy.py](../../examples/vbpr_tradesy.py) |