
Merge pull request #54 from tqtg/master
Add Netflix dataset
tqtg committed Feb 27, 2019
2 parents 7c91fcc + d1a6d32 commit cfffee8
Showing 16 changed files with 2,400 additions and 2,358 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -53,13 +53,13 @@ python3 setup.py install

Additional dependencies required by models are listed [here](cornac/models/README.md).

-Some of the algorithms use `OpenMP` to speed up the training with parallelism. For OSX users, in order to run those algorithms, you might need to install `gcc` from Homebrew to have an OpenMP compiler:
+Some of the algorithms use `OpenMP` to speed up training with multithreading. For OSX users, in order to run those algorithms efficiently, you might need to install `gcc` from Homebrew to have an OpenMP compiler and install Cornac from source:

```sh
-brew install gcc
+brew install gcc | brew link gcc
```

-If you want to utilize your GPU, you might consider:
+If you want to utilize your GPUs, you might consider:

- [TensorFlow installation instructions](https://www.tensorflow.org/install/).
- [PyTorch installation instructions](https://pytorch.org/get-started/locally/).
17 changes: 12 additions & 5 deletions cornac/data/trainset.py
@@ -132,15 +132,15 @@ def __init__(self, matrix, max_rating, min_rating, global_mean, uid_map, iid_map

    @property
    def uir_tuple(self):
-        if not self.__uir_tuples:
-            self.__uir_tuples = find(self.matrix)
-        return self.__uir_tuples
+        if not self.__uir_tuple:
+            self.__uir_tuple = find(self.matrix)
+        return self.__uir_tuple

    @uir_tuple.setter
    def uir_tuple(self, input_tuple):
        if input_tuple is not None and len(input_tuple) != 3:
            raise ValueError('input_tuple required to be size 3 but size {}'.format(len(input_tuple)))
-        self.__uir_tuples = input_tuple
+        self.__uir_tuple = input_tuple

    @staticmethod
    def _rank_items_by_popularity(rating_matrix):
@@ -244,7 +244,14 @@ def from_uir(cls, data, global_uid_map=None, global_iid_map=None,
        print('Min rating = {:.1f}'.format(min_rating))
        print('Global mean = {:.1f}'.format(global_mean))

-        return cls(csr_mat, max_rating, min_rating, global_mean, uid_map, iid_map)
+        train_set = cls(csr_mat, max_rating, min_rating, global_mean, uid_map, iid_map)
+
+        # since we have triplet arrays, let's construct uir_tuple for the train_set
+        train_set.uir_tuple = (np.asarray(u_indices, dtype=np.int),
+                               np.asarray(i_indices, dtype=np.int),
+                               np.asarray(r_values, dtype=np.float))
+
+        return train_set

    def uir_iter(self, batch_size=1, shuffle=False):
        """Create an iterator over data yielding batch of users, items, and rating values
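Editor's note on the trainset.py hunks: the first renames the private attribute `__uir_tuples` to `__uir_tuple` so it matches the public property name, and the second pre-populates that cache inside `from_uir`, where the (user, item, rating) triplet arrays are already in hand and recomputing them from the sparse matrix via `find` would be wasted work. Below is a minimal, self-contained sketch of the lazily-cached property pattern; `MiniTrainSet` is illustrative, not Cornac's full `TrainSet`.

```python
import numpy as np
from scipy.sparse import csr_matrix, find


class MiniTrainSet:
    """Illustrative stand-in for TrainSet, showing only the uir_tuple caching."""

    def __init__(self, matrix):
        self.matrix = matrix      # sparse user-item rating matrix
        self.__uir_tuple = None   # lazily computed (users, items, ratings)

    @property
    def uir_tuple(self):
        # Computed once from the sparse matrix, then served from the cache.
        if not self.__uir_tuple:
            self.__uir_tuple = find(self.matrix)
        return self.__uir_tuple

    @uir_tuple.setter
    def uir_tuple(self, input_tuple):
        # Guard mirrors the one in the diff: accept None or a 3-tuple only.
        if input_tuple is not None and len(input_tuple) != 3:
            raise ValueError('input_tuple required to be size 3 but size {}'.format(len(input_tuple)))
        self.__uir_tuple = input_tuple


mat = csr_matrix(np.array([[5.0, 0.0], [0.0, 3.0]]))
ts = MiniTrainSet(mat)
u, i, r = ts.uir_tuple  # find() -> (array([0, 1]), array([0, 1]), array([5., 3.]))
```

One portability note on the added lines: `np.int` and `np.float` were plain aliases of the Python builtins, deprecated in NumPy 1.20 and removed in 1.24, so `np.int64`/`np.float64` are the durable spellings today.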
3 changes: 2 additions & 1 deletion cornac/datasets/__init__.py
@@ -1,2 +1,3 @@
from . import movielens
-from . import tradesy
+from . import tradesy
+from . import netflix
2 changes: 2 additions & 0 deletions cornac/datasets/movielens.py
@@ -2,6 +2,8 @@

"""
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
+MovieLens: https://grouplens.org/datasets/movielens/
"""

from ..utils import validate_format
78 changes: 78 additions & 0 deletions cornac/datasets/netflix.py
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-

"""
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
Data: https://www.kaggle.com/netflix-inc/netflix-prize-data/
"""

from ..utils import validate_format
from ..utils import cache
from ..data import reader

VALID_DATA_FORMATS = ['UIR', 'UIRT']


def _load(data_file, data_format='UIR'):
    """Load the Netflix dataset
    Parameters
    ----------
    data_file: str, required
        Data file name.
    data_format: str, default: 'UIR'
        Data format to be returned.
    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    data_format = validate_format(data_format, VALID_DATA_FORMATS)
    fpath = cache(url='https://static.preferred.ai/cornac/datasets/netflix/{}.zip'.format(data_file),
                  unzip=True, relative_path='netflix/{}.csv'.format(data_file))
    if data_format == 'UIR':
        return reader.read_uir(fpath, sep=',')


def load_data(data_format='UIR'):
    """Load the entire Netflix dataset
    - Number of ratings: 100,480,507
    - Number of users: 480,189
    - Number of items: 17,770
    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned.
    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    return _load('data', data_format)


def load_data_small(data_format='UIR'):
    """Load a small subset of the Netflix dataset. We draw this subsample such that
    every user has at least 10 items and each item has at least 10 users.
    - Number of ratings: 607,803
    - Number of users: 10,000
    - Number of items: 5,000
    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned.
    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    return _load('data_small', data_format)
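Editor's note: a usage sketch for the new loaders, assuming the package is installed and the download succeeds; the full dataset is a very large fetch, so the small subset is the sensible smoke test. Also observable above: `'UIRT'` passes `validate_format`, but `_load` only returns for `'UIR'`, so any other format currently yields `None`.

```python
from cornac.datasets import netflix

# Small subset: 607,803 ratings from 10,000 users on 5,000 items.
data = netflix.load_data_small(data_format='UIR')
print(len(data))  # expected: 607803
print(data[0])    # one (user, item, rating) tuple
```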
16 changes: 8 additions & 8 deletions cornac/datasets/tradesy.py
@@ -4,10 +4,10 @@
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
Original data: http://jmcauley.ucsd.edu/data/tradesy/
-This data is used in the VBPR paper:
-- Number of users: 92835
-- Number of items: 166526
-- Number of feedback: 396636 (410186 is reported but there are duplicates)
+This data is used in the VBPR paper. After cleaning the data, we have:
+- Number of feedback: 394,421 (410,186 is reported but there are duplicates)
+- Number of users: 19,243 (19,823 is reported due to duplicates)
+- Number of items: 165,906 (166,521 is reported due to duplicates)
"""

@@ -26,8 +26,8 @@ def load_data():
        Data in the form of a list of tuples (user, item, feedback).
    """
-    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/data.csv',
-                  relative_path='tradesy/data.csv')
+    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/data.zip',
+                  unzip=True, relative_path='tradesy/data.csv')
    return reader.read_uir(fpath, sep=',', skip_lines=1)


@@ -40,7 +40,7 @@ def load_feature():
        Item-feature dictionary. Each feature vector is a Numpy array of size 4096.
    """
-    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/item_feature.pkl',
-                  relative_path='tradesy/item_feature.pkl')
+    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/item_feature.zip',
+                  unzip=True, relative_path='tradesy/item_feature.pkl')
    with open(fpath, 'rb') as f:
        return pickle.load(f)
