
Merge pull request #54 from tqtg/master
Add Netflix dataset
tqtg committed Feb 27, 2019
2 parents 7c91fcc + d1a6d32 commit cfffee8
Showing 16 changed files with 2,400 additions and 2,358 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -53,13 +53,13 @@ python3 setup.py install

Additional dependencies required by models are listed [here](cornac/models/README.md).

-Some of the algorithms use `OpenMP` to speed up the training with parallelism. For OSX users, in order to run those algorithms, you might need to install `gcc` from Homebrew to have an OpenMP compiler:
+Some of the algorithms use `OpenMP` to speed up training with multithreading. For OSX users, in order to run those algorithms efficiently, you might need to install `gcc` from Homebrew to have an OpenMP compiler and install Cornac from source:

```sh
-brew install gcc
+brew install gcc | brew link gcc
```

-If you want to utilize your GPU, you might consider:
+If you want to utilize your GPUs, you might consider:

- [TensorFlow installation instructions](https://www.tensorflow.org/install/).
- [PyTorch installation instructions](https://pytorch.org/get-started/locally/).
17 changes: 12 additions & 5 deletions cornac/data/trainset.py
@@ -132,15 +132,15 @@ def __init__(self, matrix, max_rating, min_rating, global_mean, uid_map, iid_map

    @property
    def uir_tuple(self):
-        if not self.__uir_tuples:
-            self.__uir_tuples = find(self.matrix)
-        return self.__uir_tuples
+        if not self.__uir_tuple:
+            self.__uir_tuple = find(self.matrix)
+        return self.__uir_tuple

    @uir_tuple.setter
    def uir_tuple(self, input_tuple):
        if input_tuple is not None and len(input_tuple) != 3:
            raise ValueError('input_tuple required to be size 3 but size {}'.format(len(input_tuple)))
-        self.__uir_tuples = input_tuple
+        self.__uir_tuple = input_tuple

    @staticmethod
    def _rank_items_by_popularity(rating_matrix):
@@ -244,7 +244,14 @@ def from_uir(cls, data, global_uid_map=None, global_iid_map=None,
        print('Min rating = {:.1f}'.format(min_rating))
        print('Global mean = {:.1f}'.format(global_mean))

-        return cls(csr_mat, max_rating, min_rating, global_mean, uid_map, iid_map)
+        train_set = cls(csr_mat, max_rating, min_rating, global_mean, uid_map, iid_map)
+
+        # since we have triplet arrays, let's construct uir_tuple for the train_set
+        train_set.uir_tuple = (np.asarray(u_indices, dtype=np.int),
+                               np.asarray(i_indices, dtype=np.int),
+                               np.asarray(r_values, dtype=np.float))
+
+        return train_set

    def uir_iter(self, batch_size=1, shuffle=False):
        """Create an iterator over data yielding batch of users, items, and rating values
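Editor's note on the trainset.py hunks: the first renames the private attribute `__uir_tuples` to `__uir_tuple` so it matches the public property name, and the second pre-populates that cache inside `from_uir`, where the (user, item, rating) triplet arrays are already in hand and recomputing them from the sparse matrix via `find` would be wasted work. Below is a minimal, self-contained sketch of the lazily-cached property pattern; `MiniTrainSet` is illustrative, not Cornac's full `TrainSet`.

```python
import numpy as np
from scipy.sparse import csr_matrix, find


class MiniTrainSet:
    """Illustrative stand-in for TrainSet, showing only the uir_tuple caching."""

    def __init__(self, matrix):
        self.matrix = matrix      # sparse user-item rating matrix
        self.__uir_tuple = None   # lazily computed (users, items, ratings)

    @property
    def uir_tuple(self):
        # Computed once from the sparse matrix, then served from the cache.
        if not self.__uir_tuple:
            self.__uir_tuple = find(self.matrix)
        return self.__uir_tuple

    @uir_tuple.setter
    def uir_tuple(self, input_tuple):
        # Guard mirrors the one in the diff: accept None or a 3-tuple only.
        if input_tuple is not None and len(input_tuple) != 3:
            raise ValueError('input_tuple required to be size 3 but size {}'.format(len(input_tuple)))
        self.__uir_tuple = input_tuple


mat = csr_matrix(np.array([[5.0, 0.0], [0.0, 3.0]]))
ts = MiniTrainSet(mat)
u, i, r = ts.uir_tuple  # find() -> (array([0, 1]), array([0, 1]), array([5., 3.]))
```

One portability note on the added lines: `np.int` and `np.float` were plain aliases of the Python builtins, deprecated in NumPy 1.20 and removed in 1.24, so `np.int64`/`np.float64` are the durable spellings today.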
3 changes: 2 additions & 1 deletion cornac/datasets/__init__.py
@@ -1,2 +1,3 @@
from . import movielens
-from . import tradesy
+from . import tradesy
+from . import netflix
2 changes: 2 additions & 0 deletions cornac/datasets/movielens.py
@@ -2,6 +2,8 @@

"""
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
+MovieLens: https://grouplens.org/datasets/movielens/
"""

from ..utils import validate_format
78 changes: 78 additions & 0 deletions cornac/datasets/netflix.py
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-

"""
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
Data: https://www.kaggle.com/netflix-inc/netflix-prize-data/
"""

from ..utils import validate_format
from ..utils import cache
from ..data import reader

VALID_DATA_FORMATS = ['UIR', 'UIRT']


def _load(data_file, data_format='UIR'):
    """Load the Netflix dataset
    Parameters
    ----------
    data_file: str, required
        Data file name.
    data_format: str, default: 'UIR'
        Data format to be returned.
    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    data_format = validate_format(data_format, VALID_DATA_FORMATS)
    fpath = cache(url='https://static.preferred.ai/cornac/datasets/netflix/{}.zip'.format(data_file),
                  unzip=True, relative_path='netflix/{}.csv'.format(data_file))
    if data_format == 'UIR':
        return reader.read_uir(fpath, sep=',')


def load_data(data_format='UIR'):
    """Load the entire Netflix dataset
    - Number of ratings: 100,480,507
    - Number of users: 480,189
    - Number of items: 17,770
    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned.
    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    return _load('data', data_format)


def load_data_small(data_format='UIR'):
    """Load a small subset of the Netflix dataset. We draw this subsample such that
    every user has at least 10 items and each item has at least 10 users.
    - Number of ratings: 607,803
    - Number of users: 10,000
    - Number of items: 5,000
    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned.
    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    return _load('data_small', data_format)
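Editor's note: a usage sketch for the new loaders, assuming the package is installed and the download succeeds; the full dataset is a very large fetch, so the small subset is the sensible smoke test. Also observable above: `'UIRT'` passes `validate_format`, but `_load` only returns for `'UIR'`, so any other format currently yields `None`.

```python
from cornac.datasets import netflix

# Small subset: 607,803 ratings from 10,000 users on 5,000 items.
data = netflix.load_data_small(data_format='UIR')
print(len(data))  # expected: 607803
print(data[0])    # one (user, item, rating) tuple
```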
16 changes: 8 additions & 8 deletions cornac/datasets/tradesy.py
@@ -4,10 +4,10 @@
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
Original data: http://jmcauley.ucsd.edu/data/tradesy/
-This data is used in the VBPR paper:
-- Number of users: 92835
-- Number of items: 166526
-- Number of feedback: 396636 (410186 is reported but there are duplicates)
+This data is used in the VBPR paper. After cleaning the data, we have:
+- Number of feedback: 394,421 (410,186 is reported but there are duplicates)
+- Number of users: 19,243 (19,823 is reported due to duplicates)
+- Number of items: 165,906 (166,521 is reported due to duplicates)
"""

@@ -26,8 +26,8 @@ def load_data():
        Data in the form of a list of tuples (user, item, feedback).
    """
-    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/data.csv',
-                  relative_path='tradesy/data.csv')
+    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/data.zip',
+                  unzip=True, relative_path='tradesy/data.csv')
    return reader.read_uir(fpath, sep=',', skip_lines=1)


@@ -40,7 +40,7 @@ def load_feature():
        Item-feature dictionary. Each feature vector is a Numpy array of size 4096.
    """
-    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/item_feature.pkl',
-                  relative_path='tradesy/item_feature.pkl')
+    fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/item_feature.zip',
+                  unzip=True, relative_path='tradesy/item_feature.pkl')
    with open(fpath, 'rb') as f:
        return pickle.load(f)
