-
Notifications
You must be signed in to change notification settings - Fork 135
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #54 from tqtg/master
Add Netflix dataset
- Loading branch information
Showing
16 changed files
with
2,400 additions
and
2,358 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from . import movielens | ||
from . import tradesy | ||
from . import tradesy | ||
from . import netflix |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com> | ||
Data: https://www.kaggle.com/netflix-inc/netflix-prize-data/ | ||
""" | ||
|
||
from ..utils import validate_format | ||
from ..utils import cache | ||
from ..data import reader | ||
|
||
# Formats accepted by the `validate_format` check in `_load`.
# NOTE(review): `_load` only has a load path for 'UIR'; 'UIRT' passes validation
# but is not actually read from the file -- confirm whether it should be listed.
VALID_DATA_FORMATS = ['UIR', 'UIRT'] | ||
|
||
|
||
def _load(data_file, data_format='UIR'):
    """Load one of the Netflix data files.

    The zip archive named after ``data_file`` is obtained through ``cache``
    (with ``unzip=True``) and the path it returns is parsed with
    ``reader.read_uir`` using a comma separator.

    Parameters
    ----------
    data_file: str, required
        Data file name (without extension). It is substituted into both the
        download URL and the relative path of the cached CSV file.

    data_format: str, default: 'UIR'
        Data format to be returned. It is validated against
        ``VALID_DATA_FORMATS``; only 'UIR' can currently be loaded.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.

    Raises
    ------
    NotImplementedError
        If the validated format is anything other than 'UIR'.
    """
    data_format = validate_format(data_format, VALID_DATA_FORMATS)
    if data_format != 'UIR':
        # Only the 'UIR' reader is wired up here. Fail loudly, and before any
        # download happens, instead of falling off the end of the function and
        # handing the caller a silent None.
        raise NotImplementedError(
            "Data format '{}' is not supported for the Netflix dataset yet; "
            "use 'UIR' instead.".format(data_format))

    fpath = cache(url='https://static.preferred.ai/cornac/datasets/netflix/{}.zip'.format(data_file),
                  unzip=True, relative_path='netflix/{}.csv'.format(data_file))
    return reader.read_uir(fpath, sep=',')
|
||
|
||
def load_data(data_format='UIR'):
    """Load the entire Netflix dataset.

    Dataset statistics:

    - Number of ratings: 100,480,507
    - Number of users: 480,189
    - Number of items: 17,770

    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned; forwarded unchanged to ``_load``.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    # 'data' is the file name that `_load` uses for the complete dataset.
    full_dataset = 'data'
    return _load(full_dataset, data_format)
|
||
|
||
def load_data_small(data_format='UIR'):
    """Load a small subset of the Netflix dataset.

    The subsample was drawn such that every user has at least 10 items and
    each item has at least 10 users.

    - Number of ratings: 607,803
    - Number of users: 10,000
    - Number of items: 5,000

    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned; forwarded unchanged to ``_load``.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    # 'data_small' is the file name that `_load` uses for the subsample.
    subset_name = 'data_small'
    return _load(subset_name, data_format)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.