Skip to content

Commit

Permalink
Update Tradesy dataset and refactor code (#58)
Browse files Browse the repository at this point in the history
* Add read_ui function in reader for reading implicit feedback

* Update Tradesy dataset with new data format

* Refactor code
  • Loading branch information
tqtg committed Mar 18, 2019
1 parent 1907603 commit a01af60
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 51 deletions.
46 changes: 39 additions & 7 deletions cornac/data/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
import itertools


def read_uir(path_to_data_file, u_col=0, i_col=1, r_col=2, sep='\t', skip_lines=0):
def read_uir(fpath, u_col=0, i_col=1, r_col=2, sep='\t', skip_lines=0):
"""Read data in the form of triplets (user, item, rating).
Parameters
----------
path_to_data_file: str
fpath: str
Path to the data file
u_col: int, default: 0
Expand All @@ -32,13 +32,45 @@ def read_uir(path_to_data_file, u_col=0, i_col=1, r_col=2, sep='\t', skip_lines=
Returns
-------
uir_triplets: :obj:`iterable`
triplets: :obj:`iterable`
Data in the form of list of tuples of (user, item, rating).
"""
uir_triplets = []
with open(path_to_data_file, 'r') as f:
triplets = []
with open(fpath, 'r') as f:
for line in itertools.islice(f, skip_lines, None):
tokens = [token.strip() for token in line.split(sep)]
uir_triplets.append((tokens[u_col], tokens[i_col], float(tokens[r_col])))
return uir_triplets
triplets.append((tokens[u_col], tokens[i_col], float(tokens[r_col])))
return triplets


def read_ui(fpath, value=1.0, sep='\t', skip_lines=0):
    """Read data in the form of implicit feedback user-items.
    Each line starts with a user id followed by multiple item ids.

    Parameters
    ----------
    fpath: str
        Path to the data file

    value: float, default: 1.0
        Value assigned to each (user, item) feedback pair.

    sep: str, default: \t
        The delimiter string.

    skip_lines: int, default: 0
        Number of first lines to skip

    Returns
    -------
    triplets: :obj:`iterable`
        Data in the form of list of tuples of (user, item, value).

    """
    triplets = []
    with open(fpath, 'r') as f:
        for line in itertools.islice(f, skip_lines, None):
            tokens = [token.strip() for token in line.split(sep)]
            # Emit one (user, item, value) tuple per item id. Tuples (not
            # lists) keep the output consistent with the docstring and with
            # read_uir(), which also returns a list of tuples.
            triplets.extend((tokens[0], iid, value) for iid in tokens[1:])
    return triplets
16 changes: 8 additions & 8 deletions cornac/datasets/movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
VALID_DATA_FORMATS = ['UIR', 'UIRT']


def load_100k(data_format='UIR'):
def load_100k(fmt='UIR'):
"""Load the MovieLens 100K dataset
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -27,19 +27,19 @@ def load_100k(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
data_format = validate_format(data_format, VALID_DATA_FORMATS)
fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='http://files.grouplens.org/datasets/movielens/ml-100k/u.data',
relative_path='ml-100k/u.data')
if data_format == 'UIR':
if fmt == 'UIR':
return reader.read_uir(fpath)


def load_1m(data_format='UIR'):
def load_1m(fmt='UIR'):
"""Load the MovieLens 1M dataset
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -48,8 +48,8 @@ def load_1m(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
data_format = validate_format(data_format, VALID_DATA_FORMATS)
fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
relative_path='ml-1m/ratings.dat', unzip=True)
if data_format == 'UIR':
if fmt == 'UIR':
return reader.read_uir(fpath, sep='::')
26 changes: 13 additions & 13 deletions cornac/datasets/netflix.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
VALID_DATA_FORMATS = ['UIR', 'UIRT']


def _load(data_file, data_format='UIR'):
def _load(fname, fmt='UIR'):
"""Load the Netflix dataset
Parameters
----------
data_file: str, required
fname: str, required
Data file name.
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -30,22 +30,22 @@ def _load(data_file, data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
data_format = validate_format(data_format, VALID_DATA_FORMATS)
fpath = cache(url='https://static.preferred.ai/cornac/datasets/netflix/{}.zip'.format(data_file),
unzip=True, relative_path='netflix/{}.csv'.format(data_file))
if data_format == 'UIR':
fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='https://static.preferred.ai/cornac/datasets/netflix/{}.zip'.format(fname),
unzip=True, relative_path='netflix/{}.csv'.format(fname))
if fmt == 'UIR':
return reader.read_uir(fpath, sep=',')


def load_data(data_format='UIR'):
def load_data(fmt='UIR'):
"""Load the Netflix entire dataset
- Number of ratings: 100,480,507
- Number of users: 480,189
- Number of items: 17,770
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -54,10 +54,10 @@ def load_data(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
return _load('data', data_format)
return _load('data', fmt)


def load_data_small(data_format='UIR'):
def load_data_small(fmt='UIR'):
"""Load a small subset of the Netflix dataset. We draw this subsample such that
every user has at least 10 items and each item has at least 10 users.
- Number of ratings: 607,803
Expand All @@ -66,7 +66,7 @@ def load_data_small(data_format='UIR'):
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -75,4 +75,4 @@ def load_data_small(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
return _load('data_small', data_format)
return _load('data_small', fmt)
12 changes: 6 additions & 6 deletions cornac/datasets/tradesy.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,19 @@
from ..utils import cache
from ..data import reader

import pickle


def load_data():
"""Load the feedback observations
Returns
-------
data: array-like
Data in the form of a list of tuples (user, item , feedback).
Data in the form of a list of tuples (user, item, 1).
"""
fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/data.zip',
unzip=True, relative_path='tradesy/data.csv')
return reader.read_uir(fpath, sep=',', skip_lines=1)
fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/users.zip',
unzip=True, relative_path='tradesy/users.csv')
return reader.read_ui(fpath, sep=',')


def load_feature():
Expand All @@ -40,6 +38,8 @@ def load_feature():
Item-feature dictionary. Each feature vector is a Numpy array of size 4096.
"""
import pickle

fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/item_feature.zip',
unzip=True, relative_path='tradesy/item_feature.pkl')
with open(fpath, 'rb') as f:
Expand Down
16 changes: 7 additions & 9 deletions cornac/eval_methods/base_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,28 @@ class BaseMethod:
The format of given data.
total_users: int, optional, default: None
Total number of unique users in the data including train, val, and test sets
Total number of unique users in the data including train, val, and test sets.
total_items: int, optional, default: None
Total number of unique items in the data including train, val, and test sets
Total number of unique items in the data including train, val, and test sets.
rating_threshold: float, optional, default: 1
The minimum value that is considered to be a good rating used for ranking, \
e.g, if the ratings are in {1, ..., 5}, then good_rating = 4.
rating_threshold: float, optional, default: 1.0
The threshold to convert ratings into positive or negative feedback for ranking metrics.
exclude_unknowns: bool, optional, default: False
Ignore unknown users and items (cold-start) during evaluation and testing
Ignore unknown users and items (cold-start) during evaluation.
verbose: bool, optional, default: False
Output running log
"""

def __init__(self, data=None,
data_format='UIR',
fmt='UIR',
rating_threshold=1.0,
exclude_unknowns=False,
verbose=False,
**kwargs):
self._data = data
self.data_format = validate_format(data_format, VALID_DATA_FORMATS)
self.data_format = validate_format(fmt, VALID_DATA_FORMATS)
self.train_set = None
self.test_set = None
self.val_set = None
Expand Down
4 changes: 2 additions & 2 deletions cornac/eval_methods/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ class CrossValidation(BaseMethod):
Output running log
"""

def __init__(self, data, data_format='UIR', n_folds=5, rating_threshold=1., partition=None,
def __init__(self, data, fmt='UIR', n_folds=5, rating_threshold=1., partition=None,
exclude_unknowns=True, verbose=False, **kwargs):
BaseMethod.__init__(self, data=data, data_format=data_format, rating_threshold=rating_threshold,
BaseMethod.__init__(self, data=data, fmt=fmt, rating_threshold=rating_threshold,
exclude_unknowns=exclude_unknowns, verbose=verbose, **kwargs)
self.n_folds = n_folds
self.current_fold = 0
Expand Down
10 changes: 4 additions & 6 deletions cornac/eval_methods/ratio_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
"""

from ..utils.common import safe_indexing, validate_format
from ..utils.common import safe_indexing
from math import ceil
from .base_method import BaseMethod
from ..data import MatrixTrainSet, TestSet
from ..experiment.result import SingleModelResult
import numpy as np

Expand All @@ -21,7 +20,7 @@ class RatioSplit(BaseMethod):
data: ..., required
The input data in the form of triplets (user, item, rating).
data_format: str, optional, default: "UIR"
fmt: str, optional, default: "UIR"
The format of input data:
- UIR: (user, item, rating) triplet data
- UIRT: (user, item , rating, timestamp) quadruplet data
Expand Down Expand Up @@ -51,11 +50,10 @@ class RatioSplit(BaseMethod):
Output running log
"""

def __init__(self, data, data_format='UIR', test_size=0.2, val_size=0.0, rating_threshold=1.0, shuffle=True,
def __init__(self, data, fmt='UIR', test_size=0.2, val_size=0.0, rating_threshold=1.0, shuffle=True,
seed=None, exclude_unknowns=False, verbose=False, **kwargs):
BaseMethod.__init__(self, data=data, data_format=data_format, rating_threshold=rating_threshold,
BaseMethod.__init__(self, data=data, fmt=fmt, rating_threshold=rating_threshold,
exclude_unknowns=exclude_unknowns, verbose=verbose, **kwargs)

self._shuffle = shuffle
self._seed = seed
self._train_size, self._val_size, self._test_size = self.validate_size(val_size, test_size, len(self._data))
Expand Down

0 comments on commit a01af60

Please sign in to comment.