Skip to content

Commit

Permalink
Update Tradesy dataset and refactor code (#58)
Browse files Browse the repository at this point in the history
* Add read_ui function in reader for reading implicit feedback

* Update Tradesy dataset with new data format

* Refactor code
  • Loading branch information
tqtg committed Mar 18, 2019
1 parent 1907603 commit a01af60
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 51 deletions.
46 changes: 39 additions & 7 deletions cornac/data/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
import itertools


def read_uir(path_to_data_file, u_col=0, i_col=1, r_col=2, sep='\t', skip_lines=0):
def read_uir(fpath, u_col=0, i_col=1, r_col=2, sep='\t', skip_lines=0):
"""Read data in the form of triplets (user, item, rating).
Parameters
----------
path_to_data_file: str
fpath: str
Path to the data file
u_col: int, default: 0
Expand All @@ -32,13 +32,45 @@ def read_uir(path_to_data_file, u_col=0, i_col=1, r_col=2, sep='\t', skip_lines=
Returns
-------
uir_triplets: :obj:`iterable`
triplets: :obj:`iterable`
Data in the form of list of tuples of (user, item, rating).
"""
uir_triplets = []
with open(path_to_data_file, 'r') as f:
triplets = []
with open(fpath, 'r') as f:
for line in itertools.islice(f, skip_lines, None):
tokens = [token.strip() for token in line.split(sep)]
uir_triplets.append((tokens[u_col], tokens[i_col], float(tokens[r_col])))
return uir_triplets
triplets.append((tokens[u_col], tokens[i_col], float(tokens[r_col])))
return triplets


def read_ui(fpath, value=1.0, sep='\t', skip_lines=0):
    """Read data in the form of implicit feedback user-items.
    Each line starts with a user id followed by multiple item ids.

    Parameters
    ----------
    fpath: str
        Path to the data file

    value: float, default: 1.0
        Value assigned to each (user, item) feedback pair.

    sep: str, default: \t
        The delimiter string.

    skip_lines: int, default: 0
        Number of first lines to skip

    Returns
    -------
    triplets: :obj:`iterable`
        Data in the form of list of tuples of (user, item, value).

    """
    triplets = []
    with open(fpath, 'r') as f:
        for line in itertools.islice(f, skip_lines, None):
            tokens = [token.strip() for token in line.split(sep)]
            # Emit one (user, item, value) tuple per item id. Tuples (not
            # lists) keep the output consistent with the docstring and with
            # read_uir(), which also returns a list of tuples.
            triplets.extend((tokens[0], iid, value) for iid in tokens[1:])
    return triplets
16 changes: 8 additions & 8 deletions cornac/datasets/movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
VALID_DATA_FORMATS = ['UIR', 'UIRT']


def load_100k(data_format='UIR'):
def load_100k(fmt='UIR'):
"""Load the MovieLens 100K dataset
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -27,19 +27,19 @@ def load_100k(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
data_format = validate_format(data_format, VALID_DATA_FORMATS)
fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='http://files.grouplens.org/datasets/movielens/ml-100k/u.data',
relative_path='ml-100k/u.data')
if data_format == 'UIR':
if fmt == 'UIR':
return reader.read_uir(fpath)


def load_1m(data_format='UIR'):
def load_1m(fmt='UIR'):
"""Load the MovieLens 1M dataset
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -48,8 +48,8 @@ def load_1m(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
data_format = validate_format(data_format, VALID_DATA_FORMATS)
fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
relative_path='ml-1m/ratings.dat', unzip=True)
if data_format == 'UIR':
if fmt == 'UIR':
return reader.read_uir(fpath, sep='::')
26 changes: 13 additions & 13 deletions cornac/datasets/netflix.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
VALID_DATA_FORMATS = ['UIR', 'UIRT']


def _load(data_file, data_format='UIR'):
def _load(fname, fmt='UIR'):
"""Load the Netflix dataset
Parameters
----------
data_file: str, required
fname: str, required
Data file name.
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -30,22 +30,22 @@ def _load(data_file, data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
data_format = validate_format(data_format, VALID_DATA_FORMATS)
fpath = cache(url='https://static.preferred.ai/cornac/datasets/netflix/{}.zip'.format(data_file),
unzip=True, relative_path='netflix/{}.csv'.format(data_file))
if data_format == 'UIR':
fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='https://static.preferred.ai/cornac/datasets/netflix/{}.zip'.format(fname),
unzip=True, relative_path='netflix/{}.csv'.format(fname))
if fmt == 'UIR':
return reader.read_uir(fpath, sep=',')


def load_data(data_format='UIR'):
def load_data(fmt='UIR'):
"""Load the Netflix entire dataset
- Number of ratings: 100,480,507
- Number of users: 480,189
- Number of items: 17,770
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -54,10 +54,10 @@ def load_data(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
return _load('data', data_format)
return _load('data', fmt)


def load_data_small(data_format='UIR'):
def load_data_small(fmt='UIR'):
"""Load a small subset of the Netflix dataset. We draw this subsample such that
every user has at least 10 items and each item has at least 10 users.
- Number of ratings: 607,803
Expand All @@ -66,7 +66,7 @@ def load_data_small(data_format='UIR'):
Parameters
----------
data_format: str, default: 'UIR'
fmt: str, default: 'UIR'
Data format to be returned.
Returns
Expand All @@ -75,4 +75,4 @@ def load_data_small(data_format='UIR'):
Data in the form of a list of tuples depending on the given data format.
"""
return _load('data_small', data_format)
return _load('data_small', fmt)
12 changes: 6 additions & 6 deletions cornac/datasets/tradesy.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,19 @@
from ..utils import cache
from ..data import reader

import pickle


def load_data():
"""Load the feedback observations
Returns
-------
data: array-like
Data in the form of a list of tuples (user, item , feedback).
Data in the form of a list of tuples (user, item, 1).
"""
fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/data.zip',
unzip=True, relative_path='tradesy/data.csv')
return reader.read_uir(fpath, sep=',', skip_lines=1)
fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/users.zip',
unzip=True, relative_path='tradesy/users.csv')
return reader.read_ui(fpath, sep=',')


def load_feature():
Expand All @@ -40,6 +38,8 @@ def load_feature():
Item-feature dictionary. Each feature vector is a Numpy array of size 4096.
"""
import pickle

fpath = cache(url='https://static.preferred.ai/cornac/datasets/tradesy/item_feature.zip',
unzip=True, relative_path='tradesy/item_feature.pkl')
with open(fpath, 'rb') as f:
Expand Down
16 changes: 7 additions & 9 deletions cornac/eval_methods/base_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,28 @@ class BaseMethod:
The format of given data.
total_users: int, optional, default: None
Total number of unique users in the data including train, val, and test sets
Total number of unique users in the data including train, val, and test sets.
total_items: int, optional, default: None
Total number of unique items in the data including train, val, and test sets
Total number of unique items in the data including train, val, and test sets.
rating_threshold: float, optional, default: 1
The minimum value that is considered to be a good rating used for ranking, \
e.g, if the ratings are in {1, ..., 5}, then good_rating = 4.
rating_threshold: float, optional, default: 1.0
The threshold to convert ratings into positive or negative feedback for ranking metrics.
exclude_unknowns: bool, optional, default: False
Ignore unknown users and items (cold-start) during evaluation and testing
Ignore unknown users and items (cold-start) during evaluation.
verbose: bool, optional, default: False
Output running log
"""

def __init__(self, data=None,
data_format='UIR',
fmt='UIR',
rating_threshold=1.0,
exclude_unknowns=False,
verbose=False,
**kwargs):
self._data = data
self.data_format = validate_format(data_format, VALID_DATA_FORMATS)
self.data_format = validate_format(fmt, VALID_DATA_FORMATS)
self.train_set = None
self.test_set = None
self.val_set = None
Expand Down
4 changes: 2 additions & 2 deletions cornac/eval_methods/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ class CrossValidation(BaseMethod):
Output running log
"""

def __init__(self, data, data_format='UIR', n_folds=5, rating_threshold=1., partition=None,
def __init__(self, data, fmt='UIR', n_folds=5, rating_threshold=1., partition=None,
exclude_unknowns=True, verbose=False, **kwargs):
BaseMethod.__init__(self, data=data, data_format=data_format, rating_threshold=rating_threshold,
BaseMethod.__init__(self, data=data, fmt=fmt, rating_threshold=rating_threshold,
exclude_unknowns=exclude_unknowns, verbose=verbose, **kwargs)
self.n_folds = n_folds
self.current_fold = 0
Expand Down
10 changes: 4 additions & 6 deletions cornac/eval_methods/ratio_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
"""

from ..utils.common import safe_indexing, validate_format
from ..utils.common import safe_indexing
from math import ceil
from .base_method import BaseMethod
from ..data import MatrixTrainSet, TestSet
from ..experiment.result import SingleModelResult
import numpy as np

Expand All @@ -21,7 +20,7 @@ class RatioSplit(BaseMethod):
data: ..., required
The input data in the form of triplets (user, item, rating).
data_format: str, optional, default: "UIR"
fmt: str, optional, default: "UIR"
The format of input data:
- UIR: (user, item, rating) triplet data
- UIRT: (user, item , rating, timestamp) quadruplet data
Expand Down Expand Up @@ -51,11 +50,10 @@ class RatioSplit(BaseMethod):
Output running log
"""

def __init__(self, data, data_format='UIR', test_size=0.2, val_size=0.0, rating_threshold=1.0, shuffle=True,
def __init__(self, data, fmt='UIR', test_size=0.2, val_size=0.0, rating_threshold=1.0, shuffle=True,
seed=None, exclude_unknowns=False, verbose=False, **kwargs):
BaseMethod.__init__(self, data=data, data_format=data_format, rating_threshold=rating_threshold,
BaseMethod.__init__(self, data=data, fmt=fmt, rating_threshold=rating_threshold,
exclude_unknowns=exclude_unknowns, verbose=verbose, **kwargs)

self._shuffle = shuffle
self._seed = seed
self._train_size, self._val_size, self._test_size = self.validate_size(val_size, test_size, len(self._data))
Expand Down

0 comments on commit a01af60

Please sign in to comment.