Merge pull request #64 from saghiles/master
Reorganize results; add Amazon Office datasets; add tests for the Graph module and Office data
saghiles committed Mar 22, 2019
2 parents fcc5bfd + 9bd66a0 commit 177508e
Showing 14 changed files with 6,632 additions and 5,859 deletions.
cornac/data/graph.py (2 changes: 1 addition & 1 deletion)
@@ -26,7 +26,7 @@ def _build_triplet(self, global_id_map):
         for i, j, val in self.raw_data:
             self.map_data.append([global_id_map[i], global_id_map[j], val])
         self.map_data = np.asanyarray(self.map_data)
-        self.raw_data = None
+        #self.raw_data = None
 
     def _build_sparse_matrix(self, triplet):
         """Build sparse adjacency matrix
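Commenting out `self.raw_data = None` keeps the original triplets available after the ids are remapped, presumably so the new Graph-module tests can inspect them. A standalone sketch of what `_build_triplet` computes, with a hypothetical id map:

    import numpy as np

    # Hypothetical raw triplets and global id map, for illustration only.
    raw_data = [('a', 'b', 1.0), ('b', 'c', 2.0)]
    global_id_map = {'a': 0, 'b': 1, 'c': 2}

    # Mirrors _build_triplet: remap raw ids to contiguous integer ids.
    map_data = []
    for i, j, val in raw_data:
        map_data.append([global_id_map[i], global_id_map[j], val])
    map_data = np.asanyarray(map_data)

    print(map_data)   # [[0. 1. 1.], [1. 2. 2.]]
    print(raw_data)   # still intact after this change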
cornac/datasets/amazon_office.py (55 changes: 55 additions & 0 deletions)
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-

"""
@author: Aghiles Salah <asalah@smu.edu.sg>
This data is built based on the Amazon datasets provided by Julian McAuley at: http://jmcauley.ucsd.edu/data/amazon/
"""

from ..utils import validate_format
from ..utils import cache
from ..data import reader

VALID_DATA_FORMATS = ['UIR', 'UIRT']


def load_rating(data_format='UIR'):
    """Load the user-item ratings

    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the specified data format.
    """

    data_format = validate_format(data_format, VALID_DATA_FORMATS)
    fpath = cache(url='https://static.preferred.ai/cornac/datasets/amazon_office/rating.txt',
                  relative_path='amazon_office/rating.txt')
    if data_format == 'UIR':
        return reader.read_uir(fpath,sep=' ')


def load_context(data_format='UIR'):
    """Load the item-item interactions

    Parameters
    ----------
    data_format: str, default: 'UIR'
        Data format to be returned.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the specified data format.
    """

    data_format = validate_format(data_format, VALID_DATA_FORMATS)
    fpath = cache(url='https://static.preferred.ai/cornac/datasets/amazon_office/context.txt',
                  relative_path='amazon_office/context.txt')
    if data_format == 'UIR':
        return reader.read_uir(fpath,sep=' ')
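A minimal usage sketch for the two loaders added above (assuming this commit is installed and network access is available; both calls download once and cache the file locally):

    from cornac.datasets import amazon_office

    # User-item ratings as a list of (user, item, rating) tuples.
    ratings = amazon_office.load_rating(data_format='UIR')

    # Item-item context interactions in the same tuple format.
    contexts = amazon_office.load_context(data_format='UIR')

    print(len(ratings), ratings[:3])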
cornac/eval_methods/cross_validation.py (4 changes: 2 additions & 2 deletions)
@@ -98,12 +98,12 @@ def _next_fold(self):
         self.current_fold = 0
 
     def evaluate(self, model, metrics, user_based):
-        result = CVSingleModelResult()
+        result = CVSingleModelResult(model.name, metrics)
 
         for fold in range(self.n_folds):
             self._get_train_test()
             avg_res, per_user_res = BaseMethod.evaluate(self, model, metrics, user_based)
-            result._add_fold_res(fold=fold, metric_avg_results=avg_res)
+            result.add_fold_res(fold=fold, metric_avg_results=avg_res)
             self._next_fold()
         result._compute_avg_res()
         return result
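The renamed `add_fold_res` records one dict of averaged metric values per fold, and `_compute_avg_res` then averages those dicts across folds. The arithmetic is essentially the following (metric values are made up):

    # Per-fold averages, as add_fold_res would record them.
    per_fold_avg = {0: {'MAE': 0.81, 'RMSE': 1.02},
                    1: {'MAE': 0.79, 'RMSE': 0.98}}

    # Mirrors the accumulation loop in _compute_avg_res.
    avg = {mt: 0.0 for mt in per_fold_avg[0]}
    for f in per_fold_avg:
        for mt in per_fold_avg[f]:
            avg[mt] += per_fold_avg[f][mt] / len(per_fold_avg)

    print(avg)  # {'MAE': 0.8, 'RMSE': 1.0}, up to float rounding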
cornac/eval_methods/ratio_split.py (4 changes: 3 additions & 1 deletion)
@@ -125,4 +125,6 @@ def split(self):
     def evaluate(self, model, metrics, user_based):
         self.split()
         metric_avg_results, per_user_results = BaseMethod.evaluate(self, model, metrics, user_based)
-        return SingleModelResult(metric_avg_results, per_user_results)
+        res = SingleModelResult(model.name, metrics, metric_avg_results, per_user_results)
+        res.organize_avg_res()
+        return res
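With this change the result carries its model name and metric objects from construction, so organizing the averages into a DataFrame needs no arguments. A sketch with hypothetical averaged values (import path and metric names assumed as in this commit):

    from cornac.experiment.result import SingleModelResult
    from cornac.metrics import MAE, RMSE

    # Hypothetical averages, as BaseMethod.evaluate would return them.
    res = SingleModelResult('PMF', [MAE(), RMSE()], {'MAE': 0.8, 'RMSE': 1.0})
    res.organize_avg_res()
    print(res.avg)  # one-row DataFrame: index 'PMF', columns 'MAE' and 'RMSE'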
cornac/experiment/experiment.py (8 changes: 2 additions & 6 deletions)
@@ -44,7 +44,7 @@ def __init__(self, eval_method, models, metrics, user_based=True, verbose=False)
         self.verbose = verbose
         from ..eval_methods.ratio_split import RatioSplit
         from ..eval_methods.cross_validation import CrossValidation
-        if isinstance(eval_method, RatioSplit):
+        if isinstance(self.eval_method, RatioSplit):
             self.results = Result()
         elif isinstance(eval_method, CrossValidation):
             self.results = CVResult(eval_method.n_folds)
@@ -86,7 +86,6 @@ def dict_depth(self, d):

     # modify this function to accommodate several models
     def run(self):
-        model_names = []
         metric_names = []
         organized_metrics = {'ranking': [], 'rating': []}

@@ -98,10 +97,7 @@ def run(self):
         for model in self.models:
             if self.verbose:
                 print(model.name)
-
-            model_names.append(model.name)
-            model_res = self.eval_method.evaluate(model=model, metrics=organized_metrics, user_based=self.user_based)
-            model_res._organize_avg_res(model_name=model.name, metric_names=metric_names)
+            model_res = self.eval_method.evaluate(model=model, metrics=self.metrics, user_based=self.user_based)
             self.results._add_model_res(res=model_res, model_name=model.name)
 
         self.results.show()
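Since the metric bookkeeping now lives in the result classes, `run` passes `self.metrics` straight through to the evaluation method. A rough end-to-end sketch (PMF and all parameter values are arbitrary illustrative choices, not part of this commit):

    import cornac
    from cornac.datasets import amazon_office
    from cornac.eval_methods import RatioSplit
    from cornac.metrics import MAE, RMSE
    from cornac.models import PMF

    # Load the new Amazon Office ratings and split them for evaluation.
    data = amazon_office.load_rating(data_format='UIR')
    ratio_split = RatioSplit(data=data, test_size=0.2, exclude_unknowns=False)

    # One model, two rating metrics; results are printed by run().
    exp = cornac.Experiment(eval_method=ratio_split,
                            models=[PMF(k=10)],
                            metrics=[MAE(), RMSE()],
                            user_based=True)
    exp.run()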
cornac/experiment/result.py (39 changes: 24 additions & 15 deletions)
@@ -15,20 +15,28 @@ class SingleModelResult:
     ----------
     """
 
-    def __init__(self, metric_avg_results, metric_user_results=None):
+    def __init__(self, model_name, metrics, metric_avg_results, metric_user_results=None):
+        self.model_name = model_name
+        self.metric_names = self._get_metric_names(metrics)
         self.avg = metric_avg_results
         self.per_user = metric_user_results
 
-    def _organize_avg_res(self, model_name, metric_names):
-        self.avg = self._get_data_frame(avg_res=self.avg, model_name=model_name, metric_names=metric_names)
+    def organize_avg_res(self):
+        self.avg = self._get_data_frame(avg_res=self.avg)
 
-    def _get_data_frame(self, avg_res, model_name, metric_names):
-        avg_res = [avg_res.get(mt_name, np.nan) for mt_name in metric_names]
+    def _get_data_frame(self, avg_res):
+        avg_res = [avg_res.get(mt_name, np.nan) for mt_name in self.metric_names]
         avg_res = np.asarray(avg_res)
-        avg_res = avg_res.reshape(1, len(metric_names))
-        avg_res = pd.DataFrame(data=avg_res, index=np.asarray([model_name]), columns=np.asarray(metric_names))
+        avg_res = avg_res.reshape(1, len(self.metric_names))
+        avg_res = pd.DataFrame(data=avg_res, index=np.asarray([self.model_name]), columns=np.asarray(self.metric_names))
         return avg_res
 
+    def _get_metric_names(self, metrics):
+        metric_names = []
+        for mt in metrics:
+            metric_names.append(mt.name)
+        return metric_names
+
 
 class CVSingleModelResult(SingleModelResult):
     """ Cross Validation Result Class for a single model
@@ -37,12 +45,13 @@ class CVSingleModelResult(SingleModelResult):
     ----------
     """
 
-    def __init__(self, metric_avg_results=None):
-        self.avg = metric_avg_results
+    def __init__(self, model_name, metrics, metric_avg_results={}):
+        SingleModelResult.__init__(self, model_name, metrics, metric_avg_results)
+        #self.avg = metric_avg_results
         self.per_fold_avg = {}
-        self.avg = {}
+        #self.avg = {}
 
-    def _add_fold_res(self, fold, metric_avg_results):
+    def add_fold_res(self, fold, metric_avg_results):
         # think to organize the results first
         self.per_fold_avg[fold] = metric_avg_results
 
@@ -52,14 +61,14 @@ def _compute_avg_res(self):
         for f in self.per_fold_avg:
             for mt in self.per_fold_avg[f]:
                 self.avg[mt] += self.per_fold_avg[f][mt] / len(self.per_fold_avg)
+        self._organize_avg_res()
 
-    def _organize_avg_res(self, model_name, metric_names):
+    def _organize_avg_res(self):
         # global avg
-        self.avg = self._get_data_frame(avg_res=self.avg, model_name=model_name, metric_names=metric_names)
+        self.avg = self._get_data_frame(avg_res=self.avg)
         # per_fold avg
         for f in self.per_fold_avg:
-            self.per_fold_avg[f] = self._get_data_frame(avg_res=self.per_fold_avg[f], model_name=model_name,
-                                                        metric_names=metric_names)
+            self.per_fold_avg[f] = self._get_data_frame(avg_res=self.per_fold_avg[f])
 
 
 class Result:
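For reference, the one-row DataFrame that `_get_data_frame` now builds from instance state can be reproduced with plain numpy and pandas (names and values hypothetical); note the `np.nan` fallback for metrics missing from the averages dict:

    import numpy as np
    import pandas as pd

    model_name = 'PMF'                    # hypothetical
    metric_names = ['MAE', 'RMSE', 'Recall@20']
    avg_res = {'MAE': 0.8, 'RMSE': 1.0}   # 'Recall@20' absent -> NaN

    row = [avg_res.get(mt_name, np.nan) for mt_name in metric_names]
    row = np.asarray(row).reshape(1, len(metric_names))
    df = pd.DataFrame(data=row, index=[model_name], columns=metric_names)
    print(df)
    #      MAE  RMSE  Recall@20
    # PMF  0.8   1.0        NaN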
