Merge pull request #189 from TUW-GEO/rolmetrics
Fix p-value bug, wrong clip parameter
wpreimes committed Jun 24, 2020
2 parents ca49902 + fc54e36 commit 4954aee
Showing 7 changed files with 145 additions and 107 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
@@ -42,7 +42,8 @@ install:
- which python

script:
- python setup.py test
- python setup.py test --addopts "--cache-clear -m 'not full_framework'"
- python setup.py test --addopts "--cache-clear -m 'full_framework'"
after_success:
# report coverage results to coveralls.io
- pip install coveralls
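The split test run above separates the suite by the pytest marker full_framework: the first command runs everything not carrying the marker, the second only the marked (long-running validation-framework) tests, and --cache-clear clears the pytest cache at the start of each run. A minimal sketch of how such a marker is applied (the marker name comes from this diff; the test functions below are made up, not pytesmo's):

    import pytest

    @pytest.mark.full_framework          # collected by -m 'full_framework'
    def test_full_validation_run():
        ...                              # long-running end-to-end test

    def test_fast_unit():                # collected by -m 'not full_framework'
        assert 1 + 1 == 2

Recent pytest versions warn about unknown marks unless they are registered, e.g. under a markers option in setup.cfg or pytest.ini.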
19 changes: 5 additions & 14 deletions appveyor.yml
@@ -2,26 +2,21 @@ build: false

environment:
  matrix:
    - PYTHON_ARCH: 64
      PYTHON_VERSION: "2.7"
      PYTHON: "C:\\Python27-x64"
      MINICONDA: "C:\\Miniconda-x64" # use preinstalled conda
    - PYTHON_ARCH: 32
      PYTHON_VERSION: "2.7"
      PYTHON: "C:\\Python27"
      MINICONDA: "C:\\Miniconda"
    - PYTHON_ARCH: 64
      PYTHON: "C:\\Python36-x64"
      PYTHON_VERSION: "3.6"
      MINICONDA: "C:\\Miniconda36-x64"
      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
    - PYTHON_ARCH: 64
      PYTHON: "C:\\Python37-x64"
      PYTHON_VERSION: "3.7"
      MINICONDA: "C:\\Miniconda37-x64"
      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
    - PYTHON_ARCH: 64
      PYTHON: "C:\\Python38-x64"
      PYTHON_VERSION: "3.8"
      MINICONDA: "C:\\Miniconda37-x64"
      MINICONDA: "C:\\Miniconda38-x64"
      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019


install:
@@ -37,9 +32,6 @@ install:
- ECHO "Filesystem root:"
- ps: "ls \"C:/\""

- ECHO "Installed SDKs:"
- ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""

# Upgrade to the latest version of pip to avoid it displaying warnings
# about it being out of date.
- cmd: python -m pip install --upgrade pip
@@ -61,8 +53,7 @@ install:
- cmd: activate pytesmo
# Install dependencies
- conda env update -f environment.yml -n pytesmo
- pip install pytest==3.8.2 # While supporting python2 x86


# Print debug infos
- git status
- conda info -a
2 changes: 1 addition & 1 deletion pytesmo/validation_framework/adapters.py
@@ -54,7 +54,7 @@ def __get_dataframe(self, data):
        return data

    def __drop_tz_info(self, data):
        if data.index.tz is not None:
        if (hasattr(data.index, 'tz') and (data.index.tz is not None)):
            warnings.warn('Dropping timezone information ({}) for data from reader {}'.format(data.index.tz, self.cls.__class__.__name__))
            data.index = data.index.tz_convert(None)
        return data
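The added hasattr guard in __drop_tz_info presumably covers readers whose data does not come with a DatetimeIndex: a plain index has no tz attribute, so the old check raised an AttributeError instead of passing the data through. A minimal sketch of the two cases, with made-up data:

    import pandas as pd

    tz_index = pd.date_range('2020-01-01', periods=3, freq='D', tz='UTC')
    plain_index = pd.Index([1, 2, 3])    # e.g. data not indexed by time

    hasattr(tz_index, 'tz')      # True  -> tz is checked and, if set, dropped
    hasattr(plain_index, 'tz')   # False -> guard short-circuits instead of raising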
203 changes: 125 additions & 78 deletions pytesmo/validation_framework/metric_calculators.py
@@ -100,6 +100,106 @@ def _get_metric_template(metr):

    return {m: lut[m] for m in metr}


class MonthsMetricsAdapter(object):
    """ Adapt MetricCalculators to calculate metrics for groups across months """

    def __init__(self, calculator, sets=None):
        """
        Add functionality to a metric calculator to calculate validation metrics
        for subsets of certain months in a time series (e.g. seasonal).

        Parameters
        ----------
        calculator : MetadataMetrics or any child of it
        sets : dict, optional (default: None)
            A dictionary consisting of a set name (which is added to the metric
            name as a suffix) and the list of months that belong to that set.
            If None is passed, we use 4 (seasonal) sets named after the first
            letter of each month used.
        """
        self.cls = calculator
        if sets is None:
            sets = {'DJF': [12, 1, 2], 'MAM': [3, 4, 5],
                    'JJA': [6, 7, 8], 'SON': [9, 10, 11],
                    'ALL': list(range(1, 13))}

        self.sets = sets

        # metadata metrics and lon, lat, gpi are excluded from applying seasonally
        self.non_seas_metrics = ['gpi', 'lon', 'lat']
        if self.cls.metadata_template is not None:
            self.non_seas_metrics += list(self.cls.metadata_template.keys())

        all_metrics = calculator.result_template
        subset_metrics = {}

        # for each subset create a copy of the metric template
        for name in sets.keys():
            for k, v in all_metrics.items():
                if k in self.non_seas_metrics:
                    subset_metrics[f"{k}"] = v
                else:
                    subset_metrics[f"{name}_{k}"] = v

        self.result_template = subset_metrics

    @staticmethod
    def filter_months(df, months, dropna=False):
        """
        Select only entries of a time series that are within certain month(s).

        Parameters
        ----------
        df : pd.DataFrame
            Time series (index.month must exist) that is filtered
        months : list
            Months for which data is kept, e.g. [12, 1, 2] to keep data for winter
        dropna : bool, optional (default: False)
            Drop rows for months that are not kept. If False, the original
            index is kept, but values in the filtered-out months are replaced
            with NaN.

        Returns
        -------
        df_filtered : pd.DataFrame
            The filtered series
        """
        dat = df.copy(True)
        dat['__index_month'] = dat.index.month
        cond = ['__index_month == {}'.format(m) for m in months]
        selection = dat.query(' | '.join(cond)).index
        dat.drop('__index_month', axis=1, inplace=True)

        if dropna:
            return dat.loc[selection]
        else:
            dat.loc[dat.index.difference(selection)] = np.nan
            return dat

    def calc_metrics(self, data, gpi_info):
        """
        Calculates the desired statistics, for each set that was defined.

        Parameters
        ----------
        data : pandas.DataFrame
            with 2 columns, the first column is the reference dataset
            named 'ref'
            the second column the dataset to compare against named 'other'
        gpi_info : tuple
            Grid point info (i.e. gpi, lon, lat)
        """
        dataset = self.result_template.copy()

        for setname, months in self.sets.items():
            df = self.filter_months(data, months=months, dropna=True)
            ds = self.cls.calc_metrics(df, gpi_info=gpi_info)
            for metric, res in ds.items():
                if metric in self.non_seas_metrics:
                    k = f"{metric}"
                else:
                    k = f"{setname}_{metric}"
                dataset[k] = res

        return dataset


class MetadataMetrics(object):
"""
@@ -352,79 +452,6 @@ def calc_metrics(self, data, gpi_info):

        return dataset


class BasicSeasonalMetrics(MetadataMetrics):
    """
    This class just computes basic metrics on a seasonal basis. It also stores information about
    gpi, lat, lon and number of observations.
    """

    def __init__(self, result_path=None, other_name='k1',
                 metadata_template=None):

        self.result_path = result_path
        self.other_name = other_name

        super(BasicSeasonalMetrics, self).__init__(other_name=other_name,
                                                   metadata_template=metadata_template)

        metrics = {'R': np.float32([np.nan]),
                   'p_R': np.float32([np.nan]),
                   'rho': np.float32([np.nan]),
                   'p_rho': np.float32([np.nan]),
                   'n_obs': np.int32([0])}

        self.seasons = ['ALL', 'DJF', 'MAM', 'JJA', 'SON']

        for season in self.seasons:
            for metric in metrics.keys():
                key = "{:}_{:}".format(season, metric)
                self.result_template[key] = metrics[metric].copy()

        self.month_to_season = np.array(['', 'DJF', 'DJF', 'MAM', 'MAM',
                                         'MAM', 'JJA', 'JJA', 'JJA', 'SON',
                                         'SON', 'SON', 'DJF'])

    def calc_metrics(self, data, gpi_info):
        """
        calculates the desired statistics

        Parameters
        ----------
        data : pandas.DataFrame
            with 2 columns, the first column is the reference dataset
            named 'ref'
            the second column the dataset to compare against named 'other'
        gpi_info : tuple
            Grid point info (i.e. gpi, lon, lat)
        """
        dataset = super(BasicSeasonalMetrics,
                        self).calc_metrics(data, gpi_info)

        for season in self.seasons:

            if season != 'ALL':
                subset = self.month_to_season[data.index.month] == season
            else:
                subset = np.ones(len(data), dtype=bool)

            if subset.sum() < 10:
                continue

            x = data['ref'].values[subset]
            y = data[self.other_name].values[subset]
            R, p_R = metrics.pearsonr(x, y)
            rho, p_rho = metrics.spearmanr(x, y)

            dataset['{:}_n_obs'.format(season)][0] = subset.sum()
            dataset['{:}_R'.format(season)][0] = R
            dataset['{:}_p_R'.format(season)][0] = p_R
            dataset['{:}_rho'.format(season)][0] = rho
            dataset['{:}_p_rho'.format(season)][0] = p_rho

        return dataset


class HSAF_Metrics(MetadataMetrics):
"""
This class computes metrics as defined by the H-SAF consortium in
@@ -613,9 +640,11 @@ class IntercomparisonMetrics(MetadataMetrics):
        if True then also tau is calculated. This is set to False by default
        since the calculation of Kendalls tau is rather slow and can significantly
        impact performance of e.g. global validation studies
    dataset_names : list
    dataset_names : list, optional (default: None)
        Names of the original datasets, that are used to find the lookup table
        for the df cols.
    metadata_template: dict, optional (default: None)
        See MetadataMetrics
    """

    def __init__(self, other_names=('k1', 'k2', 'k3'), calc_tau=False,
@@ -1129,7 +1158,7 @@ def calc_metrics(self, data, gpi_info, window_size='30d', center=True,
        return dataset


@jit(parallel=True)
@jit
def rolling_pr_rmsd(timestamps, data, window_size, center, min_periods):
"""
Computation of rolling Pearson R.
@@ -1180,11 +1209,12 @@ def rolling_pr_rmsd(timestamps, data, window_size, center, min_periods):
        if np.abs(pr_arr[i, 0]) == 1.0:
            pr_arr[i, 1] = 0.0
        else:
            df = n_obs - 2
            df = n_obs - 2.
            t_squared = pr_arr[i, 0]*pr_arr[i, 0] * \
                (df / ((1.0 - pr_arr[i, 0]) * (1.0 + pr_arr[i, 0])))
            t_squared = np.clip(t_squared, None, 1.0)
            pr_arr[i, 1] = betainc(0.5*df, 0.5, df / (df + t_squared))
            x = df / (df + t_squared)
            x = np.ma.where(x < 1.0, x, 1.0)
            pr_arr[i, 1] = betainc(0.5*df, 0.5, x)

        # rmsd
        rmsd_arr[i] = np.sqrt(
@@ -1227,3 +1257,20 @@ def get_dataset_names(ref_key, datasets, n=3):
        dataset_names.append(name[0])

    return dataset_names

if __name__ == '__main__':
    calc = IntercomparisonMetrics(other_names=('k1', 'k2', 'k3'),
                                  calc_tau=False,
                                  metadata_template=dict(meta1=np.array(['TBD']),
                                                         meta2=np.float32([np.nan])))

    adapted = MonthsMetricsAdapter(calc)

    idx = pd.date_range('2000-01-01', '2010-07-21', freq='D')
    df = pd.DataFrame(index=idx,
                      data={'ref': np.random.rand(idx.size),
                            'k1': np.random.rand(idx.size),
                            'k2': np.random.rand(idx.size),
                            'k3': np.random.rand(idx.size)})

    adapted.calc_metrics(df, (0, 1, 2, {'meta1': 'meta', 'meta2': 12}))
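On the new MonthsMetricsAdapter: each metric key of the wrapped calculator is duplicated once per set and prefixed with the set name (e.g. ALL_n_obs, DJF_n_obs), while gpi, lon, lat and any metadata keys stay unprefixed. A small usage sketch of the static filter_months helper with made-up data; the assertions restate its documented behaviour, not output from this commit:

    import numpy as np
    import pandas as pd
    from pytesmo.validation_framework.metric_calculators import MonthsMetricsAdapter

    idx = pd.date_range('2019-12-01', '2020-03-31', freq='D')
    df = pd.DataFrame({'ref': np.arange(idx.size, dtype=float)}, index=idx)

    # dropna=True: only rows from the requested months survive
    djf = MonthsMetricsAdapter.filter_months(df, months=[12, 1, 2], dropna=True)
    assert set(djf.index.month) == {12, 1, 2}

    # dropna=False (default): index is kept, other months become NaN
    masked = MonthsMetricsAdapter.filter_months(df, months=[12, 1, 2])
    assert masked.shape == df.shape
    assert masked.loc['2020-03'].isna().all().all()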
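The p-value fix itself: the old line np.clip(t_squared, None, 1.0) capped the t statistic at 1.0, distorting the p-value whenever t_squared exceeded 1; what needs guarding is the incomplete-beta argument x = df / (df + t_squared), which betainc requires to lie in [0, 1]. A standalone sketch of the corrected two-sided p-value for Pearson's r, checked against scipy.stats.pearsonr (illustrative data, not code from the commit):

    import numpy as np
    from scipy.special import betainc
    from scipy.stats import pearsonr

    rng = np.random.default_rng(42)
    x = rng.normal(size=50)
    y = 0.5 * x + rng.normal(size=50)

    r, p_expected = pearsonr(x, y)

    n_obs = len(x)
    df = n_obs - 2.
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))

    # old, buggy guard: t_squared = np.clip(t_squared, None, 1.0)
    arg = df / (df + t_squared)
    arg = min(arg, 1.0)              # clip the betainc argument, not t_squared
    p = betainc(0.5 * df, 0.5, arg)

    assert np.isclose(p, p_expected)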
1 change: 1 addition & 0 deletions setup.cfg
@@ -48,6 +48,7 @@ addopts = tests
# in order to write a coverage file that can be read by Jenkins.
addopts =
    --cov pytesmo --cov-report term-missing
    --cov-append
    --verbose
    --mpl

13 changes: 5 additions & 8 deletions tests/test_validation_framwork/test_metric_calculators.py
@@ -35,8 +35,8 @@
from pytesmo.validation_framework.metric_calculators import TCMetrics
from pytesmo.validation_framework.metric_calculators import FTMetrics
from pytesmo.validation_framework.metric_calculators import HSAF_Metrics
from pytesmo.validation_framework.metric_calculators import BasicSeasonalMetrics
from pytesmo.validation_framework.metric_calculators import RollingMetrics
from pytesmo.validation_framework.metric_calculators import MonthsMetricsAdapter
import pytesmo.metrics as metrics

import warnings
@@ -385,7 +385,7 @@ def test_BasicSeasonalMetrics():
    df = make_some_data()
    data = df[['ref', 'k1']]

    metriccalc = BasicSeasonalMetrics(other_name='k1')
    metriccalc = MonthsMetricsAdapter(BasicMetrics(other_name='k1'))
    res = metriccalc.calc_metrics(data, gpi_info=(0, 0, 0))

    should = dict(ALL_n_obs=np.array([366]), dtype='float32')
@@ -403,8 +403,9 @@ def test_BasicSeasonalMetrics_metadata():

    metadata_dict_template = {'network': np.array(['None'], dtype='U256')}

    metriccalc = BasicSeasonalMetrics(
        other_name='k1', metadata_template=metadata_dict_template)

    metriccalc = MonthsMetricsAdapter(BasicMetrics(
        other_name='k1', metadata_template=metadata_dict_template))
    res = metriccalc.calc_metrics(
        data, gpi_info=(0, 0, 0, {'network': 'SOILSCAPE'}))

@@ -470,7 +471,3 @@ def test_RollingMetrics():

    rmsd_arr = np.array(rmsd_arr)
    np.testing.assert_almost_equal(dataset['RMSD'][0][29:-1], rmsd_arr)


if __name__ == '__main__':
    test_RollingMetrics()
