Skip to content

Commit

Permalink
lognormal_for_df: add broadcast support for DataFrames with duplicate index values
Browse files Browse the repository at this point in the history
  • Loading branch information
toliwaga committed Feb 13, 2020
1 parent 5163385 commit 7b0c332
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 18 deletions.
2 changes: 1 addition & 1 deletion activitysim/abm/models/atwork_subtour_destination.py
Expand Up @@ -131,7 +131,7 @@ def atwork_subtour_destination_logsums(
tour_purpose,
logsum_settings, model_settings,
skim_dict, skim_stack,
chunk_size, trace_hh_id,
chunk_size,
trace_label)

destination_sample['mode_choice_logsum'] = logsums
Expand Down
2 changes: 1 addition & 1 deletion activitysim/abm/models/joint_tour_destination.py
Expand Up @@ -234,7 +234,7 @@ def joint_tour_destination_logsums(
tour_purpose,
logsum_settings, model_settings,
skim_dict, skim_stack,
chunk_size, trace_hh_id,
chunk_size,
trace_label=tracing.extend_trace_label(trace_label, tour_type))

logsums_list.append(logsums)
Expand Down
2 changes: 1 addition & 1 deletion activitysim/abm/models/location_choice.py
Expand Up @@ -222,7 +222,7 @@ def run_location_logsums(
tour_purpose,
logsum_settings, model_settings,
skim_dict, skim_stack,
chunk_size, trace_hh_id,
chunk_size,
trace_label)

# "add_column series should have an index matching the table to which it is being added"
Expand Down
4 changes: 2 additions & 2 deletions activitysim/abm/models/trip_destination.py
Expand Up @@ -155,7 +155,7 @@ def compute_logsums(
tours_merged,
model_settings,
skims,
chunk_size, trace_hh_id,
chunk_size,
trace_label):
"""
Calculate mode choice logsums using the same recipe as for trip_mode_choice, but do it twice
Expand Down Expand Up @@ -325,7 +325,7 @@ def choose_trip_destination(
tours_merged=tours_merged,
model_settings=model_settings,
skims=skims,
chunk_size=chunk_size, trace_hh_id=trace_hh_id,
chunk_size=chunk_size,
trace_label=trace_label)

t0 = print_elapsed_time("%s.compute_logsums" % trace_label, t0)
Expand Down
2 changes: 1 addition & 1 deletion activitysim/abm/models/util/logsums.py
Expand Up @@ -53,7 +53,7 @@ def compute_logsums(choosers,
tour_purpose,
logsum_settings, model_settings,
skim_dict, skim_stack,
chunk_size, trace_hh_id, trace_label):
chunk_size, trace_label):
"""
Parameters
Expand Down
2 changes: 1 addition & 1 deletion activitysim/abm/models/util/tour_destination.py
Expand Up @@ -154,7 +154,7 @@ def run_destination_logsums(
tour_purpose,
logsum_settings, model_settings,
skim_dict, skim_stack,
chunk_size, trace_hh_id,
chunk_size,
trace_label)

destination_sample['mode_choice_logsum'] = logsums
Expand Down
3 changes: 3 additions & 0 deletions activitysim/core/input.py
Expand Up @@ -92,6 +92,9 @@ def read_from_table_info(table_info):
logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

#bug
#df.to_csv(config.output_file_path('input_data/%s.csv' % tablename), index=True)

if drop_columns:
for c in drop_columns:
logger.info("dropping column '%s'" % c)
Expand Down
84 changes: 74 additions & 10 deletions activitysim/core/random.py
Expand Up @@ -12,6 +12,7 @@

import numpy as np
import pandas as pd
from activitysim.core.util import reindex

from .tracing import print_elapsed_time

Expand Down Expand Up @@ -194,6 +195,11 @@ def _generators_for_df(self, df):
"""

# Generators are matched one-to-one with rows, so df must not contain
# duplicate index values. (Callers that need per-alternative rands for a
# duplicated index should use the broadcast option on normal_for_df /
# lognormal_for_df instead — TODO confirm against callers.)
# Replaces leftover debug code (a bare `bug` name that raised NameError)
# with a single assertion carrying a diagnostic message.
assert len(df.index.unique()) == len(df.index), \
    "duplicate index values in df passed to _generators_for_df"

df_row_states = self.row_states.loc[df.index]
Expand Down Expand Up @@ -250,10 +256,10 @@ def random_for_df(self, df, step_name, n=1):
self.row_states.loc[df.index, 'offset'] += n
return rands

def lognormal_for_df(self, df, step_name, mu, sigma):
def normal_for_df(self, df, step_name, mu, sigma, lognormal=False):
"""
Return a floating point random number in lognormal distribution for each row in df
using the appropriate random channel for each row.
Return a floating point random number in normal (or lognormal) distribution
for each row in df using the appropriate random channel for each row.
Subsequent calls (in the same step) will return the next rand for each df row
Expand Down Expand Up @@ -296,9 +302,14 @@ def to_series(x):
mu = to_series(mu)
sigma = to_series(sigma)

rands = \
np.asanyarray([prng.lognormal(mean=mu[i], sigma=sigma[i])
for i, prng in enumerate(generators)])
if lognormal:
rands = \
np.asanyarray([prng.lognormal(mean=mu[i], sigma=sigma[i])
for i, prng in enumerate(generators)])
else:
rands = \
np.asanyarray([prng.normal(loc=mu[i], scale=sigma[i])
for i, prng in enumerate(generators)])

# update offset for rows we handled
self.row_states.loc[df.index, 'offset'] += 1
Expand Down Expand Up @@ -599,9 +610,9 @@ def random_for_df(self, df, n=1):
rands = channel.random_for_df(df, self.step_name, n)
return rands

def lognormal_for_df(self, df, mu, sigma):
def normal_for_df(self, df, mu=0, sigma=1, broadcast=False):
"""
Return a single floating point random number in range [0, 1) for each row in df
Return a single floating point normal random number in range (-inf, inf) for each row in df
using the appropriate random channel for each row.
Subsequent calls (in the same step) will return the next rand for each df row
Expand All @@ -628,12 +639,65 @@ def lognormal_for_df(self, df, mu, sigma):
Returns
-------
rands : 1-D ndarray the same length as df
rands : 1-D ndarray the same length as df (or Series with same index as df)
a single float in lognormal distribution for each row in df
"""

channel = self.get_channel_for_df(df)
rands = channel.lognormal_for_df(df, self.step_name, mu, sigma)

if broadcast:
alts_df = df
df = df.index.unique().to_series()
rands = channel.normal_for_df(df, self.step_name, mu=0, sigma=1, lognormal=False)
rands = reindex(pd.Series(rands, index=df.index), alts_df.index)
rands = rands*sigma + mu
else:
rands = channel.normal_for_df(df, self.step_name, mu, sigma, lognormal=False)

return rands

def lognormal_for_df(self, df, mu, sigma, broadcast=False):
    """
    Return a single floating point lognormal random number in range (0, inf) for each row in df
    using the appropriate random channel for each row.

    Subsequent calls (in the same step) will return the next rand for each df row

    The resulting array will be the same length (and order) as df
    This method is designed to support alternative selection from a probability array

    The columns in df are ignored; the index name and values are used to determine
    which random number sequence to use.

    We assume that we can identify the channel to use based on the name of df.index
    This channel should have already been registered by a call to add_channel (q.v.)

    If "true pseudo random" behavior is desired (i.e. NOT repeatable) the set_base_seed
    method (q.v.) may be used to globally reseed all random streams.

    Parameters
    ----------
    df : pandas.DataFrame, Series, or Index
        df with index name and values corresponding to a registered channel
    mu : float or array of floats with one value per df row
        mean of the underlying normal distribution (not of the lognormal itself)
    sigma : float or array of floats with one value per df row
        standard deviation of the underlying normal distribution
    broadcast : bool, default False
        if True, df may contain duplicate index values; one standard-normal draw
        is made per unique index value, broadcast to all rows sharing that index,
        then scaled by sigma/mu and exponentiated

    Returns
    -------
    rands : 1-D ndarray the same length as df (or Series with same index as df)
        a single float in lognormal distribution for each row in df
    """

    if broadcast:
        # mu and sigma parameterize the underlying normal distribution the
        # lognormal is derived from, so exponentiating a scaled normal draw
        # yields the desired lognormal variate.
        rands = self.normal_for_df(df, mu=mu, sigma=sigma, broadcast=True)
        rands = np.exp(rands)
    else:
        channel = self.get_channel_for_df(df)
        rands = channel.normal_for_df(df, self.step_name, mu=mu, sigma=sigma, lognormal=True)

    return rands

def choice_for_df(self, df, a, size, replace):
Expand Down
2 changes: 1 addition & 1 deletion docs/howitworks.rst
Expand Up @@ -390,7 +390,7 @@ logsums settings and expression files. The resulting logsums are added to the c
tour_purpose,
logsum_settings, model_settings,
skim_dict, skim_stack,
chunk_size, trace_hh_id,
chunk_size,
trace_label)

location_sample_df['mode_choice_logsum'] = logsums
Expand Down

0 comments on commit 7b0c332

Please sign in to comment.