In [23]:
import pandas as pd
import numpy as np
import simulation_func as sf  # self-written simulation functions
import time

<module 'simulation_func' from '/Users/tsung_macbook/Dropbox/Ad_python/final/simulation_func.py'>

## simulation_setup

* This script sets up the environment required for our simulation
* The output is a Python dictionary saved to "simulated_values.pkl", with the following keys:
```
simulated_values.keys() == dict_keys(['simulated_time', 'simulated_distance', 'simulated_trip', 'wait_time', 'simulated_fare'])
```
**Time simulation**

- convert the buffer to a NumPy array
- avoid repetitive filtering
- tried `numba.jit`, but it is not compatible with the `interp1d` function
- multiprocessing


**Trip simulation**

- multiprocessing

**Wait time**
- convert to np.array

**Distance simulation**

- pandas groupby
- calculate everything at once with pandas apply

**Fare simulation**
- numpy broadcasting

In [158]:
# !python simulation_setup.py   # simulation setup: uncomment to regenerate "simulated_values.pkl" (took ~43 s in the run logged below)

Time took for time simulation: 11.989735126495361
Time took for trip simulation: 29.841912984848022
The R squared of the model is:  0.8705780665239197
Time took for distance simulation: 0.23627686500549316
Time took for distance & wait time simulation: 0.0191953182220459
Finish simulation setup, total time spent: 43.08929991722107


In [6]:
# Load the dictionary produced by the simulation setup step. According to the
# notes above, its keys are 'simulated_time', 'simulated_distance',
# 'simulated_trip', 'wait_time' and 'simulated_fare'.
# NOTE(review): sf.load_py presumably unpickles the file -- only load .pkl
# files from a trusted source.
simulated_values = sf.load_py("simulated_values.pkl") # load simulation setup values

In [77]:
# Run `iter_per_zone` seeded driver simulations from every starting zone and
# collect the per-run results into `sim1_df` (one row per run) plus an
# accumulated per-zone frequency vector `sim1_zone_freq`.
start_time = time.time()
seed = 0                             # base seed; incremented before every run for reproducibility
n_zones = 40                         # number of starting zones (single source of truth)
iter_per_zone = 100                  # no. of iterations for 1 starting zone
sim1_zone_freq = np.zeros(n_zones)   # buffer accumulating the zone frequency over all runs
# One row per run: [start_zone, num_of_trip, total_fare]
sim1_trips = np.zeros((n_zones * iter_per_zone, 3))
for i_start in range(n_zones):
    for j in range(iter_per_zone):
        seed += 1                    # every run gets its own seed: 1, 2, 3, ...
        sim1_result = sf.driver_sim1(simulated_values, i_start, verbose=0, seed=seed)
        # presumably "zone_freq" has one entry per zone -- it is added element-wise to the buffer
        sim1_zone_freq += sim1_result["zone_freq"]
        # row index = position of run j within the block of rows reserved for zone i_start
        sim1_trips[i_start * iter_per_zone + j, :] = [
            i_start,
            sim1_result["num_of_trip"],
            sim1_result["total_fare"],
        ]
print(f"Time spent for {iter_per_zone} trips per zone: {time.time()-start_time}")
sim1_df = pd.DataFrame(sim1_trips, columns=["start_zone", "num_of_trip", "total_fare"])
# The zone id was stored as float in the numeric array; cast it back to integer.
# Bracket indexing is used because attribute-style assignment silently creates
# an attribute instead of a column when the name does not already exist.
sim1_df["start_zone"] = sim1_df["start_zone"].astype("i")
sim1_df.head()


Time spent for 100 trips per zone: 1.2216053009033203


Unnamed: 0,start_zone,num_of_trip,total_fare
0,0,53.0,449.996712
1,0,71.0,605.584812
2,0,60.0,525.350425
3,0,61.0,501.770993
4,0,58.0,453.616456


In [79]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of every
# result column, computed separately for each starting zone.
sim1_df.groupby("start_zone").describe()

Unnamed: 0_level_0,num_of_trip,num_of_trip,num_of_trip,num_of_trip,num_of_trip,num_of_trip,num_of_trip,num_of_trip,total_fare,total_fare,total_fare,total_fare,total_fare,total_fare,total_fare,total_fare
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
start_zone,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,100.0,60.71,4.68243,50.0,58.0,60.0,64.0,74.0,100.0,494.083472,47.285553,415.990478,461.515492,490.264831,517.858814,628.855801
1,100.0,60.67,4.9687,48.0,57.0,60.0,64.0,80.0,100.0,495.948827,47.960193,383.974648,464.210243,490.097455,522.996626,668.330392
2,100.0,61.17,4.413146,51.0,59.0,61.0,64.0,73.0,100.0,501.506374,44.863997,398.347353,474.271489,500.52319,531.988378,601.915394
3,100.0,60.36,4.768415,47.0,57.0,60.0,64.0,72.0,100.0,488.787921,48.551976,397.849605,454.732621,487.686993,519.327837,627.409715
4,100.0,61.23,4.730847,50.0,58.0,61.0,64.0,72.0,100.0,479.158653,43.840605,379.44813,453.318875,474.884816,509.232842,584.749806
5,100.0,60.39,4.496677,49.0,58.0,60.5,63.0,71.0,100.0,488.624596,41.55406,390.395627,465.465616,485.298697,513.197008,585.142573
6,100.0,61.1,5.170819,49.0,58.0,60.5,65.0,73.0,100.0,496.231246,48.734611,358.570273,464.868544,491.900106,527.052157,635.825229
7,100.0,60.48,5.256031,48.0,57.0,61.0,64.0,74.0,100.0,498.028455,48.083314,389.71316,466.623034,495.636458,533.225498,608.255574
8,100.0,60.81,4.441983,52.0,58.0,60.5,63.25,73.0,100.0,492.602244,40.399941,406.026039,460.040686,493.213084,519.004699,607.319629
9,100.0,61.02,4.24973,50.0,59.0,61.0,63.0,74.0,100.0,492.101278,40.906024,407.625929,468.032032,490.392064,511.079945,606.079476


In [99]:
# TODO: add folium plot
# Tabulate the accumulated frequency per zone. Building the frame from a dict
# keeps "zone" as an integer id; stacking both arrays into one float matrix
# would coerce the ids to floats (0.0, 1.0, ...).
df = pd.DataFrame({
    "zone": np.arange(len(sim1_zone_freq)),   # ids 0..n-1, derived from the buffer length
    "frequency": sim1_zone_freq,
})
df.head()

Unnamed: 0,zone,frequency
0,0.0,3923.0
1,1.0,2634.0
2,2.0,4643.0
3,3.0,3786.0
4,4.0,1519.0
