In [13]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler
import pandas as pd
import random
import numpy as np
import timeit
from gorlib import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [14]:
# Load the workbook; `dfs` maps each table name to its DataFrame
# (the printed keys below show which tables were found).
dfs = load_file("./data/data.xlsx")
print(dfs.keys())

# Convert the 'Date' column to datetime64 in every table that has one,
# so later date-range comparisons work on real timestamps rather than raw cell values.
for key, frame in dfs.items():
    if "Date" not in frame.columns:
        continue
    frame['Date'] = pd.to_datetime(frame['Date'])

dict_keys(['Forecasted Consumption', 'Meter List', 'Rates'])


## Results 

In [15]:
# Cost summary for the meters shipped in the workbook, between the given
# start and end dates.
res = calc_costs(
    dfs['Forecasted Consumption'],
    dfs['Rates'],
    dfs['Meter List'],
    "2020-10-01",
    "2022-09-30",
)
print(res)

     Meter ID  Total Estimated Consumption (kWh)  Total Cost (Pounds)
0  1000000603                           24656.07                 8.26
1    10588707                          122376.66                28.48
2    10626610                          303294.62                88.16


In [16]:
# Build a synthetic data set: 100 random meters derived from the rates table ...
randmeters = gen_rand_meters(
    dfs['Rates'],
    mcnt=100,
)
# ... and mock consumption for exactly those meter IDs over the same date range
# that is used for the cost calculation.
randconsm = gen_mock_consn(
    randmeters["Meter ID"].tolist(),
    "2020-10-01",
    "2022-09-30",
)

In [17]:
# Same cost calculation, this time on the randomly generated meters and
# their mock consumption.
res = calc_costs(
    randconsm,
    dfs['Rates'],
    randmeters,
    "2020-10-01",
    "2022-09-30",
)
# Only preview the first ten rows; the full result has one row per meter.
print(res.head(10))

   Meter ID  Total Estimated Consumption (kWh)  Total Cost (Pounds)
0      1031                          108622.72                32.11
1      1017                          111533.77                33.45
2      1081                          115243.56                34.37
3      1029                          112660.54                38.62
4      1020                          106894.60                30.95
5      1026                          109057.60                31.57
6      1028                          110720.55                33.18
7      1089                          105243.94                30.51
8      1075                          108752.05                37.27
9      1048                          115538.20                43.26


## Profiling

In [18]:
# Number of mock meters in the baseline profiling data set. Kept in one place
# because it is both the generator argument and the divisor below; previously
# the literal 100 appeared twice and the two could silently drift apart.
PROFILE_METER_COUNT = 100

randmeters = gen_rand_meters(dfs['Rates'], mcnt=PROFILE_METER_COUNT)
randconsm  = gen_mock_consn(randmeters["Meter ID"].tolist(),"2020-10-01", "2022-09-30")

# Consumption rows generated per meter; the plotting cell multiplies meter
# counts by this to get consumption-frame row counts.
# NOTE(review): assumes gen_rand_meters returns exactly `mcnt` meters - confirm.
permtr_cnt = randconsm.shape[0] / PROFILE_METER_COUNT

In [19]:
%%timeit
res = calc_costs(randconsm, dfs['Rates'],randmeters, "2020-10-01", "2022-09-30")

1 loop, best of 5: 654 ms per loop


In [20]:
# Scale the synthetic data set up to 1000 meters for the next timing run.
randmeters = gen_rand_meters(
    dfs['Rates'],
    mcnt=1000,
)
# Mock consumption for the new meter IDs over the same date range.
randconsm = gen_mock_consn(
    randmeters["Meter ID"].tolist(),
    "2020-10-01",
    "2022-09-30",
)

In [21]:
%%timeit
res = calc_costs(randconsm, dfs['Rates'],randmeters, "2020-10-01", "2022-09-30")

1 loop, best of 5: 11.3 s per loop


In [22]:
# Setup code handed to timeit: it runs before the measurement and is not
# included in the reported time. `{mcnt}` is filled in per run with str.format,
# so every meter count gets its own freshly generated meters and consumption.
imports = '''
from gorlib import calc_costs, gen_rand_meters, gen_mock_consn, load_file
import pandas as pd
dfs = load_file("./data/data.xlsx")
for key in dfs:
    if "Date" in dfs[key].columns:
        dfs[key]['Date'] = pd.to_datetime(dfs[key]['Date'])
randmeters = gen_rand_meters(dfs['Rates'], mcnt={mcnt})
randconsm  = gen_mock_consn(randmeters["Meter ID"].tolist(),"2020-10-01", "2022-09-30")
'''
# The statement that is actually timed.
testcode= '''
calc_costs(randconsm, dfs['Rates'],randmeters, "2020-10-01", "2022-09-30")
'''

# Executions per measurement. Used both as timeit's `number` and as the
# divisor for the mean, so the two can no longer get out of sync (previously
# the literal 10 was written twice).
TIMEIT_RUNS = 10

# Meter counts to benchmark.
cntlist = [10,50,100,500,1000,2000]

# Mean seconds per calc_costs call, one entry per meter count in `cntlist`.
pftime = [
    timeit.timeit(stmt=testcode, setup=imports.format(mcnt=cnt), number=TIMEIT_RUNS) / TIMEIT_RUNS
    for cnt in cntlist
]

In [None]:
import matplotlib.pyplot as plt
import numpy.polynomial.polynomial as poly

# Consumption-frame row count corresponding to each benchmarked meter count.
rcnt = np.array(cntlist)*permtr_cnt

# Figure 1: execution time against the number of meters.
fig, ax = plt.subplots()
ax.plot(cntlist, pftime, marker="o")
ax.set_xlabel("Meter count")
ax.set_ylabel("Execution time (s)")
ax.set_title("Meter cnt vs Execution Time")
ax.grid()

# Figure 2: the same timings against the consumption row count.
fig, ax = plt.subplots()
ax.plot(rcnt, pftime, marker="o")
ax.set_xlabel("Consumption dataframe row count")
ax.set_ylabel("Execution time (s)")
ax.set_title("Row cnt vs Execution Time")
ax.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
ax.grid()

# Figure 3: measured timings overlaid with least-squares polynomial fits
# (degree 1 and 2) and a log(n) reference curve.
fig, ax = plt.subplots()
ax.plot(rcnt, pftime, marker="o", label="actual")

p1coefs = poly.polyfit(rcnt, pftime, 1)
ffit = poly.polyval(rcnt, p1coefs)
ax.plot(rcnt, ffit, "--", label="degree=1")

p2coefs = poly.polyfit(rcnt, pftime, 2)
ffit = poly.polyval(rcnt, p2coefs)
ax.plot(rcnt, ffit, "--", label="degree=2")

# NOTE(review): the log curve is the raw natural log of the row count, not
# scaled or fitted to the timings - confirm that is the intended reference.
ax.plot(rcnt, np.log(rcnt), "--", label="log(n)")

ax.legend()
ax.set_xlabel("Consumption dataframe row count")
ax.set_ylabel("Execution time (s)")
ax.set_title("Row cnt vs Execution Time with polyfit")
ax.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
ax.grid()
plt.show()

In [24]:
# Line-by-line profile of calc_meter_cost (the function selected with -f)
# while calc_costs runs over the current randconsm / randmeters.
# NOTE(review): the data profiled is whatever those globals hold when this
# cell runs, so the result depends on cell execution order.
%lprun -f calc_meter_cost calc_costs(randconsm, dfs['Rates'], randmeters, "2020-10-01", "2022-09-30")

Timer unit: 1e-06 s

Total time: 22.1592 s
File: /home/radhika/dev/consumption_analysis/gorlib.py
Function: calc_meter_cost at line 17

Line #      Hits         Time  Per Hit   % Time  Line Contents
    17                                           def calc_meter_cost(forecast, rates, mid, ezone, aq, sdate, edate):
    18                                               """
    19                                               """
    20      1000    8261625.0   8261.6     37.3      fdsel = (forecast["Meter ID"]== mid) & (forecast["Date"] >= sdate) & (forecast["Date"] < edate)
    21      1000    1728509.0   1728.5      7.8      rsel = (rates['Exit Zone'] == ezone) & (rates['Annual Quantity (Min)'] <= aq) & (rates['Annual Quantity (Max)'] > aq)
    22      1000    1033609.0   1033.6      4.7      dff = forecast.loc[fdsel]
    23      1000    1118552.0   1118.6      5.0      dff = dff.set_index('Date')
    24      1000     231119.0    231.1      1.0      dff = dff.sort_index()
    25        

In [25]:
%%timeit
res = calc_costs_opt(randconsm, dfs['Rates'], randmeters, "2020-10-01", "2022-09-30")

1 loop, best of 5: 12.1 s per loop


In [None]:
# Line-by-line profile of calc_meter_cost_opt (the function selected with -f)
# while calc_costs_opt runs over the current randconsm / randmeters.
%lprun -f calc_meter_cost_opt calc_costs_opt(randconsm, dfs['Rates'], randmeters, "2020-10-01", "2022-09-30")