In [40]:
from datetime import datetime
from timeit import timeit

import numpy as np
import pandas as pd
pd.__version__

'0.24.2'

In [8]:
# Load the raw demand profile and preview the first rows.
df = pd.read_csv('demand_profile.csv')
df.head()

Unnamed: 0,date_time,energy_kwh
0,1/1/13 0:00,0.586
1,1/1/13 1:00,0.58
2,1/1/13 2:00,0.572
3,1/1/13 3:00,0.596
4,1/1/13 4:00,0.592


In [9]:
df.dtypes

date_time      object
energy_kwh    float64
dtype: object

### Leer el CSV sin convertir fechas (línea base)

In [24]:
%%timeit
# Baseline: read the CSV with no date handling at all.
df = pd.read_csv('demand_profile.csv')

10.3 ms ± 356 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%%timeit
# Read first, then convert date_time using an explicit format string.
df = pd.read_csv('demand_profile.csv')
df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%y %H:%M')

40.6 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
%%timeit
# Same conversion with no format passed to to_datetime; the recorded run
# below is far slower than the explicit-format variant.
df = pd.read_csv('demand_profile.csv')
df['date_time'] = pd.to_datetime(df['date_time'])

1.21 s ± 68.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
%%timeit
dateparse = lambda x: pd.datetime.strptime(x, '%d/%m/%y %H:%M')
df = pd.read_csv('demand_profile.csv', parse_dates=['date_time'], date_parser=dateparse)

198 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
# Build the hour-indexed frame this section relies on: `df.index.hour` needs a
# datetime index, so parse the timestamps and index by them here instead of
# leaving the set_index step commented out.
df = pd.read_csv('demand_profile.csv')
df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%y %H:%M')
df = df.set_index('date_time')

# right=False makes the bands [0, 7), [7, 17), [17, 24), so hours 7 and 17
# open the next band, matching the np.digitize lookup used further down.
pd.cut(x=df.index.hour,
       bins=[0, 7, 17, 24],
       right=False,
       labels=[12, 20, 28]).astype(int)

array([12, 12, 12, ..., 28, 28, 28])

In [32]:
df.index.hour

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
            14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
           dtype='int64', name='date_time', length=8760)

### Manera súper rápida

In [38]:
def apply_tariff_cut(df):
    """Add a ``cost_cents`` column priced by hour-of-day tariff band.

    The hour of each row's index is binned into three half-open bands and
    mapped to a rate in cents per kWh:

    * ``[0, 7)``   -> 12
    * ``[7, 17)``  -> 20
    * ``[17, 24)`` -> 28

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.hour`` (e.g. a ``DatetimeIndex``) and
        which has an ``energy_kwh`` column. Modified in place.

    Returns
    -------
    None
        The result is written to ``df['cost_cents']``.
    """
    # right=False closes each band on the left, so hour 7 is billed at 20 and
    # hour 17 at 28 -- the same boundaries apply_tariff_digitize uses. With
    # right-closed bins those two hours fell into the cheaper band.
    cents_per_kwh = pd.cut(x=df.index.hour,
                           bins=[0, 7, 17, 24],
                           right=False,
                           labels=[12, 20, 28]).astype(int)
    df['cost_cents'] = cents_per_kwh * df['energy_kwh']

In [39]:
%timeit apply_tariff_cut(df)

3.13 ms ± 535 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


## What about using NumPy?

In [54]:
def apply_tariff_digitize(df):
    """Write ``df['cost_cents']`` using a NumPy band lookup on the index hour.

    ``np.digitize`` with edges ``[7, 17, 24]`` yields 0 for hours below 7,
    1 for hours from 7 up to (not including) 17, and 2 for hours from 17 up
    to (not including) 24. That position selects 12, 20 or 28 cents per kWh,
    which is multiplied by ``energy_kwh``. ``df`` is modified in place and
    nothing is returned.
    """
    hour_of_day = df.index.hour.values
    energy = df['energy_kwh'].values
    rate_table = np.array([12, 20, 28])
    # Band position per row, used directly as an index into the rate table.
    band = np.digitize(hour_of_day, bins=[7, 17, 24])
    df['cost_cents'] = rate_table[band] * energy

In [55]:
%timeit apply_tariff_digitize(df)

1.13 ms ± 99.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Pruebas random con map

In [None]:
# Fixed seed so the synthetic `entero` column -- and every benchmark that
# filters on it -- is the same on each run.
df['entero'] = np.random.RandomState(42).randint(1, 5000, len(df))

In [80]:
def function1(entero):
    """Return the sum of the ``entero`` values in ``df`` equal to ``entero``.

    Reads the notebook-level ``df``. Returns 0 when no row matches.

    Parameters
    ----------
    entero : int
        Value to look up in ``df['entero']``.

    Returns
    -------
    int
        Sum of the matching entries.
    """
    matches = df.loc[df['entero'] == entero, 'entero']
    # Series.sum() adds in vectorised code, whereas the builtin sum() walks the
    # Series one element at a time. int() returns a plain Python int rather
    # than a NumPy scalar.
    return int(matches.sum())
    

In [82]:
%%timeit
aux_list = []
for i in range(len(df)):
    aux_list.append(function1(i))

8.61 s ± 630 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [84]:
%%timeit
list(map(function1, range(len(df))))

8.57 s ± 530 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
%timeit [function1(i) for i in range(len(df))]

7.95 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [78]:
# NOTE(review): `aux_list1` is not assigned in any visible cell, and this cell's
# recorded output is a NameError -- define it first or drop the cell.
aux_list1

NameError: name 'aux_list1' is not defined

In [98]:
%timeit df.iloc[1:10, 2]

221 µs ± 4.25 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [89]:
%timeit df.entero[1:10]

184 µs ± 13 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [90]:
%timeit df[1:10].entero

250 µs ± 6.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [104]:
%timeit np.repeat(1,1000)

10.7 µs ± 90.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [105]:
%timeit np.array([1]*1000)

84.1 µs ± 1.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [107]:
np.repeat([1,2],4)

array([1, 1, 1, 1, 2, 2, 2, 2])

In [108]:
[1,2]*3

[1, 2, 1, 2, 1, 2]

In [109]:
np.arange(1,10, 2)

array([1, 3, 5, 7, 9])

In [112]:
# NOTE(review): bare module inspection; its saved repr includes a local user
# directory path -- clear this output before sharing the notebook.
np.random

<module 'numpy.random' from 'C:\\Users\\rrecarey\\AppData\\Local\\Continuum\\miniconda3\\lib\\site-packages\\numpy\\random\\__init__.py'>

In [139]:
# NOTE(review): `df1` is not created in any visible cell -- presumably a copy of
# df with a reset index and a different `entero` range; confirm and add the
# cell that builds it.
df1.head(3)

Unnamed: 0,date_time,energy_kwh,cost_cents,entero
0,2013-01-01 00:00:00,0.586,7.032,8
1,2013-01-01 01:00:00,0.58,6.96,4
2,2013-01-01 02:00:00,0.572,6.864,3


In [129]:
%timeit df1.iloc[1:100, [0,2]]

1.03 ms ± 60.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [138]:
%timeit df1.loc[1:100, ["energy_kwh","entero"]]

1.58 ms ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [143]:
# Show the `entero` column. The previous `df1[]` was a syntax error; the saved
# output is the Series named `entero`.
df1['entero']

0       8
1       4
2       3
3       2
4       8
5       9
6       7
7       6
8       6
9       3
10      2
11      6
12      7
13      4
14      2
15      5
16      1
17      4
18      9
19      2
20      5
21      7
22      2
23      6
24      1
25      9
26      8
27      9
28      5
29      4
       ..
8730    3
8731    9
8732    1
8733    1
8734    1
8735    8
8736    7
8737    3
8738    3
8739    5
8740    2
8741    3
8742    8
8743    5
8744    2
8745    6
8746    8
8747    9
8748    7
8749    6
8750    7
8751    4
8752    1
8753    4
8754    8
8755    8
8756    4
8757    8
8758    1
8759    1
Name: entero, Length: 8760, dtype: int32

In [147]:
# Positions where the repeated array equals 2. np.repeat already returns an
# ndarray, so it is compared directly without re-wrapping it in np.array().
np.where(np.repeat([1, 2], 100) == 2)

(array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
        139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
        152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
        165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177,
        178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
        191, 192, 193, 194, 195, 196, 197, 198, 199], dtype=int64),)