In [1]:
import numpy as np
from datetime import time
from numpy.lib import recfunctions as rfn

np.set_printoptions(suppress=True, precision=3)

# Household power consumption
## Numpy implementation
[Data source](https://archive.ics.uci.edu/ml/datasets/individual+household+electric+power+consumption)

In [2]:
#if you dont have data locally 

#import data
#data.download_power_ds()

In [3]:
%%time

info = {
    'names':('Active','Reactive','Voltage','Intensity','met1', 'met2', 'met3'),
    'formats':('f4','f4','f4','f4','i4','i4','i4')}

power_data = np.genfromtxt("data/household_power_consumption.txt", delimiter=';', dtype=info,usecols=range(2,9), skip_header=True, missing_values=['?'])

power_datetime = np.loadtxt("data/household_power_consumption.txt", delimiter=';', usecols=[0,1], dtype='U', skiprows=1)

CPU times: user 36.1 s, sys: 1.41 s, total: 37.5 s
Wall time: 37.7 s


In [4]:
power_data

array([(4.216, 0.418, 234.84, 18.4, 0, 1, 17),
       (5.36 , 0.436, 233.63, 23. , 0, 1, 16),
       (5.374, 0.498, 233.29, 23. , 0, 2, 17), ...,
       (0.938, 0.   , 239.82,  3.8, 0, 0,  0),
       (0.934, 0.   , 239.7 ,  3.8, 0, 0,  0),
       (0.932, 0.   , 239.55,  3.8, 0, 0,  0)],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4')])

In [5]:
power_datetime

array([['16/12/2006', '17:24:00'],
       ['16/12/2006', '17:25:00'],
       ['16/12/2006', '17:26:00'],
       ...,
       ['26/11/2010', '21:00:00'],
       ['26/11/2010', '21:01:00'],
       ['26/11/2010', '21:02:00']], dtype='<U10')

In [6]:
%%time

def to_dt(row):
    date_split = row[0].split('/')[::-1]    
    for i in [1,2]:        
        if len(date_split[i]) < 2:
            date_split[i] = "0" + date_split[i]
    dateISO = "-".join(date_split)
    return np.datetime64(dateISO + "T" + row[1]) 

dts = np.apply_along_axis(to_dt, 1, power_datetime)

CPU times: user 21.2 s, sys: 2.78 ms, total: 21.2 s
Wall time: 21.2 s


In [7]:
power = rfn.append_fields(power_data, 'DateTime', dts, usemask=False)
power

array([(4.216, 0.418, 234.84, 18.4, 0, 1, 17, '2006-12-16T17:24:00'),
       (5.36 , 0.436, 233.63, 23. , 0, 1, 16, '2006-12-16T17:25:00'),
       (5.374, 0.498, 233.29, 23. , 0, 2, 17, '2006-12-16T17:26:00'), ...,
       (0.938, 0.   , 239.82,  3.8, 0, 0,  0, '2010-11-26T21:00:00'),
       (0.934, 0.   , 239.7 ,  3.8, 0, 0,  0, '2010-11-26T21:01:00'),
       (0.932, 0.   , 239.55,  3.8, 0, 0,  0, '2010-11-26T21:02:00')],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4'), ('DateTime', '<M8[s]')])

In [8]:
nan_mask = np.zeros((len(power)), dtype=bool)

for column in power.dtype.names:
    nan_mask = nan_mask | np.isnan(power[column])

power = power[~nan_mask]
power

array([(4.216, 0.418, 234.84, 18.4, 0, 1, 17, '2006-12-16T17:24:00'),
       (5.36 , 0.436, 233.63, 23. , 0, 1, 16, '2006-12-16T17:25:00'),
       (5.374, 0.498, 233.29, 23. , 0, 2, 17, '2006-12-16T17:26:00'), ...,
       (0.938, 0.   , 239.82,  3.8, 0, 0,  0, '2010-11-26T21:00:00'),
       (0.934, 0.   , 239.7 ,  3.8, 0, 0,  0, '2010-11-26T21:01:00'),
       (0.932, 0.   , 239.55,  3.8, 0, 0,  0, '2010-11-26T21:02:00')],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4'), ('DateTime', '<M8[s]')])

## Task 1
Select all rows where Active > 5 kW

In [9]:
%%time

power[power['Active'] > 5]

CPU times: user 9.8 ms, sys: 53 µs, total: 9.85 ms
Wall time: 8.85 ms


array([(5.36 , 0.436, 233.63, 23. , 0,  1, 16, '2006-12-16T17:25:00'),
       (5.374, 0.498, 233.29, 23. , 0,  2, 17, '2006-12-16T17:26:00'),
       (5.388, 0.502, 233.74, 23. , 0,  1, 17, '2006-12-16T17:27:00'),
       ...,
       (5.172, 0.05 , 235.18, 22. , 0, 38, 17, '2010-11-24T07:50:00'),
       (5.75 , 0.   , 234.4 , 24.6, 0, 39, 17, '2010-11-24T07:51:00'),
       (5.074, 0.24 , 238.55, 21.4, 1,  2, 18, '2010-11-25T07:21:00')],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4'), ('DateTime', '<M8[s]')])

## Task 2
Select all rows where Voltage > 235 V

In [10]:
%%time

power[power['Voltage'] > 235]

CPU times: user 25 ms, sys: 7.95 ms, total: 33 ms
Wall time: 32.7 ms


array([(3.666, 0.528, 235.68, 15.8, 0, 1, 17, '2006-12-16T17:28:00'),
       (3.52 , 0.522, 235.02, 15. , 0, 2, 17, '2006-12-16T17:29:00'),
       (3.702, 0.52 , 235.09, 15.8, 0, 1, 17, '2006-12-16T17:30:00'), ...,
       (0.938, 0.   , 239.82,  3.8, 0, 0,  0, '2010-11-26T21:00:00'),
       (0.934, 0.   , 239.7 ,  3.8, 0, 0,  0, '2010-11-26T21:01:00'),
       (0.932, 0.   , 239.55,  3.8, 0, 0,  0, '2010-11-26T21:02:00')],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4'), ('DateTime', '<M8[s]')])

## Task 3
Select all where met2 > met3 from rows where Intensity in \[19,20\]

In [11]:
%%time

condition = np.logical_and.reduce([ power['Intensity'] >= 19, power['Intensity'] <= 20, power['met2'] > power['met3'] ])

power[condition]

CPU times: user 24.7 ms, sys: 8.09 ms, total: 32.8 ms
Wall time: 30.8 ms


array([(4.464, 0.136, 234.66, 19. , 0, 37, 16, '2006-12-16T18:09:00'),
       (4.582, 0.258, 238.08, 19.6, 0, 13,  0, '2006-12-17T01:04:00'),
       (4.618, 0.104, 239.61, 19.6, 0, 27,  0, '2006-12-17T01:08:00'),
       ...,
       (4.602, 0.   , 237.08, 19.4, 0, 40, 17, '2010-11-24T07:55:00'),
       (4.536, 0.   , 237.03, 19. , 0, 39, 17, '2010-11-24T07:56:00'),
       (4.626, 0.   , 236.78, 19.4, 0, 39, 17, '2010-11-24T07:57:00')],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4'), ('DateTime', '<M8[s]')])

## Task 4
Select 500000 random unique rows. Calculate mean of all mets.

In [12]:
%%time

rnd_row_index = np.random.choice(np.arange(0,len(power)), 500000, replace=False)
power[rnd_row_index]

CPU times: user 309 ms, sys: 0 ns, total: 309 ms
Wall time: 307 ms


array([(1.388, 0.068, 240.41, 5.8, 0, 1, 19, '2008-10-18T05:05:00'),
       (0.838, 0.246, 242.8 , 3.6, 0, 0, 12, '2008-08-23T00:32:00'),
       (1.438, 0.062, 242.64, 5.8, 0, 0, 18, '2007-04-04T23:40:00'), ...,
       (0.44 , 0.046, 245.22, 2. , 0, 0,  0, '2008-11-02T06:51:00'),
       (1.458, 0.102, 241.06, 6. , 1, 0, 18, '2008-02-11T08:14:00'),
       (0.258, 0.   , 241.52, 1. , 0, 0,  1, '2010-05-02T04:52:00')],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4'), ('DateTime', '<M8[s]')])

In [13]:
def print_mean(array : np.ndarray, column : int):
    mean = np.mean(power[column])
    print(f"Column {column}: {mean}")

print_mean(power, 'met1')
print_mean(power, 'met2')
print_mean(power, 'met3')

Column met1: 1.1219233096502186
Column met2: 1.2985199679887571
Column met3: 6.45844735712055


## Task 5
From night(>=18:00) rows select all where minute consumption > 6kW

Split this subset by max met and for each select every third from 1 part and every fourth from 2 part

In [14]:
def row_time(row):    
    return row[0].time()

def get_time(arr):        
    dts = arr['DateTime'].astype('object').reshape((len(arr),1))
    return np.apply_along_axis(row_time, 1, dts)

In [15]:
%%time

condition = (get_time(power) >= time(18)) & (power['Active'] > 6)

power_night = power[condition]
power_night

CPU times: user 14.7 s, sys: 119 ms, total: 14.8 s
Wall time: 14.8 s


array([(6.052, 0.192, 232.93, 26.2,  0, 37, 17, '2006-12-16T18:05:00'),
       (6.752, 0.186, 232.12, 29. ,  0, 36, 17, '2006-12-16T18:06:00'),
       (6.474, 0.144, 231.85, 27.8,  0, 37, 16, '2006-12-16T18:07:00'),
       ...,
       (6.438, 0.308, 229.26, 28.4, 13, 39, 16, '2010-11-20T18:46:00'),
       (6.21 , 0.174, 228.82, 27.4, 21, 34, 17, '2010-11-20T18:49:00'),
       (6.364, 0.126, 229.38, 28.2, 17, 34, 16, '2010-11-20T18:52:00')],
      dtype=[('Active', '<f4'), ('Reactive', '<f4'), ('Voltage', '<f4'), ('Intensity', '<f4'), ('met1', '<i4'), ('met2', '<i4'), ('met3', '<i4'), ('DateTime', '<M8[s]')])

In [16]:
met1_max_condition = (power_night['met1'] >= power_night['met2']) & (power_night['met1'] >= power_night['met3'])
met2_max_condition = (power_night['met2'] > power_night['met1']) & (power_night['met2'] >= power_night['met3'])
met3_max_condition = (power_night['met3'] > power_night['met1']) & (power_night['met3'] > power_night['met2'])

power_met1 = power_night[met1_max_condition]
power_met2 = power_night[met2_max_condition]
power_met3 = power_night[met3_max_condition]

In [17]:
met1_first, met1_second = np.array_split(power_met1, 2)
met2_first, met2_second = np.array_split(power_met2, 2)
met3_first, met3_second = np.array_split(power_met3, 2)

In [18]:
met1_first[::3].shape

(290,)

In [19]:
met1_second[::4]

array([(7.568, 0.422, 230.46, 32.8, 36, 34, 16, '2008-02-02T19:26:00'),
       (7.988, 0.608, 229.99, 34.8, 35, 35, 16, '2008-02-02T19:33:00'),
       (8.456, 0.546, 227.48, 37.2, 35, 34, 15, '2008-02-02T19:41:00'),
       (6.058, 0.478, 230.7 , 26.2, 36,  5, 16, '2008-02-02T19:53:00'),
       (7.898, 0.298, 231.37, 34. , 71,  1, 16, '2008-02-02T20:44:00'),
       (7.764, 0.284, 229.46, 33.8, 71,  0, 16, '2008-02-02T20:50:00'),
       (6.31 , 0.246, 233.47, 27.2, 32,  0, 17, '2008-02-10T18:40:00'),
       (6.678, 0.14 , 231.01, 28.8, 36,  1, 16, '2008-02-10T18:44:00'),
       (6.478, 0.132, 228.72, 28.2, 36,  0, 16, '2008-02-10T18:48:00'),
       (6.368, 0.134, 229.6 , 27.6, 36,  1, 16, '2008-02-10T18:52:00'),
       (6.516, 0.242, 231.1 , 28.2, 36,  2, 16, '2008-02-10T18:56:00'),
       (6.582, 0.324, 232.91, 28.2, 36,  1, 17, '2008-02-10T19:00:00'),
       (6.544, 0.322, 232.59, 28.2, 37,  1, 16, '2008-02-10T19:04:00'),
       (6.726, 0.538, 232.59, 29. , 37,  2, 17, '2008-02-10T19:1

# Performance tests

In [20]:
import performance as pf
import pandas as pd

step = 100000
count = 10
performance_df = pd.DataFrame()

In [21]:
performance_df['Selection'] = pf.get_mean_execution_time_iterative(power, lambda arr: arr[arr['Active']>5], count=count, step=step)

In [22]:
performance_df['BuiltInCalculation'] = pf.get_mean_execution_time_iterative(power, lambda arr: arr['Active'].mean(), count=count, step=step)

In [23]:
import numpy.lib.recfunctions as rfn


def test_func(arr):
    rfn.apply_along_fields(np.mean, arr[['Active']])


performance_df['CustomCalculation'] = pf.get_mean_execution_time_iterative(power, test_func, count=count, step=step)

In [24]:
performance_df

Unnamed: 0,Selection,BuiltInCalculation,CustomCalculation
0,1.1e-05,3.8e-05,0.000193
1,0.000548,0.000432,0.001126
2,0.000934,0.000787,0.001955
3,0.001351,0.001096,0.002564
4,0.001786,0.001379,0.00329
5,0.002173,0.001837,0.003792
6,0.002371,0.002073,0.004413
7,0.002924,0.002131,0.004996
8,0.003167,0.002777,0.00611
9,0.003836,0.002993,0.007585


In [25]:
performance_df.to_csv("data/numpy-performance.csv", index=False)