# Comparing the efficiency of certain functions on DataFrame and Array

In [1]:
import pandas as pd
import numpy as np

# 1. For-traversal

In [9]:
# 100 x 1000 matrix of ones used by the traversal benchmarks; `array` is a
# reshaped view of the flat buffer `a`.
a = np.ones(100 * 1000)
array = np.reshape(a, (100, 1000))

In [10]:
type(array)

numpy.ndarray

In [11]:
df = pd.DataFrame(array)

In [12]:
type(df)

pandas.core.frame.DataFrame

In [17]:
def df_circle_loc():
    """Sum every cell of the global ``df`` using label-based ``.loc`` lookups.

    The cell-by-cell double loop is deliberate: the per-element ``.loc``
    access is the operation this benchmark times.

    Returns:
        The sum of all cells (``0`` if the frame has no cells).
    """
    total = 0
    for row_label in df.index:
        for col_label in df.columns:
            total = total + df.loc[row_label, col_label]
    return total

In [18]:
%timeit df_circle_loc()

1.03 s ± 53.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
def df_circle_iloc():
    """Sum every cell of the global ``df`` using positional ``.iloc`` lookups.

    Loop bounds are taken from ``df.shape`` instead of a fixed 100 x 1000, so
    the function does not raise ``IndexError`` when ``df`` is bound to a frame
    of a different shape. The cell-by-cell double loop is deliberate: the
    per-element ``.iloc`` access is the operation this benchmark times.

    Returns:
        The sum of all cells (``0`` if the frame has no cells).
    """
    n_rows, n_cols = df.shape
    total = 0
    for i in range(n_rows):
        for j in range(n_cols):
            total += df.iloc[i, j]
    return total

In [22]:
%timeit df_circle_iloc()

1.17 s ± 55.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In this run, element-wise access with `df.iloc` was slightly slower than with `df.loc` (about 1.17 s vs. 1.03 s).

In [23]:
def array_circle():
    """Sum every element of the global 2-D ``array`` with scalar indexing.

    Loop bounds are taken from ``array.shape`` instead of a fixed 100 x 1000,
    so the function does not raise ``IndexError`` when ``array`` is bound to a
    matrix of a different shape. The element-by-element double loop is
    deliberate: the per-element ``array[i, j]`` access is what is being timed.

    Returns:
        The sum of all elements (``0`` if the array has no elements).
    """
    n_rows, n_cols = array.shape
    total = 0
    for i in range(n_rows):
        for j in range(n_cols):
            total += array[i, j]
    return total

In [24]:
%timeit array_circle()

34.4 ms ± 2.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Traversing the array element by element is far more efficient than traversing the DataFrame (about 34 ms vs. about 1 s).

In [27]:
def array_circle_transform():
    """Sum every cell of the global ``df`` after converting it to an array.

    The frame's values are extracted once with ``df.values`` and the traversal
    then runs on that array with scalar indexing, so the timing covers the
    conversion plus an array traversal. Loop bounds are taken from the
    extracted array's shape instead of a fixed 100 x 1000, so the function
    does not raise ``IndexError`` for a frame of a different shape.

    Returns:
        The sum of all cells (``0`` if the frame has no cells).
    """
    arr = df.values
    n_rows, n_cols = arr.shape
    total = 0
    for i in range(n_rows):
        for j in range(n_cols):
            total += arr[i, j]
    return total

In [28]:
%timeit array_circle_transform()

34.2 ms ± 1.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
def transform_array():
    """Build a DataFrame from the global ``array`` and discard it.

    Exists only so the array-to-DataFrame conversion can be timed; returns
    ``None``.
    """
    _frame = pd.DataFrame(array)

In [30]:
def transform_dataframe():
    """Read the global ``df``'s ``.values`` array and discard it.

    Exists only so the DataFrame-to-array conversion can be timed; returns
    ``None``.
    """
    _values = df.values

In [31]:
%timeit transform_array()

101 µs ± 7.63 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [32]:
%timeit transform_dataframe()

6.36 µs ± 338 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Converting between a DataFrame and an array takes little time in either direction, but converting a DataFrame to an array (about 6 µs) is much cheaper than converting an array to a DataFrame (about 101 µs).

# 2. Index

In [39]:
# 1000 x 100 integer matrix whose column 0 holds 0..999, so each row can be
# looked up by the value in its first column. np.arange builds the sequence
# directly instead of iterating a Python range element by element.
array = np.arange(100000).reshape(100, 1000).T

In [40]:
df = pd.DataFrame(array)

In [42]:
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,1000,2000,3000,4000,5000,6000,7000,8000,9000,...,90000,91000,92000,93000,94000,95000,96000,97000,98000,99000
1,1,1001,2001,3001,4001,5001,6001,7001,8001,9001,...,90001,91001,92001,93001,94001,95001,96001,97001,98001,99001
2,2,1002,2002,3002,4002,5002,6002,7002,8002,9002,...,90002,91002,92002,93002,94002,95002,96002,97002,98002,99002
3,3,1003,2003,3003,4003,5003,6003,7003,8003,9003,...,90003,91003,92003,93003,94003,95003,96003,97003,98003,99003
4,4,1004,2004,3004,4004,5004,6004,7004,8004,9004,...,90004,91004,92004,93004,94004,95004,96004,97004,98004,99004


In [48]:
def array_index():
    """Select rows of the global ``array`` by first-column value, 1000 times.

    For each key in 0..999 a boolean mask on column 0 is built and used to
    index the array; the selected rows are discarded because only the lookup
    cost is being timed. Returns ``None``.
    """
    for key in range(1000):
        mask = array[:, 0] == key
        _rows = array[mask]

In [49]:
%timeit array_index()

7.48 ms ± 500 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [57]:
def df_index():
    """Select rows of the global ``df`` by the value in column ``0``, 1000 times.

    For each key in 0..999 a boolean mask on column ``0`` is built with
    ``.loc`` and used to index the frame with ``.loc``; the selected rows are
    discarded because only the lookup cost is being timed. Returns ``None``.
    """
    for key in range(1000):
        row_mask = df.loc[:, 0] == key
        _rows = df.loc[row_mask]

In [58]:
%timeit df_index()

588 ms ± 30.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Selecting rows with a boolean mask takes far more time on a DataFrame than on an array (about 588 ms vs. 7.48 ms for 1000 lookups).

# 3. Value assignment

In [60]:
def array_value_assignment():
    """Return a copy of the global 2-D ``array`` with every element increased by 1.

    The increment is done one element at a time on purpose: the per-element
    read-then-write is the operation this benchmark times. Loop bounds are
    taken from the copy's shape instead of a fixed 1000 x 100, so the function
    does not raise ``IndexError`` for a matrix of a different shape. The
    global ``array`` itself is not modified.

    Returns:
        A new array of the same shape holding ``array + 1``.
    """
    result = array.copy()
    n_rows, n_cols = result.shape
    for i in range(n_rows):
        for j in range(n_cols):
            result[i, j] = result[i, j] + 1
    return result

In [64]:
%timeit array_copy = array_value_assignment()

68 ms ± 4.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [65]:
def df_value_assignment():
    """Return a copy of the global ``df`` with every cell increased by 1.

    The increment is done one cell at a time through ``.iloc`` on purpose: the
    per-cell read-then-write is the operation this benchmark times. Loop
    bounds are taken from the copy's shape instead of a fixed 1000 x 100, so
    the function does not raise ``IndexError`` for a frame of a different
    shape. The global ``df`` itself is not modified.

    Returns:
        A new DataFrame of the same shape holding ``df + 1``.
    """
    result = df.copy()
    n_rows, n_cols = result.shape
    for i in range(n_rows):
        for j in range(n_cols):
            result.iloc[i, j] = result.iloc[i, j] + 1
    return result

In [66]:
%timeit df_copy = df_value_assignment()

32 s ± 6.96 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


Element-wise value assignment (an index lookup plus a write) takes far, far more time on a DataFrame than on an array (about 32 s vs. 68 ms).