# Dealing with Data Frames

## A. Using Pandas

In [1]:
import pandas as pd
import time

In [2]:
# Generate a large two-column data frame and write it to disk as CSV.
# N_ROWS is the single source of truth for the size, so both columns are
# guaranteed to have the same length:
#   column A holds 1 .. N_ROWS
#   column B holds N_ROWS + 1 .. 2 * N_ROWS
N_ROWS = 20_000_000
df = pd.DataFrame({
    'A': range(1, N_ROWS + 1),
    'B': range(N_ROWS + 1, 2 * N_ROWS + 1),
})
# index=False: do not write the row index as an extra CSV column.
df.to_csv('large_data_frame.csv', index=False)

In [2]:
# Time pandas: read the whole CSV from disk, then sum column 'A'.
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock meant for measuring elapsed intervals;
# time.time() is wall-clock and can jump if the system clock is adjusted.
start_time = time.perf_counter()
df = pd.read_csv('large_data_frame.csv')
result_pandas = df['A'].sum()
end_time = time.perf_counter()

# Print out the result and timing
print(f"Pandas result: {result_pandas}")
print(f"Pandas time: {end_time-start_time} seconds")

Pandas result: 200000010000000
Pandas time: 2.807502508163452 seconds


## B. Using Dask DataFrames

In [3]:
import dask.dataframe as dd

In [4]:
# Time Dask: open the CSV as a partitioned Dask DataFrame and sum column 'A'.
# The partitioned read and the sum are executed by .compute(), so both fall
# inside the timed region.
# time.perf_counter() is a monotonic clock intended for interval timing.
start_time = time.perf_counter()
# blocksize is the number of bytes per partition (20 MB here), given as an int.
ddf = dd.read_csv('large_data_frame.csv', blocksize=20_000_000)
# Stored under its own name so the pandas result is not overwritten and the
# two can still be compared afterwards.
result_dask_sum = ddf['A'].sum().compute()
end_time = time.perf_counter()

# Print out the result and timing
print(f"Dask result: {result_dask_sum}")
print(f"Dask time: {end_time-start_time} seconds")

Dask result: 200000010000000
Dask time: 1.1678805351257324 seconds


In [5]:
# Display the Dask DataFrame's structure (partition count and column dtypes);
# the repr shows placeholders rather than the data itself.
ddf

Unnamed: 0_level_0,A,B
npartitions=17,Unnamed: 1_level_1,Unnamed: 2_level_1
,int64,int64
,...,...
...,...,...
,...,...
,...,...


# Dealing with large arrays

## A. Using NumPy

In [6]:
import numpy as np

In [7]:
# Generate a large array of uniform random floats in [0, 1).
# A fixed seed makes the array, and therefore the printed mean, reproducible
# across kernel restarts.
SEED = 42
data_size = 100_000_000
numpy_array = np.random.default_rng(SEED).random(data_size)

# Timing NumPy computation. Array generation is deliberately outside the
# timed region; only the mean is measured. time.perf_counter() is a
# monotonic, high-resolution clock suited to sub-second intervals.
start_time = time.perf_counter()
result_numpy = np.mean(numpy_array)
end_time = time.perf_counter()

# Print out the results
print(f"NumPy result: {result_numpy}")
print(f"NumPy time: {end_time - start_time} seconds")

NumPy result: 0.4999988994707882
NumPy time: 0.16039705276489258 seconds


## B. Using Dask arrays

In [8]:
import dask.array as da

In [10]:
# Wrap the in-memory NumPy array as a Dask array. Each chunk holds
# len(numpy_array) // N_CHUNKS elements, which gives exactly N_CHUNKS chunks
# when the length divides evenly.
N_CHUNKS = 8
dask_array = da.from_array(numpy_array, chunks=len(numpy_array) // N_CHUNKS)

# Timing Dask computation. Creating the Dask array above is outside the timed
# region; building and computing the mean is what gets measured.
# time.perf_counter() is a monotonic, high-resolution clock suited to the
# sub-second interval being measured.
start_time = time.perf_counter()
result_dask = da.mean(dask_array).compute()
end_time = time.perf_counter()

print(f"Dask result: {result_dask}")
print(f"Dask time: {end_time - start_time} seconds")

Dask result: 0.4999988994707872
Dask time: 0.02246546745300293 seconds


In [11]:
# Display the Dask array's metadata summary (total/chunk bytes, shape,
# chunk layout and dtype) rather than its values.
dask_array

Unnamed: 0,Array,Chunk
Bytes,762.94 MiB,95.37 MiB
Shape,"(100000000,)","(12500000,)"
Dask graph,8 chunks in 1 graph layer,8 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 762.94 MiB 95.37 MiB Shape (100000000,) (12500000,) Dask graph 8 chunks in 1 graph layer Data type float64 numpy.ndarray",100000000  1,

Unnamed: 0,Array,Chunk
Bytes,762.94 MiB,95.37 MiB
Shape,"(100000000,)","(12500000,)"
Dask graph,8 chunks in 1 graph layer,8 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
