## Data formats with Pandas and Numpy

In [1]:
import pandas as pd
import numpy as np

n_rows = 100000

# Build a small "tidy" table with one column per common dtype
# (text, timestamps, integers, floats) to use in the format examples.
fruit_names = ('apple', 'banana', 'carrot')
column_data = {
    'string': np.random.choice(fruit_names, size=n_rows),
    'timestamp': pd.date_range("20130101", periods=n_rows, freq="s"),
    'integer': np.random.choice(range(10), size=n_rows),
    'float': np.random.uniform(size=n_rows),
}
dataset = pd.DataFrame(data=column_data)

# Summarise column dtypes, non-null counts and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   string     100000 non-null  object        
 1   timestamp  100000 non-null  datetime64[ns]
 2   integer    100000 non-null  int32         
 3   float      100000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(1), object(1)
memory usage: 2.7+ MB


In [2]:
n = 1000

# Square n-by-n matrix of uniform random samples used as the example
# "array data" throughout the notebook.
array_shape = (n, n)
data_array = np.random.uniform(low=0.0, high=1.0, size=array_shape)

# Print the array's low-level layout (shape, strides, dtype, ...).
np.info(data_array)

class:  ndarray
shape:  (1000, 1000)
strides:  (8000, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x23df9bb6040
byteorder:  little
byteswap:  False
type: float64


In [3]:
import pickle

# Serialise the array to disk with pickle (binary mode is required).
with open('data_array.pickle', 'wb') as pickle_sink:
    pickle.dump(data_array, pickle_sink)

# Read it back. Only unpickle files you trust: loading a pickle can
# execute arbitrary code.
with open('data_array.pickle', 'rb') as pickle_source:
    data_array_pickle = pickle.load(pickle_source)

## Exercise 1

In [4]:
import pickle 
test = 'this is string'

# Round-trip a plain Python string through a pickle file.
with open('teststring.pickle', 'wb') as pickle_sink:
    pickle.dump(test, pickle_sink)

with open('teststring.pickle', 'rb') as pickle_source:
    teststring_pickle = pickle.load(pickle_source)

# Show both values side by side, then whether the round trip was lossless.
print(test, teststring_pickle)
round_trip_ok = test == teststring_pickle
print(round_trip_ok)

this is string this is string
True


## Storing tidy data


In [5]:
# Write the table as CSV without the row index, then read it straight back.
csv_path = 'dataset.csv'
dataset.to_csv(csv_path, index=False)
dataset_csv = pd.read_csv(csv_path)

In [6]:
# Use an explicit comma delimiter so the file really is CSV, as its
# extension promises; np.savetxt otherwise separates values with a space.
np.savetxt('data_array.csv', data_array, delimiter=',')

# loadtxt must be given the same delimiter (its default splits on whitespace).
data_array_csv = np.loadtxt('data_array.csv', delimiter=',')

In [12]:
# Round-trip the table through the Feather format.
feather_path = 'dataset.feather'
dataset.to_feather(feather_path)
dataset_feather = pd.read_feather(feather_path)

In [13]:
# Round-trip the table through the Parquet format.
parquet_path = 'dataset.parquet'
dataset.to_parquet(parquet_path)
dataset_parquet = pd.read_parquet(parquet_path)

## Exercise 2

In [14]:
import pandas as pd
import numpy as np

n_rows = 100000

# Generate each column up front (in the same order they appear in the
# table), then assemble them into the example DataFrame.
string_column = np.random.choice(('apple', 'banana', 'carrot'), size=n_rows)
timestamp_column = pd.date_range("20130101", periods=n_rows, freq="s")
integer_column = np.random.choice(range(0, 10), size=n_rows)
float_column = np.random.uniform(size=n_rows)

dataset = pd.DataFrame(
    data={
        'string': string_column,
        'timestamp': timestamp_column,
        'integer': integer_column,
        'float': float_column,
    },
)

In [15]:
# Round-trip the table through CSV.
csv_path = 'dataset.csv'
dataset.to_csv(csv_path, index=False)
dataset_csv = pd.read_csv(csv_path)

# compare() reports only the cells where the two frames disagree
# ("self" = original, "other" = re-read copy).
# NOTE(review): the float mismatches are presumably float -> text -> float
# rounding in the CSV round trip - confirm.
csv_differences = dataset.compare(dataset_csv)
print(csv_differences)

          float          
           self     other
0      0.270126  0.270126
3      0.371348  0.371348
4      0.023318  0.023318
5      0.442812  0.442812
7      0.020375  0.020375
...         ...       ...
99987  0.192471  0.192471
99990  0.957900  0.957900
99992  0.262314  0.262314
99995  0.113735  0.113735
99996  0.046833  0.046833

[36244 rows x 2 columns]


## Storing array data

In [16]:
# Round-trip a single array through NumPy's binary .npy format.
npy_path = 'data_array.npy'
np.save(npy_path, data_array)
data_array_npy = np.load(npy_path)

In [17]:
# Bundle several arrays into one .npz archive; each keyword becomes the
# name the array is stored under.
np.savez('data_arrays.npz', data_array0=data_array, data_array1=data_array)

# np.load on an .npz returns a lazy NpzFile that keeps the archive open.
# Read every member inside a with-block so the file handle is closed
# again, and keep the arrays in a plain dict with the same keys.
with np.load('data_arrays.npz') as npz_archive:
    data_arrays = {name: npz_archive[name] for name in npz_archive.files}

data_arrays['data_array0']

array([[0.27947882, 0.43673478, 0.60732861, ..., 0.4466572 , 0.58437816,
        0.44634046],
       [0.19893867, 0.37120525, 0.88987245, ..., 0.74517748, 0.94108994,
        0.4918413 ],
       [0.1270019 , 0.81074491, 0.80777276, ..., 0.50451509, 0.21498927,
        0.99134728],
       ...,
       [0.24713794, 0.63883172, 0.94980864, ..., 0.37222326, 0.30727534,
        0.08626338],
       [0.75021352, 0.71982179, 0.72187804, ..., 0.47907573, 0.80351434,
        0.13792698],
       [0.9627765 , 0.10380325, 0.44014827, ..., 0.68165828, 0.41728566,
        0.98563194]])

## HDF5 (Hierarchical Data Format version 5)

In [24]:
# Round-trip the table through HDF5 via pandas.
# Requires the optional PyTables dependency; without it pandas raises
# ImportError ("Missing optional dependency 'pytables'").
hdf5_path = 'dataset.h5'
dataset.to_hdf(hdf5_path, key='dataset', mode='w')
dataset_hdf5 = pd.read_hdf(hdf5_path)

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.

In [26]:
import h5py

# Writing:

# Open the HDF5 file in a with-block: the file is closed (and the data
# written to disk) when the block exits, even if the write raises.
with h5py.File('data_array.h5', 'w') as h5_file:
    # Write dataset
    h5_file.create_dataset('data_array', data=data_array)

# Reading:

# Open the HDF5 file again, read-only this time
with h5py.File('data_array.h5', 'r') as h5_file:
    # [()] reads the full dataset into memory, so the result stays
    # usable after the file has been closed.
    data_array_h5 = h5_file['data_array'][()]

## NetCDF4 (Network Common Data Form version 4)

In [31]:
import xarray as xr

# NOTE: engine='h5netcdf' needs the optional h5netcdf package; when it is
# missing xarray raises "ValueError: unrecognized engine h5netcdf".

# Write tidy data as NetCDF4
dataset.to_xarray().to_netcdf('dataset.nc', engine='h5netcdf')

# Read tidy data from NetCDF4. The with-block closes the file even if the
# conversion to pandas raises; the conversion must happen while it is open.
with xr.open_dataset('dataset.nc', engine='h5netcdf') as dataset_xarray:
    dataset_netcdf4 = dataset_xarray.to_pandas()

ValueError: unrecognized engine h5netcdf must be one of: ['store']

In [32]:
# NOTE: engine='h5netcdf' needs the optional h5netcdf package; when it is
# missing xarray raises "ValueError: unrecognized engine h5netcdf".

# Write array data as NetCDF4
xr.DataArray(data_array).to_netcdf('data_array.nc', engine='h5netcdf')

# Read array data from NetCDF4. The with-block closes the file even if the
# conversion to NumPy raises; the conversion must happen while it is open.
with xr.open_dataarray('data_array.nc', engine='h5netcdf') as data_array_xarray:
    data_array_netcdf4 = data_array_xarray.to_numpy()

ValueError: unrecognized engine h5netcdf must be one of: ['store']

## Exercise 3

In [33]:
n = 1000

# Fresh n-by-n matrix of uniform random samples for the exercise.
data_array = np.random.uniform(0.0, 1.0, (n, n))

In [34]:
# Round-trip the array through .npy and confirm nothing changed.
npy_path = 'data_array.npy'
np.save(npy_path, data_array)
data_array_npy = np.load(npy_path)

# Element-wise comparison reduced to a single flag.
(data_array == data_array_npy).all()

True

## JSON

In [36]:
# Round-trip the table through JSON.
json_path = 'dataset.json'
dataset.to_json(json_path)
dataset_json = pd.read_json(json_path)