In [None]:
#%config Completer.use_jedi = False

# **Chapter 4**
## *Persisting Time Series Data to Files*

This chapter covers different options and use cases to store time series data for later retrieval. The techniques will cover various methods and file types, whether on-premises or in the cloud. In addition, this chapter covers serialization, compression, overwriting, or appending to files. 

We will cover the following recipes on how to persist a pandas DataFrame:
* Time series data serialization with pickle
* Writing to CSV and other delimited files
* Writing data to an Excel file
* Storing data to a private S3 bucket
* Writing large datasets

# Recipe 1: Serializing time series data with pickle
* This recipe explains the concept of data serialization 
* It demonstrates how pickling works

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
pd.__version__

In [None]:
file = \
Path('../../datasets/Ch4/time_series_covid19_confirmed_global.csv')
df = pd.read_csv(file)
df.head()

In [None]:
df.info()

In [None]:
# filter data where Country is United States
df_usa = df[df['Country/Region'] == 'US']
# filter columns from June to end of September
df_usa_summer = df_usa.loc[:, '6/1/21':'9/30/21']
# display the slice; it is still in wide format (one column per date label)
df_usa_summer

In [None]:
# Reshape from wide (one column per day) to long (one row per day);
# the former column labels land in 'date', which becomes the index.
df_usa_summer_unpivoted = (
    df_usa_summer
    .melt(value_vars=df_usa_summer.columns,
          var_name='date',
          value_name='cases')
    .set_index('date')
)

# The index still holds m/d/yy strings; convert it to a DatetimeIndex.
df_usa_summer_unpivoted.index = pd.to_datetime(
    df_usa_summer_unpivoted.index, format="%m/%d/%y"
)

df_usa_summer_unpivoted.info()

In [None]:
# df_usa_summer_unpivoted.index = \
# pd.to_datetime(df_usa_summer_unpivoted.index, format="%m/%d/%y")

# df_usa_summer_unpivoted.info()

In [None]:
print(df_usa_summer_unpivoted.head())

In [None]:
output =\
Path('../../datasets/Ch4/covid_usa_summer_2021.pkl')

df_usa_summer_unpivoted.to_pickle(output)

In [None]:
unpickled_df = pd.read_pickle(output)
unpickled_df.info()

## Writing a Pickle file using the Pickle library

In [None]:
import pickle

In [None]:
file_path = \
Path('../../datasets/Ch4/covid_usa_summer_2021_v2.pkl')

with open(file_path, "wb") as file:
    pickle.dump(df_usa_summer_unpivoted, file)

In [None]:
with open(file_path, "rb") as file:
    df = pickle.load(file)
type(df)

In [None]:
with open(output, "wb") as file:
    pickle.dump(df_usa_summer_unpivoted, file)

In [None]:
with open(output, "rb") as file:
    df = pickle.load(file)

In [None]:
zip_output =\
Path('../../datasets/Ch4/covid_usa_summer_2021.zip')

# write the Dataframe
df_usa_summer_unpivoted.to_pickle(zip_output)

In [None]:
# Read the DataFrame
pd.read_pickle(zip_output)

#### Check serialization protocols

In [None]:
pickle.HIGHEST_PROTOCOL

In [None]:
with open(output, "wb") as file:
    pickle.dump(df_usa_summer_unpivoted,
                file,
                pickle.HIGHEST_PROTOCOL)

# this is equivalent to the following
with open(output, "wb") as file:
    pickle.dump(df_usa_summer_unpivoted,
                file,
                5)

## There is more

In [None]:
def covid_by_country(file, days, country):
    """Return the per-day case totals of one country for the latest ``days`` days.

    Parameters
    ----------
    file : str or path-like
        CSV file passed to ``pd.read_csv``. It must contain a
        ``Country/Region`` column, and its trailing columns must be
        dates labelled ``m/d/yy``.
    days : int
        Number of trailing (most recent) columns to keep; must be >= 1.
    country : str
        Value matched exactly against the ``Country/Region`` column.

    Returns
    -------
    pandas.Series
        Column-wise sum over every row of the country, indexed by a
        DatetimeIndex parsed from the column labels.

    Raises
    ------
    ValueError
        If ``days`` is less than 1.
    """
    # -0 == 0, so days=0 would slice *every* column (text columns included)
    # and fail later with an unrelated date-parsing error.
    if days < 1:
        raise ValueError(f"days must be a positive integer, got {days!r}")

    ts = pd.read_csv(file)
    ts = ts[ts['Country/Region'] == country]
    # a country can span several rows (one per province); add them up per day
    final = ts.iloc[:, -days:].sum()
    final.index = pd.to_datetime(final.index,
                                 format="%m/%d/%y")
    return final

In [None]:
file = \
Path('../../datasets/Ch4/time_series_covid19_confirmed_global.csv')

# request 120 days so the data matches what the variable name promises
us_past_120_days = covid_by_country(file, 120, 'US')
us_past_120_days

In [None]:
# Keep the returned Axes so the figure can be retrieved (and pickled) later.
plot_example = us_past_120_days.plot(
    title='COVID confirmed case for US',
    xlabel='Date',
    ylabel='Number of Confirmed Cases',
)

In [None]:
fig = plot_example.get_figure()

In [None]:
from datetime import datetime

# Bundle heterogeneous Python objects in one dict and pickle them together.
# NOTE: pickle stores a function by reference (module + qualified name), not
# its code, so 'function' can only be unpickled where covid_by_country is
# defined or importable.
metadata = {
    'date': datetime.now(),
    # the stray closing quote after "University" has been removed
    'data': '''
        COVID-19 Data Repository by the 
        Center for Systems Science and Engineering (CSSE) 
        at Johns Hopkins University
        ''',
    'author': 'Tarek Atwan',
    'version': 1.0,
    'function': covid_by_country,
    'example_df' : us_past_120_days,
    'example_plot': plot_example
}

file_path = Path('../../datasets/Ch4/covid_data.pkl')

with open(file_path, 'wb') as file:
    pickle.dump(metadata, file)

In [None]:
with open(file_path, 'rb') as file:
    content = pickle.load(file)
content.keys()

In [None]:
file_path =\
Path('../../datasets/Ch4/time_series_covid19_confirmed_global.csv')

loaded_func = content['function']
loaded_func(file_path, 120, 'China').tail()

In [None]:
loaded_df = content['example_df']
loaded_df.tail()

In [None]:
loaded_plot = content['example_plot']
loaded_plot.get_figure()

In [None]:
pickle.HIGHEST_PROTOCOL

# Recipe 2: Writing as CSV and other delimited files

In [None]:
# Define the path here: this cell runs before the one that first assigns
# `filepath`, so a fresh kernel would otherwise stop with a NameError.
filepath = Path('../../datasets/Ch4/movieboxoffice.csv')
pd.read_csv(filepath)

In [None]:
import pandas as pd
from pathlib import Path
filepath = Path('../../datasets/Ch4/movieboxoffice.csv')

movies = pd.read_csv(filepath,
                 header=0,
                 parse_dates=[0],
                 index_col=0,
                 usecols=['Date',
                          'Daily'],
                date_format="%d-%b-%y")
movies.info()

In [None]:
movies.head()

In [None]:
output = Path('../../datasets/Ch4/df_movies.csv')
movies.to_csv(output)

In [None]:
output = Path('../../datasets/Ch4/piped_df_movies.csv')
movies.to_csv(output, sep='|')

In [None]:
movies_df = pd.read_csv(output, sep='|')

In [None]:
movies_df.info()


### Special cases when using `.to_csv()`

In [None]:
import pandas as pd
person = pd.DataFrame({
     'name': ['Bond, James', 'Smith, James', 'Bacon, Kevin'],
     'location': ['Los Angeles, CA', 'Phoenix, AZ', 'New York, NY'],
     'net_worth': [10000, 9000, 8000]
    })

print(person)

In [None]:
person.to_csv('person_a.csv', index=False)

In [None]:
pd.read_csv('person_a.csv')

In [None]:
import csv

# One output file per csv quoting mode; QUOTE_NONE also needs an escape
# character because the data itself contains the delimiter.
quoting_variants = {
    'person_b.csv': {'quoting': csv.QUOTE_ALL},
    'person_c.csv': {'quoting': csv.QUOTE_MINIMAL},
    'person_d.csv': {'quoting': csv.QUOTE_NONNUMERIC},
    'person_e.csv': {'quoting': csv.QUOTE_NONE, 'escapechar': '\t'},
}

for csv_name, quoting_kwargs in quoting_variants.items():
    person.to_csv(csv_name, index=False, **quoting_kwargs)

# Recipe 3: Writing data to an Excel file

In the *Reading data from an Excel file* recipe in **Chapter 2**, *Reading Time Series Data from Files*, you were instructed to install `openpyxl` for the read engine. For this recipe, you will be using the same openpyxl for the write engine.  

* To install `openpyxl` using `conda`, run the following:

```
conda install openpyxl
```
* You can also use `pip`:

```
pip install openpyxl
```

In [None]:
import openpyxl
# import xlsxwriter

In [None]:
print(openpyxl.__version__)
# print(xlsxwriter.__version__)

In [None]:
# Preparing for the recipe
import pandas as pd
from pathlib import Path

filepath = Path('../../datasets/Ch4/movieboxoffice.csv')

movies = pd.read_csv(filepath,
                 header=0,
                 parse_dates=[0],
                 index_col=0,
                 usecols=['Date',
                          'Daily'],
                date_format="%d-%b-%y")


In [None]:
output = \
Path('../../datasets/Ch4/daily_boxoffice.xlsx')

movies.to_excel(output,
               sheet_name='movies_data',
               engine='openpyxl', # default engine for xlsx files
               index=True)


In [None]:
movies.info()

In [None]:
pd.read_excel(output).head()

In [None]:
from openpyxl import load_workbook
from openpyxl.styles import NamedStyle

# `ws` was never defined in this notebook: open the workbook written above
# and pick the sheet that was created by movies.to_excel(...).
workbook = load_workbook(output)
ws = workbook['movies_data']

# In Excel format codes seconds are SS (the original repeated MM).
date_style = NamedStyle(name='datetime', number_format='DD/MM/YYYY HH:MM:SS')
ws['A1'].style = date_style
# The style only exists in memory here; nothing is written back to disk
# unless workbook.save(...) is called.

In [None]:
#!conda install conda-forge::xlsxwriter -y

# this is a fix for the OpenPyXL issue
date_col = 'Date'
with pd.ExcelWriter(output,
                    engine='openpyxl',
                    mode='a',
                    if_sheet_exists='replace') as writer:
    movies.to_excel(writer, sheet_name='movies_fixed_dates', index=True)
    worksheet = writer.sheets['movies_fixed_dates']

    # Set the number format on the data cells (row 2 onward, below the
    # header) of every column whose header cell equals date_col.
    for column_cells in worksheet.iter_cols():
        header = column_cells[0]
        if header.value != date_col:
            continue
        for row in range(2, worksheet.max_row + 1):
            cell = worksheet.cell(row, header.column)
            cell.number_format = 'MM-DD-YYYY'

In [None]:
# with pd.ExcelWriter(output,  
#                     engine='openpyxl',
#                     mode='a',
#                     date_format='%m-%d-%Y',
#                     # datetime_format='MM-DD-YYYY',
#                     if_sheet_exists='replace') as writer:
    
#     # wr = writer
    
#     movies.to_excel(writer, sheet_name='movies_fixed_dates', index=True)

In [None]:
# # wr.datetime_format
# # pd.ExcelWriter.engine
# date_format = 'MM-DD-YYYY'

# with pd.ExcelWriter(output,  
#                     engine='openpyxl',
#                     date_format = 'MM-DD-YYYY',
#                    datetime_format = 'MM-DD-YYYY') as writer:
#     # writer.datetime_format = date_format
#     movies.to_excel(writer, sheet_name='movies_fixed_dates')

In [None]:
with pd.ExcelWriter(output,  
                    engine='openpyxl',
                    mode='a',
                   if_sheet_exists='new') as writer:
 
    movies.to_excel(writer, sheet_name='movies_fixed_dates')

In [None]:
movies['Month'] = movies.index.month_name()
movies.head()

In [None]:
from openpyxl.utils.dataframe import dataframe_to_rows

### Split the DataFrame into multiple sheets by Month

In [None]:
def sheet_date_format(sheet_name, writer, date_col):
    """Apply the 'MM-DD-YYYY' number format to a worksheet's date column.

    Looks up ``writer.sheets[sheet_name]`` and, for every column whose
    first-row (header) cell value equals ``date_col``, sets ``number_format``
    on the cells from row 2 through the sheet's last row. The header cell
    itself is left untouched. Returns None.
    """
    ws = writer.sheets[sheet_name]

    for column_cells in ws.iter_cols():
        head_cell = column_cells[0]
        if head_cell.value != date_col:
            continue
        # rows are 1-based; row 1 is the header, so start at 2
        for row_idx in range(2, ws.max_row + 1):
            target = ws.cell(row_idx, head_cell.column)
            target.number_format = 'MM-DD-YYYY'

In [None]:
movies['Month'] = movies.index.month_name()

output = Path('../../datasets/Ch4/boxoffice_by_month.xlsx')

# One worksheet per month name. groupby yields groups sorted by key, so the
# sheets are created in alphabetical (not calendar) month order.
with pd.ExcelWriter(output, engine='openpyxl') as writer:
    for month_name, month_rows in movies.groupby('Month'):
        month_rows.to_excel(writer, sheet_name=month_name)
        sheet_date_format(month_name, writer, date_col='Date')


# Recipe 4: Storing Data to S3
In this recipe, you will explore writing to AWS S3 using pandas and another approach using the AWS Python SDK. The pandas approach can be used to write files to other cloud storage locations, such as Azure or Google Cloud.

In the *Reading data from a URL* recipe in **Chapter 2**, *Reading Time Series Data from Files*, you were instructed to install boto3 and s3fs in order to read from AWS S3 buckets. In this recipe, you will be leveraging the same libraries.

* To install using pip, you can use this:

```
pip install boto3 s3fs
```

* To install using conda, you can use this:

```
conda install boto3 s3fs -y
```

In [None]:
# !conda install -c conda-forge azure-storage-blob azure-identity -y

In [None]:
# !conda install -c conda-forge adlfs gcsfs -y

In [None]:
#!conda install boto3 s3fs google-cloud-storage gcsfs -y
# from google.cloud import storage
# storage_client = storage.Client(project=GCP_PROJECTID, credentials=GCP_API_KEY)
# bucket = storage_client.bucket('tscookbook')

In [None]:
import configparser
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, which would only
# surface later as a confusing KeyError: 'AWS'. Opening the file ourselves
# raises FileNotFoundError right away when cloud.cfg is missing.
with open('cloud.cfg') as cfg_file:
    config.read_file(cfg_file)

# Credentials stay in cloud.cfg so they never appear in the notebook itself.
AWS_ACCESS_KEY = config['AWS']['aws_access_key']
AWS_SECRET_KEY = config['AWS']['aws_secret_key']
AZURE_ACCOUNT_KEY = config['AZURE']['storage_account_key']
GCP_KEY_FILE = config['GCP']['key_file_path']


In [None]:
import pandas as pd
from pathlib import Path

source = "../../datasets/Ch4/boxoffice_by_month.xlsx"
movies = pd.concat(pd.read_excel(source,
             sheet_name=None,
             index_col='Date',
             parse_dates=True)).droplevel(0)

In [None]:
print(movies.head())
movies.info()

In [None]:
# import pandas as pd
# from google.cloud import storage

# # Replace with the path to your service account key
# key_file_path = 'valiant-cycle-431419-h2-c5bc48019533.json'

# # Authenticate using the service account key
# storage_client = storage.Client.from_service_account_json(key_file_path)

# # Rest of your code to read the CSV
# bucket_name = 'tscookbook'
# file_path = 'my_movies.csv'

# blob = storage_client.bucket(bucket_name).blob(file_path)
# blob.download_to_filename('temp.csv')

# df = pd.read_csv('temp.csv')

# # Delete the temporary file
# import os
# os.remove('temp.csv')

In [None]:
# Writing to Amazon S3
aws_options = {'key': AWS_ACCESS_KEY, 'secret': AWS_SECRET_KEY}

movies.to_csv('s3://tscookbook-private/movies_s3.csv',
              storage_options=aws_options)
movies.to_excel('s3://tscookbook-private/movies_s3.xlsx',
                storage_options=aws_options)

# Writing to Google Cloud Storage
gcp_options = {'token': GCP_KEY_FILE}

movies.to_csv('gs://tscookbook/movies_gs.csv',
              storage_options=gcp_options)
movies.to_excel('gs://tscookbook/movies_gs.xlsx',
                storage_options=gcp_options)

# Writing to Azure Blob Storage
# The first two URLs carry the storage account in the host part, so only the
# key is passed; the short az://container/path form names the account via
# storage_options instead.
azure_key_only = {'account_key': AZURE_ACCOUNT_KEY}

movies.to_csv("abfs://objects@tscookbook.dfs.core.windows.net/movies_abfs.csv",
              storage_options=azure_key_only)
movies.to_csv("az://objects@tscookbook.dfs.core.windows.net/movies_az.csv",
              storage_options=azure_key_only)
movies.to_csv("az://objects/movies_az2.csv",
              storage_options={'account_name': "tscookbook",
                               'account_key': AZURE_ACCOUNT_KEY})


## There is more
### Using `boto3`, `google.cloud`, and `azure.storage.blob`

In [None]:
# import boto3
# from io import StringIO

# bucket = "tscookbook-private"
# s3_client = boto3.resource("s3",
#             aws_access_key_id = AWS_ACCESS_KEY,
#             aws_secret_access_key = AWS_SECRET_KEY)

# with StringIO() as in_memory_buffer:
#     movies.to_csv(in_memory_buffer)
#     response = s3_client.Object(bucket, 'new_df.csv').put(Body=in_memory_buffer.getvalue())
#     status = response['ResponseMetadata']['HTTPStatusCode']
#     if status == 200:
#         print('Successful Write')
#     else:
#         print('Unsucessful Write - ', status)

# With no path/buffer argument, to_csv returns the CSV text as a str;
# the SDK cells below upload this in-memory string.
data = movies.to_csv(encoding='utf-8', index=True)
    

In [None]:
import boto3

bucket = "tscookbook-private"
aws_credentials = {
    'aws_access_key_id': AWS_ACCESS_KEY,
    'aws_secret_access_key': AWS_SECRET_KEY,
}

# Using the Resource API: address the object, then put the CSV text into it
s3_resource = boto3.resource("s3", **aws_credentials)
s3_resource.Object(bucket, 'movies_boto3_resourceapi.csv').put(Body=data)

# Using the Client API: a single put_object call names bucket and key
s3_client = boto3.client("s3", **aws_credentials)
s3_client.put_object(Body=data, Bucket=bucket, Key='movies_boto3_clientapi.csv')


In [None]:
from google.cloud import storage

bucket_name = 'tscookbook'
file_path = 'movies_gsapi.csv'

# Build a client from the service account key file
storage_client = storage.Client.from_service_account_json(GCP_KEY_FILE)

# Address the target object inside the bucket, then upload the CSV text
blob = storage_client.bucket(bucket_name).blob(file_path)
blob.upload_from_string(data)




In [None]:
type(storage_client)

In [None]:
from azure.storage.blob import BlobServiceClient

blob_service_client = BlobServiceClient(
        account_url="https://tscookbook.blob.core.windows.net",
        credential=AZURE_ACCOUNT_KEY)

blob_client = blob_service_client.get_blob_client(
    container='objects',
    blob='movies_blobapi.csv')

# upload_blob refuses to replace an existing blob by default, so re-running
# this cell would raise ResourceExistsError; overwrite=True makes the cell
# repeatable.
blob_client.upload_blob(data, overwrite=True)

# Recipe 5: Writing Large Datasets

In [1]:
# !conda install pytables -y
# ! conda install -c conda-forge fastavro -y

In [2]:
import pandas as pd
from pathlib import Path

file_path = Path('../../datasets/Ch2/yellow_tripdata_2023-01.parquet')

# The benchmark cells below write into ./formats; create it up front so the
# first write does not fail when the folder is missing.
Path('formats').mkdir(exist_ok=True)

In [3]:
df = pd.read_parquet(file_path, engine='pyarrow')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [5]:
import os
def size_in_mb(file):
    """Return the on-disk size of ``file`` in MB (1024**2 bytes), rounded to 2 decimals."""
    bytes_per_mb = 1024 ** 2
    return round(os.path.getsize(file) / bytes_per_mb, 2)

In [6]:
%%time
df.to_json('formats/yellow_tripdata.json', orient='records')
size_in_mb('formats/yellow_tripdata.json')

CPU times: user 4.61 s, sys: 488 ms, total: 5.1 s
Wall time: 5.15 s


1165.21

In [7]:
%%time
df.to_csv('formats/yellow_tripdata.csv', index=False)
size_in_mb('formats/yellow_tripdata.csv')

CPU times: user 16.6 s, sys: 451 ms, total: 17.1 s
Wall time: 17.2 s


307.04

In [8]:
%%time
df.to_orc('formats/yellow_tripdata_uncompressed.orc', 
          engine_kwargs={'compression':'uncompressed'})
size_in_mb('formats/yellow_tripdata_uncompressed.orc')

CPU times: user 966 ms, sys: 66.3 ms, total: 1.03 s
Wall time: 976 ms


319.94

In [9]:
%%time
df.to_orc('formats/yellow_tripdata_lz4.orc', 
          engine_kwargs={'compression':'lz4'})
size_in_mb('formats/yellow_tripdata_lz4.orc')

CPU times: user 973 ms, sys: 52.6 ms, total: 1.03 s
Wall time: 1 s


319.65

In [10]:
%%time
df.to_orc('formats/yellow_tripdata_zstd.orc', 
          engine_kwargs={'compression':'zstd'})
size_in_mb('formats/yellow_tripdata_zstd.orc')

CPU times: user 1.42 s, sys: 20.1 ms, total: 1.44 s
Wall time: 1.37 s


53.58

In [11]:
%%time
df.to_feather('formats/yellow_tripdata_uncompressed.feather', compression='uncompressed')
size_in_mb('formats/yellow_tripdata_uncompressed.feather')

CPU times: user 142 ms, sys: 63 ms, total: 205 ms
Wall time: 268 ms


435.84

In [12]:
%%time
df.to_feather('formats/yellow_tripdata_lz4.feather', compression='lz4')
size_in_mb('formats/yellow_tripdata_lz4.feather')

CPU times: user 611 ms, sys: 29.9 ms, total: 641 ms
Wall time: 187 ms


116.44

In [13]:
%%time
df.to_feather('formats/yellow_tripdata_zstd.feather', compression='zstd', compression_level=3)
size_in_mb('formats/yellow_tripdata_zstd.feather')

CPU times: user 940 ms, sys: 36.9 ms, total: 977 ms
Wall time: 216 ms


61.79

In [14]:
%%time
df.to_parquet('formats/yellow_tripdata_snappy.parquet', 
              compression='snappy')
size_in_mb('formats/yellow_tripdata_snappy.parquet')

CPU times: user 838 ms, sys: 30.7 ms, total: 868 ms
Wall time: 814 ms


59.89

In [15]:
%%time
df.to_parquet('formats/yellow_tripdata_lz4.parquet', 
              compression='lz4')
size_in_mb('formats/yellow_tripdata_lz4.parquet')

CPU times: user 855 ms, sys: 19.6 ms, total: 875 ms
Wall time: 795 ms


59.92

In [16]:
%%time
df.to_parquet('formats/yellow_tripdata_zstd.parquet', 
              compression='zstd')
size_in_mb('formats/yellow_tripdata_zstd.parquet')

CPU times: user 899 ms, sys: 29.4 ms, total: 928 ms
Wall time: 843 ms


48.95

In [17]:
import glob
for filepath in glob.glob('formats/*'):
    print(os.path.splitext(filepath))

('formats/yellow_tripdata_zstd', '.orc')
('formats/yellow_tripdata_lz4', '.feather')
('formats/yellow_tripdata_zstd', '.feather')
('formats/yellow_tripdata_uncompressed', '.feather')
('formats/yellow_tripdata_snappy', '.parquet')
('formats/yellow_tripdata', '.json')
('formats/yellow_tripdata_uncompressed', '.orc')
('formats/yellow_tripdata', '.csv')
('formats/yellow_tripdata_zstd', '.parquet')
('formats/yellow_tripdata_lz4', '.parquet')
('formats/yellow_tripdata_lz4', '.orc')


In [18]:
import pandas as pd
import os
import glob
import time

def measure_read_performance(folder_path):
    """Time how long pandas takes to load each supported file in a folder.

    Every entry directly inside ``folder_path`` whose extension is one of
    ``.csv``, ``.parquet``, ``.feather``, ``.orc`` or ``.json`` is read once
    with the matching ``pd.read_*`` function. Entries with any other
    extension are skipped instead of being reported with a meaningless
    timing.

    Parameters
    ----------
    folder_path : str or path-like
        Folder whose direct children are benchmarked.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename`` and ``read_time`` (seconds), sorted from fastest
        to slowest with a fresh 0..n-1 index. The frame is empty, but still
        has both columns, when no supported file is found.
    """
    readers = {
        '.csv': lambda path: pd.read_csv(path, low_memory=False),
        '.parquet': pd.read_parquet,
        '.feather': pd.read_feather,
        '.orc': pd.read_orc,
        '.json': pd.read_json,
    }

    performance_data = []
    for file_path in glob.glob(f'{folder_path}/*'):
        _, ext = os.path.splitext(file_path)
        reader = readers.get(ext)
        if reader is None:
            # not a benchmarked format: nothing was read, so record nothing
            continue

        # perf_counter is monotonic and meant for measuring short intervals
        start_time = time.perf_counter()
        reader(file_path)
        end_time = time.perf_counter()

        performance_data.append({'filename': file_path,
                                 'read_time': end_time - start_time})

    # Build the frame once, after the loop; naming the columns keeps the
    # sort below valid even when nothing was measured.
    df = pd.DataFrame(performance_data, columns=['filename', 'read_time'])

    return df.sort_values('read_time').reset_index(drop=True)

In [19]:
results =\
    measure_read_performance(folder_path='formats')

In [21]:
print(results)

                                        filename  read_time
0            formats/yellow_tripdata_lz4.parquet   0.070845
1         formats/yellow_tripdata_snappy.parquet   0.072083
2           formats/yellow_tripdata_zstd.parquet   0.078382
3            formats/yellow_tripdata_lz4.feather   0.103172
4           formats/yellow_tripdata_zstd.feather   0.103918
5   formats/yellow_tripdata_uncompressed.feather   0.116974
6               formats/yellow_tripdata_zstd.orc   0.474430
7       formats/yellow_tripdata_uncompressed.orc   0.592284
8                formats/yellow_tripdata_lz4.orc   0.613846
9                    formats/yellow_tripdata.csv   4.557402
10                  formats/yellow_tripdata.json  14.590845
