# Recipe 1: Reading from CSVs and Other Delimited Files

In [None]:
import pandas as pd
from pathlib import Path
pd.__version__

In [None]:
filepath = Path('../../datasets/Ch2/movieboxoffice.csv')

In [None]:
ts = pd.read_csv(filepath,
                 header=0,
             
                 parse_dates=[0],
                 index_col=0,
                 usecols=['Date',
                          'DOW',
                          'Daily',
                          'Forecast',
                          'Percent Diff'])
ts.head()

In [None]:
ts.info()

**Data Cleanup**

In [None]:
clean = lambda x: x.str.replace('[^\d]', '', regex=True)                                                      
c_df = ts[['Daily', 'Forecast']].apply(clean, axis=1)
ts[['Daily', 'Forecast']] = c_df.astype(float)
                                

In [None]:
ts.head()

In [None]:
ts.info()

In [None]:
ts.memory_usage()

In [None]:
ts.memory_usage().sum()

## Using `date_parser`

In [None]:
ts.dtypes

In [None]:
date_parser = lambda x: pd.to_datetime(x, format="%d-%b-%y")
ts = pd.read_csv(filepath,
                 parse_dates=[0],
                 index_col=0,
                 date_parser=date_parser,
                 usecols=[0,1,3, 7, 6])
ts.head()

In [None]:
ts = pd.read_csv(filepath,
                 header=0,
                 parse_dates=['Date'],
                 index_col=0,
                 infer_datetime_format= True,
                 usecols=['Date',
                          'DOW',
                          'Daily',
                          'Forecast',
                          'Percent Diff'])
ts.head()

# Recipe 2: Reading data from an Excel file
* Learn how to read data from Excel files using pandas 
* Learn how to read from multiple sheets into DataFrames

* To install openpyxl using conda, run the following command in the terminal:

```
conda install openpyxl
```
* To install using pip, run the following command:

```
pip install openpyxl
```

In [None]:
#!conda install openpyxl -y
#!pip install openpyxl

In [None]:
import pandas as pd
from pathlib import Path
filepath = Path('../../datasets/Ch2/sales_trx_data.xlsx')

In [None]:
import openpyxl
openpyxl.__version__

In [None]:
excelfile = pd.ExcelFile(filepath)
excelfile.sheet_names

In [None]:
excelfile.parse('2017')

In [None]:
ts = pd.read_excel(filepath,
                    engine='openpyxl',
                    index_col=1,
                    sheet_name=[0,1],
                    parse_dates=True)
ts.keys()

In [None]:
ts = pd.read_excel(filepath,
                    engine='openpyxl',
                    index_col=1,
                    sheet_name=['2017','2018'],
                    parse_dates=True)
ts.keys()

In [None]:
ts = pd.read_excel(filepath,
                    engine='openpyxl',
                    index_col=1,
                    sheet_name=None,
                    parse_dates=True)
ts.keys()

In [None]:
ts['2017'].info()

In [None]:
ts['2018'].info()

In [None]:
ts_combined = pd.concat([ts['2017'],ts['2018']])

In [None]:
ts_combined.info()

In [None]:
pd.concat(ts).index

In [None]:
ts_combined = pd.concat(ts).droplevel(level=0)
ts_combined.head()

In [None]:
ts = pd.read_excel(filepath,
                   index_col=1,
                   sheet_name='2018',
                   parse_dates=True)
type(ts)

## There is more
* Using `ExcelFile`

In [None]:
excelfile = pd.ExcelFile(filepath)
excelfile.parse(sheet_name='2017',
                index_col=1,
                parse_dates=True).head()


# Recipe 3: Reading data from a URL
* Learn how to read data as a DataFrame from GitHub
* Learn how to read data as a DataFrame from AWS S3 using `pandas`
* Learn how to read an HTML page and extract tables using `pandas`

In [None]:
#!conda install boto3 s3fs html5lib lxml -y
#!pip install boto3 s3fs html5lib lxml

* To install using pip, you can use the following command:

```
pip install boto3 s3fs lxml
```

* To install using Conda, you can use:

```
conda install boto3 s3fs lxml -y
```

In [None]:
import pandas as pd

In [None]:
import pandas as pd
import boto3, s3fs, lxml
print(f'''
pandas -> {pd.__version__}
boto3 -> {boto3.__version__}
s3fs -> {s3fs.__version__}
lxml -> {lxml.__version__}
''')

## Reading from GitHub

In [None]:
# example of produced error
url = 'https://github.com/PacktPublishing/Time-Series-Analysis-with-Python-Cookbook./blob/main/datasets/Ch2/AirQualityUCI.csv'
# pd.read_csv(url)


In [None]:
url = 'https://raw.githubusercontent.com/PacktPublishing/Time-Series-Analysis-with-Python-Cookbook/main/datasets/Ch2/AirQualityUCI.csv'
date_parser = lambda x: pd.to_datetime(x, format="%d/%m/%Y")

df = pd.read_csv(url,
                 delimiter=';',
                 index_col='Date',
                 date_parser=date_parser)

df.iloc[:3,1:4]

## Reading from Public S3 Bucket

**Path-style access**

In [None]:
url = 'https://s3.us-east-1.amazonaws.com/tscookbook/AirQualityUCI.xlsx'

df = pd.read_excel(url,
                   index_col='Date',
                   parse_dates=True)
df.head()

**Virtual-hosted–style access**

In [None]:
url = 'https://tscookbook.s3.amazonaws.com/AirQualityUCI.xlsx'
df = pd.read_excel(url,
                   index_col='Date',
                   parse_dates=True)
df.head()

**Accessing a bucket using S3://**

In [None]:
s3uri = 's3://tscookbook/AirQualityUCI.xlsx'
df = pd.read_excel(s3uri,
                   index_col='Date',
                   parse_dates=True)
df.head()

## Reading from Private S3 Bucket

In [None]:
import configparser
config = configparser.ConfigParser()
config.read('aws.cfg')

AWS_ACCESS_KEY = config['AWS']['aws_access_key']
AWS_SECRET_KEY = config['AWS']['aws_secret_key']


In [None]:
s3uri = "s3://tscookbook-private/AirQuality.csv"

df = pd.read_csv(s3uri,
                 index_col='Date',
                 parse_dates=True,
                 storage_options= {
                         'key': AWS_ACCESS_KEY,
                         'secret': AWS_SECRET_KEY
                     })

In [None]:
df.iloc[:3, 1:4]

**Using BOTO3**

In [None]:
import boto3
bucket = "tscookbook-private"
client = boto3.client("s3",
                  aws_access_key_id =AWS_ACCESS_KEY,
                  aws_secret_access_key = AWS_SECRET_KEY)


In [None]:
data = client.get_object(Bucket=bucket, Key='AirQuality.csv')

In [None]:
data.keys()

In [None]:
df = pd.read_csv(data['Body'],
                 index_col='Date',
                 parse_dates=True)


In [None]:
df.iloc[:3, 1:4]

## Reading from HTML

In [None]:
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"
results = pd.read_html(url)
print(len(results))


In [None]:
# for i, k in enumerate(results):
#     print(i)
#     display(k.head())
    

In [None]:
df = results[15]
df.columns

In [None]:
df[['Total cases', 'Total deaths', 'Cases per million']].head()

## Example how `read_html()` works

In [None]:
import pandas as pd
html = """
 <table>
   <tr>
     <th>Ticker</th>
     <th>Price</th>
   </tr>
   <tr>
     <td>MSFT</td>
     <td>230</td>
   </tr>
   <tr>
     <td>APPL</td>
     <td>300</td>
   </tr>
     <tr>
     <td>MSTR</td>
     <td>120</td>
   </tr>
 </table>

 </body>
 </html>
 """

df = pd.read_html(html)
df[0]


## Using `attr` option in `read.html()`

In [None]:
#!conda install html5lib beautifulSoup4

In [None]:
import pandas as pd
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"
df = pd.read_html(url, attrs={'class': 'sortable'})
len(df)

In [None]:
df[3].columns

# Recipe 4: Reading from a SAS dataset --to be deleted recipe--
* Dataset 1: http://support.sas.com/kb/61/960.html
* Learn about chunkisize (chunking) in pandas when working when reading large data files
* Learn about `Modin` using a separate Jupyter Notebook on how to improve performance with minimal effort 


In [None]:
import pandas as pd
path = '../../datasets/Ch2/DCSKINPRODUCT.sas7bdat'


In [None]:
df = pd.read_sas(path, chunksize=10000)
type(df)


In [None]:
results = []
for chunk in df:
    results.append(
        chunk)
len(results)

In [None]:
df = pd.concat(results)
df.shape

In [None]:
df = pd.read_sas(path, chunksize=10000)
results = []
for chunk in df:
    results.append(
        chunk.groupby('DATE')['Revenue']
             .agg(['sum', 'count']))
len(results)

In [None]:
pd.concat(results).shape

In [None]:
results[0].loc['2013-02-10']

In [None]:
results[1].loc['2013-02-10']

In [None]:
results[2].loc['2013-02-10']

In [None]:
from functools import reduce
final = reduce(lambda x1, x2: x1.add(x2, fill_value=0), results)
type(final)

In [None]:
final.loc['2013-02-10']

In [None]:
final.shape

## There is more

In [None]:
# !conda install dask # install everything
#!conda install dask-core # install only core parts of dash
#!python -m pip install "dask[complete]"    # Install everything
#!python -m pip install dask                # Install only core parts of dask

# Recipe 4 - Read data from Parquet files

The following code reads the original data from NCEI NOAA in CSV then stores the data as Parquet files partitioned by year. The recipe focuses on reading the Parquet files and demonstrating how you can filter/query a specific partition.

In [None]:
import pandas as pd
from pathlib import Path
pd.__version__

Optional: Script to read the original dataset in CSV format and convert into a partitioned Parquet files by year

In [None]:
# file = 'https://www.ncei.noaa.gov/orders/cdo/3352259.csv'
# df = pd.read_csv(file)
# df['DT'] = pd.to_datetime(df['DATE'])
# df['year'] = df['DT'].dt.year
# df.to_parquet('../../datasets/Ch2/LA_weather.parquet', engine='pyarrow', partition_cols=['year'], compression='snappy')

Reading the Parquet files

In [None]:
file = Path('../../datasets/Ch2/LA_weather.parquet/')
df = pd.read_parquet(file,
                    engine='pyarrow')

In [None]:
df.info()

Reading a specific partition for year 2012

In [None]:
filters = [('year', '==', 2012)]
df_2012 = pd.read_parquet(file, 
                          engine='pyarrow', 
                          filters=filters)

Filtering to read files for years > 2020

In [None]:
df_2012.info()

In [None]:
filters = [('year', '>', 2020)]

df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters)
df.info()

In [None]:
filters = [('year', '>=', 2021)]
pd.read_parquet(file, 
                engine='pyarrow', 
                filters= filters).info()

In [None]:
filters = [('year', 'in', [2021, 2022, 2023])]
df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters)
df.info()

In [None]:
df.head()

In [None]:
columns = ['DATE', 'year', 'TMAX']
df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters, 
                     columns=columns)

df.head()

In [None]:
df.info()

## There is more

In [None]:
pa.parquet.read_table()

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
from pathlib import Path

file = Path('../../datasets/Ch2/LA_weather.parquet/')
table = pq.read_table(file, filters=filters, columns=columns)

In [None]:
import pyarrow as pa
isinstance(table, pa.Table)

In [None]:
df = table.to_pandas()
df.info()

In [None]:
df.head()

In [None]:
table.column_names

In [None]:
table.schema

In [None]:
table.column_names

In [None]:
type(table)

In [None]:
table.schema.pandas_metadata

In [None]:
pq_dataset = pq.ParquetDataset(file, filters=filters)

In [None]:
pq_dataset

In [None]:
type(pq_dataset)

In [None]:
pq_dataset.files

In [None]:
pq_dataset.schema

In [None]:
pq_dataset.schema.pandas_metadata

In [None]:
pq_dataset.schema.metadata

In [None]:
data = pq_dataset.read()

In [None]:
type(data)

In [None]:
data

In [None]:
data.num_columns

In [None]:
data.num_rows

In [None]:
data.schema

In [None]:
file = Path('../../datasets/Ch2/LA_weather.parquet/')
table = pq.read_table(file)

In [None]:
table.num_rows

In [None]:
table.to_pandas()

In [None]:
pa.Table.from_pandas(df)

### Passing a Schema

In [None]:
columns = ['DATE','year', 'TMAX']
filters = [('year', 'in', [2021, 2022, 2023])]

tb = pq.read_table(file,
                   filters=filters, 
                   columns=columns,
                   use_pandas_metadata=True)

df_pa = tb.to_pandas()

In [None]:
df_pa.info()

In [None]:
df_pd = pd.read_parquet(file, 
                        filters=filters, 
                        columns=columns,
                        use_pandas_metadata=True)

df_pd.info()

In [None]:
schema = pa.schema([
    ('DATE', pa.string()),
    ('year', pa.int64()),
    ('TMAX', pa.int64())
])


# Recipe 5 - Working with Large Data Files

In [1]:
# Script to create one large data file 

# import pandas as pd
# import glob

# def run_once():
#     # Directory path where Parquet files are located
#     directory = '../../datasets/Ch2/yellow_tripdata_2023-*.parquet'
    
#     # Get a list of all Parquet files in the directory
#     parquet_files = glob.glob(directory)
    
#     # Read all Parquet files into a single DataFrame
#     dfs = []
#     for file in parquet_files:
#         df = pd.read_parquet(file)
#         dfs.append(df)
    
#     # Concatenate all DataFrames into a single DataFrame
#     combined_df = pd.concat(dfs)
#     # combined_df.to_parquet('../../datasets/Ch2/yellow_tripdata_2023.parquet', engine='pyarrow')
#     combined_df.to_csv('../../datasets/Ch2/yellow_tripdata_2023.csv', index=False)

# run_once()

### Using other librarys
* Polars
* Dask

In [2]:
import pandas as pd
import time
import psutil
import polars as pl
import dask.dataframe as dd
from pathlib import Path

In [3]:
import memory_profiler 
%load_ext memory_profiler

In [4]:
%whos

Variable          Type        Data/Info
---------------------------------------
Path              type        <class 'pathlib.Path'>
dd                module      <module 'dask.dataframe' <...>k/dataframe/__init__.py'>
glob              module      <module 'glob' from '/opt<...>v/lib/python3.9/glob.py'>
memory_profiler   module      <module 'memory_profiler'<...>ages/memory_profiler.py'>
pd                module      <module 'pandas' from '/o<...>ages/pandas/__init__.py'>
pl                module      <module 'polars' from '/o<...>ages/polars/__init__.py'>
psutil            module      <module 'psutil' from '/o<...>ages/psutil/__init__.py'>
run_once          function    <function run_once at 0x1077acca0>
time              module      <module 'time' (built-in)>


In [5]:
# Read in the CSV file
file_path = Path('../../datasets/Ch2/yellow_tripdata_2023.csv')

In [6]:
# # Function to calculate memory usage
# def get_memory_usage():
#     process = psutil.Process()
#     return process.memory_info().rss

In [8]:
%%timeit 
%memit df_pd = pd.read_csv(file_path, low_memory=False)

peak memory: 12815.84 MiB, increment: 2826.17 MiB
peak memory: 13588.72 MiB, increment: 4376.66 MiB
peak memory: 13511.14 MiB, increment: 3938.94 MiB
peak memory: 13692.38 MiB, increment: 4350.02 MiB
peak memory: 12506.48 MiB, increment: 3171.41 MiB
peak memory: 13803.03 MiB, increment: 4360.72 MiB
peak memory: 13806.06 MiB, increment: 4315.69 MiB
peak memory: 13785.55 MiB, increment: 4293.48 MiB
26.2 s ± 129 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16186386 entries, 0 to 16186385
Data columns (total 20 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   VendorID               int64  
 1   tpep_pickup_datetime   object 
 2   tpep_dropoff_datetime  object 
 3   passenger_count        float64
 4   trip_distance          float64
 5   RatecodeID             float64
 6   store_and_fwd_flag     object 
 7   PULocationID           int64  
 8   DOLocationID           int64  
 9   payment_type           int64  
 10  fare_amount            float64
 11  extra                  float64
 12  mta_tax                float64
 13  tip_amount             float64
 14  tolls_amount           float64
 15  improvement_surcharge  float64
 16  total_amount           float64
 17  congestion_surcharge   float64
 18  Airport_fee            float64
 19  airport_fee            float64
dtypes: float64(13), int64(4), object(3)
memory usage: 2.4+ GB


In [11]:
del df_pd

In [35]:
%%timeit 
%memit df_pl = pl.read_csv(file_path)

peak memory: 14019.83 MiB, increment: 1660.86 MiB
peak memory: 13053.50 MiB, increment: 1952.77 MiB
peak memory: 13035.81 MiB, increment: 1743.11 MiB
peak memory: 13851.59 MiB, increment: 3070.61 MiB
peak memory: 13596.44 MiB, increment: 1793.02 MiB
peak memory: 13694.06 MiB, increment: 2218.30 MiB
peak memory: 12872.70 MiB, increment: 1955.62 MiB
peak memory: 13055.06 MiB, increment: 1816.05 MiB
2.17 s ± 146 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
df_pl.dtypes

[Int64,
 Utf8,
 Utf8,
 Float64,
 Float64,
 Float64,
 Utf8,
 Int64,
 Int64,
 Int64,
 Float64,
 Float64,
 Float64,
 Float64,
 Float64,
 Float64,
 Float64,
 Float64,
 Float64,
 Utf8]

In [25]:
df_pl.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
i64,str,str,f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,"""2023-05-01 00:…","""2023-05-01 00:…",0.0,7.8,1.0,"""N""",138,43,1,33.8,7.75,0.5,8.6,0.0,1.0,51.65,0.0,1.75,
1,"""2023-05-01 00:…","""2023-05-01 01:…",2.0,8.1,1.0,"""N""",138,262,1,35.9,10.25,0.5,9.5,0.0,1.0,57.15,2.5,1.75,
1,"""2023-05-01 00:…","""2023-05-01 01:…",2.0,9.1,1.0,"""N""",138,141,1,35.2,10.25,0.5,10.7,6.55,1.0,64.2,2.5,1.75,
2,"""2023-05-01 00:…","""2023-05-01 00:…",1.0,8.21,1.0,"""N""",138,140,1,33.1,6.0,0.5,2.24,0.0,1.0,47.09,2.5,1.75,
1,"""2023-05-01 00:…","""2023-05-01 00:…",0.0,7.9,1.0,"""N""",138,263,1,31.0,10.25,0.5,9.85,6.55,1.0,59.15,2.5,1.75,


In [42]:
df_pl.height

16186386

In [43]:
del df_pl

In [44]:
%%timeit 
%memit df_dk = dd.read_csv(file_path)

peak memory: 11752.80 MiB, increment: 0.00 MiB
peak memory: 11752.81 MiB, increment: 0.02 MiB
peak memory: 11754.09 MiB, increment: 1.28 MiB
peak memory: 11754.36 MiB, increment: 0.27 MiB
peak memory: 11754.62 MiB, increment: 0.27 MiB
peak memory: 11754.62 MiB, increment: 0.00 MiB
peak memory: 11754.89 MiB, increment: 0.27 MiB
peak memory: 11754.91 MiB, increment: 0.02 MiB
384 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
df_dk.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 20 entries, VendorID to airport_fee
dtypes: object(3), float64(13), int64(4)

In [46]:
df_dk.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
0,1,2023-05-01 00:33:13,2023-05-01 00:53:01,0.0,7.8,1.0,N,138,43,1,33.8,7.75,0.5,8.6,0.0,1.0,51.65,0.0,1.75,
1,1,2023-05-01 00:42:49,2023-05-01 01:11:18,2.0,8.1,1.0,N,138,262,1,35.9,10.25,0.5,9.5,0.0,1.0,57.15,2.5,1.75,
2,1,2023-05-01 00:56:34,2023-05-01 01:13:39,2.0,9.1,1.0,N,138,141,1,35.2,10.25,0.5,10.7,6.55,1.0,64.2,2.5,1.75,
3,2,2023-05-01 00:00:52,2023-05-01 00:20:12,1.0,8.21,1.0,N,138,140,1,33.1,6.0,0.5,2.24,0.0,1.0,47.09,2.5,1.75,
4,1,2023-05-01 00:05:50,2023-05-01 00:19:41,0.0,7.9,1.0,N,138,263,1,31.0,10.25,0.5,9.85,6.55,1.0,59.15,2.5,1.75,


In [48]:
del df_dk

### Chunking 

In [49]:
import pandas as pd
from pathlib import Path

import memory_profiler 
%load_ext memory_profiler

file_path = Path('../../datasets/Ch2/yellow_tripdata_2023.csv')

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [50]:
%%timeit
%memit df_pd = pd.read_csv(file_path, low_memory=False)

peak memory: 12144.06 MiB, increment: 123.19 MiB
peak memory: 12148.73 MiB, increment: 1850.88 MiB
peak memory: 12344.50 MiB, increment: 2589.38 MiB
peak memory: 12394.56 MiB, increment: 2854.52 MiB
peak memory: 12413.80 MiB, increment: 2587.84 MiB
peak memory: 12002.17 MiB, increment: 2187.16 MiB
peak memory: 12880.08 MiB, increment: 3121.31 MiB
peak memory: 11788.30 MiB, increment: 2488.20 MiB
26.8 s ± 196 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [51]:
%%timeit
%memit pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False)])

peak memory: 11198.70 MiB, increment: 1913.84 MiB
peak memory: 9856.34 MiB, increment: 4343.16 MiB
peak memory: 10165.14 MiB, increment: 3854.72 MiB
peak memory: 9247.17 MiB, increment: 2535.80 MiB
peak memory: 10126.86 MiB, increment: 3614.67 MiB
peak memory: 9533.97 MiB, increment: 2393.84 MiB
peak memory: 8612.58 MiB, increment: 1832.39 MiB
peak memory: 8650.14 MiB, increment: 2737.81 MiB
20.7 s ± 972 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [52]:
df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])

  df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])
  df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])
  df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])
  df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])
  df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])
  df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, low_memory=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])
  df_pd = pd.con

In [None]:
# memory_map

In [53]:
df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16186386 entries, 0 to 16186385
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee         

In [54]:
df_pd.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
0,1,2023-05-01 00:33:13,2023-05-01 00:53:01,0.0,7.8,1.0,N,138,43,1,33.8,7.75,0.5,8.6,0.0,1.0,51.65,0.0,1.75,
1,1,2023-05-01 00:42:49,2023-05-01 01:11:18,2.0,8.1,1.0,N,138,262,1,35.9,10.25,0.5,9.5,0.0,1.0,57.15,2.5,1.75,
2,1,2023-05-01 00:56:34,2023-05-01 01:13:39,2.0,9.1,1.0,N,138,141,1,35.2,10.25,0.5,10.7,6.55,1.0,64.2,2.5,1.75,
3,2,2023-05-01 00:00:52,2023-05-01 00:20:12,1.0,8.21,1.0,N,138,140,1,33.1,6.0,0.5,2.24,0.0,1.0,47.09,2.5,1.75,
4,1,2023-05-01 00:05:50,2023-05-01 00:19:41,0.0,7.9,1.0,N,138,263,1,31.0,10.25,0.5,9.85,6.55,1.0,59.15,2.5,1.75,


In [55]:
df_pd['tpep_dropoff_datetime'] - df_pd['tpep_pickup_datetime']

0          0 days 00:19:48
1          0 days 00:28:29
2          0 days 00:17:05
3          0 days 00:19:20
4          0 days 00:13:51
                 ...      
16186381   0 days 00:16:29
16186382   0 days 00:39:22
16186383   0 days 00:23:08
16186384   0 days 00:08:05
16186385   0 days 00:05:35
Length: 16186386, dtype: timedelta64[ns]

In [2]:
import pandas as pd
from pathlib import Path

import memory_profiler 
%load_ext memory_profiler

file_path = Path('../../datasets/Ch2/yellow_tripdata_2023.csv')

In [31]:
%%time
%memit df_pd = pd.read_csv(file_path, memory_map=True, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])



peak memory: 12795.27 MiB, increment: 3174.50 MiB
CPU times: user 15.1 s, sys: 3.07 s, total: 18.2 s
Wall time: 21.5 s


In [7]:
%%time
%memit df_pd = pd.read_csv(file_path, memory_map=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])



peak memory: 10951.66 MiB, increment: 1467.52 MiB
CPU times: user 14.8 s, sys: 2.01 s, total: 16.8 s
Wall time: 17.4 s


In [8]:
%%time
%memit df_pd = pd.concat([chunk for chunk  in pd.read_csv(file_path, chunksize=10000, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])])



peak memory: 10590.58 MiB, increment: 890.47 MiB
CPU times: user 16.7 s, sys: 2.3 s, total: 19 s
Wall time: 19.5 s


In [9]:
df_pd

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,...,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
0,0,1,2023-04-01 00:14:49,2023-04-01 00:45:01,2.0,4.90,1.0,N,48,223,...,28.90,3.5,0.5,6.00,0.00,1.0,39.90,2.5,0.00,
1,1,2,2023-04-01 00:00:24,2023-04-01 00:56:19,1.0,21.89,2.0,N,132,43,...,70.00,0.0,0.5,0.00,6.55,1.0,81.80,2.5,1.25,
2,2,1,2023-04-01 00:03:50,2023-04-01 00:14:42,2.0,1.30,1.0,N,148,113,...,11.40,3.5,0.5,2.00,0.00,1.0,18.40,2.5,0.00,
3,3,1,2023-04-01 00:53:18,2023-04-01 01:01:28,1.0,1.50,1.0,N,249,79,...,10.00,3.5,0.5,1.00,0.00,1.0,16.00,2.5,0.00,
4,4,2,2023-04-01 00:07:00,2023-04-01 00:17:16,2.0,1.49,1.0,N,158,246,...,11.40,1.0,0.5,1.00,0.00,1.0,17.40,2.5,0.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12672732,3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,...,12.13,0.0,0.5,4.23,0.00,1.0,20.36,,,
12672733,3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,...,40.92,0.0,0.5,8.98,0.00,1.0,53.90,,,
12672734,3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,...,24.02,0.0,0.5,0.00,0.00,1.0,28.02,,,
12672735,3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,...,8.51,0.0,0.5,3.50,0.00,1.0,16.01,,,


In [39]:
df_pd['time_delta'] = df_pd['tpep_dropoff_datetime'] - df_pd['tpep_pickup_datetime']
df_pd['year_month'] = df_pd['tpep_dropoff_datetime'].dt.year.astype('str') + df_pd['tpep_dropoff_datetime'].dt.month.astype('str')
df_pd['day'] = df_pd['tpep_dropoff_datetime'].dt.date
df_pd['year'] = df_pd['tpep_dropoff_datetime'].dt.year

In [67]:
df_pd.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'total_amount',
       'congestion_surcharge', 'Airport_fee', 'airport_fee', 'time_delta',
       'year_month', 'day', 'year'],
      dtype='object')

In [66]:
df_pd.head()

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,...,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,time_delta,year_month,day,year
0,0,1,2023-04-01 00:14:49,2023-04-01 00:45:01,2.0,4.9,1.0,N,48,223,...,0.0,1.0,39.9,2.5,0.0,,0 days 00:30:12,20234,2023-04-01,2023
1,1,2,2023-04-01 00:00:24,2023-04-01 00:56:19,1.0,21.89,2.0,N,132,43,...,6.55,1.0,81.8,2.5,1.25,,0 days 00:55:55,20234,2023-04-01,2023
2,2,1,2023-04-01 00:03:50,2023-04-01 00:14:42,2.0,1.3,1.0,N,148,113,...,0.0,1.0,18.4,2.5,0.0,,0 days 00:10:52,20234,2023-04-01,2023
3,3,1,2023-04-01 00:53:18,2023-04-01 01:01:28,1.0,1.5,1.0,N,249,79,...,0.0,1.0,16.0,2.5,0.0,,0 days 00:08:10,20234,2023-04-01,2023
4,4,2,2023-04-01 00:07:00,2023-04-01 00:17:16,2.0,1.49,1.0,N,158,246,...,0.0,1.0,17.4,2.5,0.0,,0 days 00:10:16,20234,2023-04-01,2023


In [70]:
columns = ['PULocationID', 'time_delta', 'year_month', 'trip_distance', 'fare_amount', 'day', 'year', 'passenger_count']
# df_pd = df_pd[columns]

In [71]:
df_pd = df_pd[df_pd['year'] == 2023]

In [45]:
df_pd['year_month'].value_counts()

year_month
20233    3402522
20234    3289252
20231    3066125
20232    2913959
20235        830
Name: count, dtype: int64

In [72]:
df = df_pd[columns]

In [95]:
momthly_performance = (
df.groupby(
    ['PULocationID','day','year_month']
    )[['trip_distance', 
       'fare_amount', 
       'passenger_count']]
             .sum()
             .groupby(
            ['PULocationID','year_month']
                )
                 .mean()
)

In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12672688 entries, 0 to 12672736
Data columns (total 8 columns):
 #   Column           Dtype          
---  ------           -----          
 0   PULocationID     int64          
 1   time_delta       timedelta64[ns]
 2   year_month       object         
 3   trip_distance    float64        
 4   fare_amount      float64        
 5   day              object         
 6   year             int32          
 7   passenger_count  float64        
dtypes: float64(3), int32(1), int64(1), object(2), timedelta64[ns](1)
memory usage: 821.8+ MB


In [109]:
momthly_performance.reset_index().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093 entries, 0 to 1092
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PULocationID     1093 non-null   int64  
 1   year_month       1093 non-null   object 
 2   trip_distance    1093 non-null   float64
 3   fare_amount      1093 non-null   float64
 4   passenger_count  1093 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 42.8+ KB


In [129]:
final = momthly_performance.reset_index().set_index(pd.to_datetime(momthly_performance.reset_index()['year_month'], format='%Y%m')).drop(columns='year_month')

In [155]:
momthly_performance

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,passenger_count
PULocationID,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,20231,21.091613,1156.118065,20.838710
1,20232,12.742500,1159.394286,22.535714
1,20233,19.640323,1233.041290,22.129032
1,20234,9.450667,1382.733333,26.600000
2,20231,13.120000,0.000000,4.000000
...,...,...,...,...
264,20235,82.890000,265.800000,11.000000
265,20231,126.912258,5008.275161,75.419355
265,20232,585.654643,6214.102500,69.464286
265,20233,729.702258,7672.011290,73.387097


In [161]:
%%time

df_pd = pd.read_csv(file_path, memory_map=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

df_pd['time_delta'] = df_pd['tpep_dropoff_datetime'] - df_pd['tpep_pickup_datetime']
df_pd['year_month'] = df_pd['tpep_dropoff_datetime'].dt.year.astype('str') + df_pd['tpep_dropoff_datetime'].dt.month.astype('str')
df_pd['day'] = df_pd['tpep_dropoff_datetime'].dt.date
df_pd['year'] = df_pd['tpep_dropoff_datetime'].dt.year
columns = ['PULocationID', 'time_delta', 'year_month', 'trip_distance', 'fare_amount', 'day', 'year', 'passenger_count']

df = df_pd[columns]

momthly_performance = (
df.groupby(
    ['PULocationID','day','year_month']
    )[['trip_distance', 
       'fare_amount', 
       'passenger_count']]
             .sum()
             .groupby(
            ['PULocationID','year_month']
                )
                 .mean()
)

momthly_performance.head()



CPU times: user 21.7 s, sys: 4.03 s, total: 25.8 s
Wall time: 27 s


Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,passenger_count
PULocationID,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,202210,0.01,117.5,1.0
1,20231,21.091613,1156.118065,20.83871
1,20232,12.7425,1159.394286,22.535714
1,20233,19.640323,1233.04129,22.129032
1,20234,9.450667,1382.733333,26.6


In [162]:
momthly_performance.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1132 entries, (1, '202210') to (265, '20234')
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   trip_distance    1132 non-null   float64
 1   fare_amount      1132 non-null   float64
 2   passenger_count  1132 non-null   float64
dtypes: float64(3)
memory usage: 32.1+ KB


In [163]:
%%time

df_pd = pd.read_csv(file_path, memory_map=True, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

df_pd['time_delta'] = df_pd['tpep_dropoff_datetime'] - df_pd['tpep_pickup_datetime']
df_pd['year_month'] = df_pd['tpep_dropoff_datetime'].dt.year.astype('str') + df_pd['tpep_dropoff_datetime'].dt.month.astype('str')
df_pd['day'] = df_pd['tpep_dropoff_datetime'].dt.date
df_pd['year'] = df_pd['tpep_dropoff_datetime'].dt.year
columns = ['PULocationID', 'time_delta', 'year_month', 'trip_distance', 'fare_amount', 'day', 'year', 'passenger_count']

df = df_pd[columns]



momthly_performance = (
df.groupby(
    ['PULocationID','day','year_month']
    )[['trip_distance', 
       'fare_amount', 
       'passenger_count']]
             .sum()
             .groupby(
            ['PULocationID','year_month']
                )
                 .mean()
)

momthly_performance.head()



CPU times: user 21.9 s, sys: 4.45 s, total: 26.3 s
Wall time: 27.8 s


Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,passenger_count
PULocationID,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,202210,0.01,117.5,1.0
1,20231,21.091613,1156.118065,20.83871
1,20232,12.7425,1159.394286,22.535714
1,20233,19.640323,1233.04129,22.129032
1,20234,9.450667,1382.733333,26.6


In [164]:
momthly_performance.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1132 entries, (1, '202210') to (265, '20234')
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   trip_distance    1132 non-null   float64
 1   fare_amount      1132 non-null   float64
 2   passenger_count  1132 non-null   float64
dtypes: float64(3)
memory usage: 32.1+ KB


In [171]:
%%time

df_pd = pd.read_csv(file_path, memory_map=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], chunksize=10_000)

results = []
for chunk in df_pd:
    chunk['time_delta'] = chunk['tpep_dropoff_datetime'] - chunk['tpep_pickup_datetime']
    chunk['year_month'] = chunk['tpep_dropoff_datetime'].dt.year.astype('str') + chunk['tpep_dropoff_datetime'].dt.month.astype('str')
    chunk['day'] = chunk['tpep_dropoff_datetime'].dt.date
    chunk['year'] = chunk['tpep_dropoff_datetime'].dt.year
    columns = ['PULocationID', 'time_delta', 'year_month', 'trip_distance', 'fare_amount', 'day', 'year', 'passenger_count']
    
    df = chunk[columns]



    momthly_performance = (
    df.groupby(
        ['PULocationID','day','year_month']
        )[['trip_distance', 
           'fare_amount', 
           'passenger_count']]
                 .sum()
                 .groupby(
                ['PULocationID','year_month']
                    )
                     .mean()
    )

    results.append(momthly_performance)

final = pd.concat(results)
final.head()



AttributeError: 'list' object has no attribute 'head'

In [175]:
len(results)

1268

In [186]:
results[0].loc[(  4, '20234')]

trip_distance       234.87
fare_amount        1378.80
passenger_count     139.00
Name: (4, 20234), dtype: float64

In [187]:
results[1].loc[(  4, '20234')]

trip_distance       189.87
fare_amount        1033.80
passenger_count      92.00
Name: (4, 20234), dtype: float64

In [189]:
results[20].loc[(  4, '20234')]

trip_distance      12.63
fare_amount        74.70
passenger_count     9.00
Name: (4, 20234), dtype: float64

In [173]:
final.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 159224 entries, (4, '20234') to (265, '20233')
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   trip_distance    159224 non-null  float64
 1   fare_amount      159224 non-null  float64
 2   passenger_count  159224 non-null  float64
dtypes: float64(3)
memory usage: 4.1+ MB


In [174]:
final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,passenger_count
PULocationID,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,20234,234.87,1378.8,139.0
7,20234,5.95,32.6,2.0
10,20234,40.59,206.55,1.0
13,20234,23.8,210.3,11.0
17,20234,0.0,3.0,1.0


In [195]:
x.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,passenger_count
PULocationID,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,202210,0.02,235.0,2.0
1,20231,653.33,35530.94,642.0
1,20232,355.875,32004.04,619.0
1,20233,599.865,38036.78,684.0
1,20234,269.82,41189.725,795.0


In [196]:
momthly_performance.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,passenger_count
PULocationID,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,20233,92.31,513.71,0.0
7,20233,40.95,164.81,0.0
7,20234,14.17,51.66,0.0
13,20233,86.24,422.47,0.0
17,20233,34.78,139.75,0.0


In [197]:
import pandas as pd
from functools import reduce

df_pd = pd.read_csv(file_path, memory_map=False, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], chunksize=10_000)

# Define the function to process each chunk
def process_chunk(chunk):
    chunk['time_delta'] = chunk['tpep_dropoff_datetime'] - chunk['tpep_pickup_datetime']
    chunk['year_month'] = chunk['tpep_dropoff_datetime'].dt.year.astype('str') + chunk['tpep_dropoff_datetime'].dt.month.astype('str')
    chunk['day'] = chunk['tpep_dropoff_datetime'].dt.date
    chunk['year'] = chunk['tpep_dropoff_datetime'].dt.year
    columns = ['PULocationID', 'time_delta', 'year_month', 'trip_distance', 'fare_amount', 'day', 'year', 'passenger_count']
    df = chunk[columns]
    
    momthly_performance = (
        df.groupby(['PULocationID', 'day', 'year_month'])
        [['trip_distance', 'fare_amount', 'passenger_count']]
        .sum()
        .groupby(['PULocationID', 'year_month'])
        .mean()
    )

    return momthly_performance

# Use reduce to combine the results from all chunks
results = reduce(lambda x, y: pd.concat([x, y]), (process_chunk(chunk) for chunk in df_pd))

final = results.reset_index()  # Reset the index to get a DataFrame
final.head()


  results = reduce(lambda x, y: pd.concat([x, y]), (process_chunk(chunk) for chunk in df_pd))
  results = reduce(lambda x, y: pd.concat([x, y]), (process_chunk(chunk) for chunk in df_pd))
  results = reduce(lambda x, y: pd.concat([x, y]), (process_chunk(chunk) for chunk in df_pd))
  results = reduce(lambda x, y: pd.concat([x, y]), (process_chunk(chunk) for chunk in df_pd))
  results = reduce(lambda x, y: pd.concat([x, y]), (process_chunk(chunk) for chunk in df_pd))
  results = reduce(lambda x, y: pd.concat([x, y]), (process_chunk(chunk) for chunk in df_pd))


Unnamed: 0,PULocationID,year_month,trip_distance,fare_amount,passenger_count
0,4,20234,234.87,1378.8,139.0
1,7,20234,5.95,32.6,2.0
2,10,20234,40.59,206.55,1.0
3,13,20234,23.8,210.3,11.0
4,17,20234,0.0,3.0,1.0


In [207]:
results.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 159224 entries, (4, '20234') to (265, '20233')
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   trip_distance    159224 non-null  float64
 1   fare_amount      159224 non-null  float64
 2   passenger_count  159224 non-null  float64
dtypes: float64(3)
memory usage: 4.1+ MB


In [206]:
results.reset_index(['PULocationID','year_month']).groupby(['PULocationID','year_month']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,trip_distance,fare_amount,passenger_count
PULocationID,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,202210,0.020000,235.000000,2.000000
1,20231,653.330000,35530.940000,642.000000
1,20232,355.875000,32004.040000,619.000000
1,20233,599.865000,38036.780000,684.000000
1,20234,269.820000,41189.725000,795.000000
...,...,...,...,...
265,202210,2.330000,10.000000,1.000000
265,20231,3629.886667,143351.081667,2147.166667
265,20232,6731.669333,123486.299667,1776.000000
265,20233,8717.638667,160813.071500,2105.000000


In [208]:
momthly_performance.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 144 entries, (4, '20233') to (265, '20233')
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   trip_distance    144 non-null    float64
 1   fare_amount      144 non-null    float64
 2   passenger_count  144 non-null    float64
dtypes: float64(3)
memory usage: 12.9+ KB


In [213]:
import pandas as pd
from functools import reduce

#file_path = 'your_file_path.csv'
chunksize = 10_000

# Initialize an empty list to store the results from each chunk
results = []

# Define the function to process each chunk and aggregate the data
def process_chunk(chunk):
    chunk['time_delta'] = chunk['tpep_dropoff_datetime'] - chunk['tpep_pickup_datetime']
    chunk['year_month'] = chunk['tpep_dropoff_datetime'].dt.year.astype('str') + chunk['tpep_dropoff_datetime'].dt.month.astype('str')
    chunk['day'] = chunk['tpep_dropoff_datetime'].dt.date
    chunk['year'] = chunk['tpep_dropoff_datetime'].dt.year
    columns = ['PULocationID', 'time_delta', 'year_month', 'trip_distance', 'fare_amount', 'day', 'year', 'passenger_count']
    df = chunk[columns]
    
    # monthly_performance = (
    #     df.groupby(['PULocationID', 'day', 'year_month'])
    #     [['trip_distance', 'fare_amount', 'passenger_count']]
    #     .sum()
    #     .groupby(['PULocationID', 'year_month'])
    #     .mean()
    )

    return monthly_performance

# Iterate through the chunks, process each chunk, and append the result to the 'results' list
# for chunk in pd.read_csv(file_path, memory_map=True, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], chunksize=chunksize):
#     results.append(process_chunk(chunk))

# Concatenate all the results into the final DataFrame
final = reduce(lambda x, y: pd.add([x, y]), (process_chunk(chunk) for chunk in df_pd))  


# print(final.head())


SyntaxError: unmatched ')' (2217541468.py, line 25)

In [210]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159224 entries, 0 to 159223
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   PULocationID     159224 non-null  int64  
 1   year_month       159224 non-null  object 
 2   trip_distance    159224 non-null  float64
 3   fare_amount      159224 non-null  float64
 4   passenger_count  159224 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 6.1+ MB
