# Recipe 1: Reading from CSVs and Other Delimited Files

In [1]:
import pandas as pd
from pathlib import Path
pd.__version__

'2.1.3'

In [5]:
filepath = Path('../../datasets/Ch2/movieboxoffice.csv')

In [26]:
# read movieboxoffice.csv data and parse the Date column (first column) as date
ts = pd.read_csv(filepath,
                 header=0,
                 parse_dates=[0],
                 index_col=0,
                 usecols=['Date',
                          'DOW',
                          'Daily',
                          'Forecast',
                          'Percent Diff'])
ts.head()

Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,Friday,"$125,789.89","$235,036.46",-46.48%
2021-04-27,Saturday,"$99,374.01","$197,622.55",-49.72%
2021-04-28,Sunday,"$82,203.16","$116,991.26",-29.74%
2021-04-29,Monday,"$33,530.26","$66,652.65",-49.69%
2021-04-30,Tuesday,"$30,105.24","$34,828.19",-13.56%


In [27]:
ts.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128 entries, 2021-04-26 to 2021-08-31
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DOW           128 non-null    object
 1   Daily         128 non-null    object
 2   Forecast      128 non-null    object
 3   Percent Diff  128 non-null    object
dtypes: object(4)
memory usage: 5.0+ KB


**Data Cleanup**

In [28]:
ts.dtypes

DOW             object
Daily           object
Forecast        object
Percent Diff    object
dtype: object

In [29]:
clean = lambda x: x.str.replace('[^\\d]', '', regex=True)                                                      
c_df = ts[['Daily', 'Forecast']].apply(clean, axis=1)
ts[['Daily', 'Forecast']] = c_df.astype(float)                         

In [30]:
ts.head()

Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,Friday,12578989.0,23503646.0,-46.48%
2021-04-27,Saturday,9937401.0,19762255.0,-49.72%
2021-04-28,Sunday,8220316.0,11699126.0,-29.74%
2021-04-29,Monday,3353026.0,6665265.0,-49.69%
2021-04-30,Tuesday,3010524.0,3482819.0,-13.56%


In [31]:
ts.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128 entries, 2021-04-26 to 2021-08-31
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   DOW           128 non-null    object 
 1   Daily         128 non-null    float64
 2   Forecast      128 non-null    float64
 3   Percent Diff  128 non-null    object 
dtypes: float64(2), object(2)
memory usage: 5.0+ KB


In [32]:
ts.memory_usage()

Index           1024
DOW             1024
Daily           1024
Forecast        1024
Percent Diff    1024
dtype: int64

In [33]:
ts.memory_usage().sum()

5120

## Using `date_parser`

In [34]:
ts.dtypes

DOW              object
Daily           float64
Forecast        float64
Percent Diff     object
dtype: object

using the `date_format` parameter 

In [62]:
ts = pd.read_csv(filepath,
                 parse_dates=[0],
                 index_col=0,
                 date_format="%d-%b-%Y",
                 usecols=[0,1,3, 7, 6])
ts.head()

Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,Friday,"$125,789.89","$235,036.46",-46.48%
2021-04-27,Saturday,"$99,374.01","$197,622.55",-49.72%
2021-04-28,Sunday,"$82,203.16","$116,991.26",-29.74%
2021-04-29,Monday,"$33,530.26","$66,652.65",-49.69%
2021-04-30,Tuesday,"$30,105.24","$34,828.19",-13.56%


In [63]:
ts.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128 entries, 2021-04-26 to 2021-08-31
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DOW           128 non-null    object
 1   Daily         128 non-null    object
 2   Forecast      128 non-null    object
 3   Percent Diff  128 non-null    object
dtypes: object(4)
memory usage: 5.0+ KB


Using `pd.to_datetime()` function

In [64]:
ts = pd.read_csv(filepath,
                 index_col=0,
                 usecols=[0,1,3, 7, 6])

ts.index = pd.to_datetime(ts.index, format="%d-%b-%Y")
ts.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128 entries, 2021-04-26 to 2021-08-31
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DOW           128 non-null    object
 1   Daily         128 non-null    object
 2   Forecast      128 non-null    object
 3   Percent Diff  128 non-null    object
dtypes: object(4)
memory usage: 5.0+ KB


In [56]:
# NOTE: date_parser is deprecated since version 2.0.0 

# date_parser = lambda x: pd.to_datetime(x, format="%d-%b-%Y")
# ts = pd.read_csv(filepath,
#                  parse_dates=[0],
#                  index_col=0,
#                  date_parser=date_parser,
#                  usecols=[0,1,3, 7, 6])
# ts.head()

In [65]:
ts = pd.read_csv(filepath,
                 header=0,
                 parse_dates=['Date'],
                 index_col=0,
                 infer_datetime_format= True, # infer_datetime_format is deprecated since version 2.0.0 (already the default)
                 usecols=['Date',
                          'DOW',
                          'Daily',
                          'Forecast',
                          'Percent Diff'])
ts.head()

  ts = pd.read_csv(filepath,


Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,Friday,"$125,789.89","$235,036.46",-46.48%
2021-04-27,Saturday,"$99,374.01","$197,622.55",-49.72%
2021-04-28,Sunday,"$82,203.16","$116,991.26",-29.74%
2021-04-29,Monday,"$33,530.26","$66,652.65",-49.69%
2021-04-30,Tuesday,"$30,105.24","$34,828.19",-13.56%


# Recipe 2: Reading data from an Excel file
* Learn how to read data from Excel files using pandas 
* Learn how to read from multiple sheets into DataFrames

* To install openpyxl using conda, run the following command in the terminal:

```
conda install openpyxl
```
* To install using pip, run the following command:

```
pip install openpyxl
```

In [21]:
#!conda install openpyxl -y
#!pip install openpyxl

In [66]:
import pandas as pd
from pathlib import Path
filepath = Path('../../datasets/Ch2/sales_trx_data.xlsx')

In [67]:
import openpyxl
openpyxl.__version__

'3.1.2'

In [68]:
excelfile = pd.ExcelFile(filepath)
excelfile.sheet_names

['2017', '2018']

In [69]:
excelfile.parse('2017')

Unnamed: 0,Line_Item_ID,Date,Credit_Card_Number,Quantity,Menu_Item
0,1,2017-01-01,7437926611570790,1,spicy miso ramen
1,2,2017-01-01,7437926611570790,1,spicy miso ramen
2,3,2017-01-01,8421920068932810,3,tori paitan ramen
3,4,2017-01-01,8421920068932810,3,tori paitan ramen
4,5,2017-01-01,4787310681569640,1,truffle butter ramen
...,...,...,...,...,...
36759,36760,2017-12-31,3149176814183170,4,vegetarian curry + king trumpet mushroom ramen
36760,36761,2017-12-31,2005206728506200,1,tonkotsu ramen
36761,36762,2017-12-31,2005206728506200,1,tonkotsu ramen
36762,36763,2017-12-31,1130414909788170,2,soft-shell miso crab ramen


In [70]:
ts = pd.read_excel(filepath,
                    engine='openpyxl',
                    index_col=1,
                    sheet_name=[0,1],
                    parse_dates=True)
ts.keys()

dict_keys([0, 1])

In [71]:
ts = pd.read_excel(filepath,
                    engine='openpyxl',
                    index_col=1,
                    sheet_name=['2017','2018'],
                    parse_dates=True)
ts.keys()

dict_keys(['2017', '2018'])

In [72]:
ts = pd.read_excel(filepath,
                    engine='openpyxl',
                    index_col=1,
                    sheet_name=None,
                    parse_dates=True)
ts.keys()

dict_keys(['2017', '2018'])

In [73]:
ts['2017'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 36764 entries, 2017-01-01 to 2017-12-31
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Line_Item_ID        36764 non-null  int64 
 1   Credit_Card_Number  36764 non-null  int64 
 2   Quantity            36764 non-null  int64 
 3   Menu_Item           36764 non-null  object
dtypes: int64(3), object(1)
memory usage: 1.4+ MB


In [74]:
ts['2018'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 37360 entries, 2018-01-01 to 2018-12-31
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Line_Item_ID        37360 non-null  int64 
 1   Credit_Card_Number  37360 non-null  int64 
 2   Quantity            37360 non-null  int64 
 3   Menu_Item           37360 non-null  object
dtypes: int64(3), object(1)
memory usage: 1.4+ MB


In [75]:
ts_combined = pd.concat([ts['2017'],ts['2018']])

In [76]:
ts_combined.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 74124 entries, 2017-01-01 to 2018-12-31
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Line_Item_ID        74124 non-null  int64 
 1   Credit_Card_Number  74124 non-null  int64 
 2   Quantity            74124 non-null  int64 
 3   Menu_Item           74124 non-null  object
dtypes: int64(3), object(1)
memory usage: 2.8+ MB


In [77]:
pd.concat(ts).index

MultiIndex([('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ('2017', '2017-01-01'),
            ...
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31'),
            ('2018', '2018-12-31')],
           names=[None, 'Date'], length=74124)

In [78]:
ts_combined = pd.concat(ts).droplevel(level=0)
ts_combined.head()

Unnamed: 0_level_0,Line_Item_ID,Credit_Card_Number,Quantity,Menu_Item
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,1,7437926611570790,1,spicy miso ramen
2017-01-01,2,7437926611570790,1,spicy miso ramen
2017-01-01,3,8421920068932810,3,tori paitan ramen
2017-01-01,4,8421920068932810,3,tori paitan ramen
2017-01-01,5,4787310681569640,1,truffle butter ramen


In [79]:
ts = pd.read_excel(filepath,
                   index_col=1,
                   sheet_name='2018',
                   parse_dates=True)
type(ts)

pandas.core.frame.DataFrame

## There is more
* Using `ExcelFile`

In [80]:
excelfile = pd.ExcelFile(filepath)
excelfile.parse(sheet_name='2017',
                index_col=1,
                parse_dates=True).head()


Unnamed: 0_level_0,Line_Item_ID,Credit_Card_Number,Quantity,Menu_Item
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,1,7437926611570790,1,spicy miso ramen
2017-01-01,2,7437926611570790,1,spicy miso ramen
2017-01-01,3,8421920068932810,3,tori paitan ramen
2017-01-01,4,8421920068932810,3,tori paitan ramen
2017-01-01,5,4787310681569640,1,truffle butter ramen


# Recipe 3: Reading data from a URL
* Learn how to read data as a DataFrame from GitHub
* Learn how to read data as a DataFrame from AWS S3 using `pandas`
* Learn how to read an HTML page and extract tables using `pandas`

In [37]:
#!conda install boto3 s3fs html5lib lxml -y
#!pip install boto3 s3fs html5lib lxml

* To install using pip, you can use the following command:

```
pip install boto3 s3fs lxml
```

* To install using Conda, you can use:

```
conda install boto3 s3fs lxml -y
```

In [81]:
import pandas as pd

In [82]:
import pandas as pd
import boto3, s3fs, lxml
print(f'''
pandas -> {pd.__version__}
boto3 -> {boto3.__version__}
s3fs -> {s3fs.__version__}
lxml -> {lxml.__version__}
''')


pandas -> 2.1.3
boto3 -> 1.29.2
s3fs -> 0.4.2
lxml -> 4.9.3



## Reading from GitHub

In [40]:
# example of produced error
url = 'https://github.com/PacktPublishing/Time-Series-Analysis-with-Python-Cookbook./blob/main/datasets/Ch2/AirQualityUCI.csv'
# pd.read_csv(url)


In [88]:
url = 'https://raw.githubusercontent.com/PacktPublishing/Time-Series-Analysis-with-Python-Cookbook/main/datasets/Ch2/AirQualityUCI.csv'

df = pd.read_csv(url,
                 delimiter=';',
                 parse_dates=['Date'],
                 index_col='Date')

df.iloc[:3,1:4]

Unnamed: 0_level_0,CO(GT),PT08.S1(CO),NMHC(GT)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10/03/2004,2.6,1360.0,150
10/03/2004,2.0,1292.25,112
10/03/2004,2.2,1402.0,88


## Reading from Public S3 Bucket

**Path-style access**

In [42]:
url = 'https://s3.us-east-1.amazonaws.com/tscookbook/AirQualityUCI.xlsx'

df = pd.read_excel(url,
                   index_col='Date',
                   parse_dates=True)
df.head()

Unnamed: 0_level_0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


**Virtual-hosted–style access**

In [90]:
url = 'https://tscookbook.s3.amazonaws.com/AirQualityUCI.xlsx'

df = pd.read_excel(url,
                   index_col='Date',
                   parse_dates=True)
df.head()

Unnamed: 0_level_0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


**Accessing a bucket using S3://**

In [94]:
s3uri = 's3://tscookbook/AirQualityUCI.xlsx'
df = pd.read_excel(s3uri,
                   index_col='Date',
                   parse_dates=True)
df.head()

Unnamed: 0_level_0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


## Reading from Private S3 Bucket

In [101]:
import configparser
config = configparser.ConfigParser()
config.read('aws.cfg')

AWS_ACCESS_KEY = config['AWS']['aws_access_key']
AWS_SECRET_KEY = config['AWS']['aws_secret_key']


In [102]:
s3uri = "s3://tscookbook-private/AirQuality.csv"

df = pd.read_csv(s3uri,
                 index_col='Date',
                 parse_dates=True,
                 storage_options= {
                         'key': AWS_ACCESS_KEY,
                         'secret': AWS_SECRET_KEY
                     })

In [103]:
df.iloc[:3, 1:4]

Unnamed: 0_level_0,CO(GT),PT08.S1(CO),NMHC(GT)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-10-03,26,1360.0,150.0
2004-10-03,2,1292.0,112.0
2004-10-03,22,1402.0,88.0


**Using BOTO3**

In [104]:
import boto3
bucket = "tscookbook-private"
client = boto3.client("s3",
                  aws_access_key_id =AWS_ACCESS_KEY,
                  aws_secret_access_key = AWS_SECRET_KEY)


In [105]:
data = client.get_object(Bucket=bucket, Key='AirQuality.csv')

In [106]:
data.keys()

dict_keys(['ResponseMetadata', 'AcceptRanges', 'LastModified', 'ContentLength', 'ETag', 'ContentType', 'Metadata', 'Body'])

In [107]:
df = pd.read_csv(data['Body'],
                 index_col='Date',
                 parse_dates=True)


In [108]:
df.iloc[:3, 1:4]

Unnamed: 0_level_0,CO(GT),PT08.S1(CO),NMHC(GT)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-10-03,26,1360.0,150.0
2004-10-03,2,1292.0,112.0
2004-10-03,22,1402.0,88.0


## Reading from HTML

In [125]:
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"
results = pd.read_html(url)
print(len(results))


69


In [126]:
# for i, k in enumerate(results):
#     print(i)
#     display(k.head())
    

In [127]:
df = results[15]
df.columns

Index(['Region[30]', 'Total cases', 'Total deaths', 'Cases per million',
       'Deaths per million', 'Current weekly cases', 'Current weekly deaths',
       'Population millions', 'Vaccinated %[31]'],
      dtype='object')

In [133]:
df[['Region[30]','Total cases', 'Total deaths', 'Cases per million']].head(3)

Unnamed: 0,Region[30],Total cases,Total deaths,Cases per million
0,European Union,179537758,1185108,401363
1,North America,103783777,1133607,281404
2,Other Europe,57721948,498259,247054


## Example how `read_html()` works

In [116]:
from io import StringIO
import pandas as pd

In [120]:
html = """
 <table>
   <tr>
     <th>Ticker</th>
     <th>Price</th>
   </tr>
   <tr>
     <td>MSFT</td>
     <td>230</td>
   </tr>
   <tr>
     <td>APPL</td>
     <td>300</td>
   </tr>
     <tr>
     <td>MSTR</td>
     <td>120</td>
   </tr>
 </table>

 </body>
 </html>
 """

df = pd.read_html(StringIO(html))
df[0]


Unnamed: 0,Ticker,Price
0,MSFT,230
1,APPL,300
2,MSTR,120


## Using `attr` option in `read.html()`

In [None]:
#!conda install html5lib beautifulSoup4

In [122]:
import pandas as pd
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"
df = pd.read_html(url, attrs={'class': 'sortable'})
len(df)

5

In [123]:
df[3].columns

Index(['Region[30]', 'Total cases', 'Total deaths', 'Cases per million',
       'Deaths per million', 'Current weekly cases', 'Current weekly deaths',
       'Population millions', 'Vaccinated %[31]'],
      dtype='object')

# Recipe 4: Reading from a SAS dataset --to be deleted recipe--
* Dataset 1: http://support.sas.com/kb/61/960.html
* Learn about chunkisize (chunking) in pandas when working when reading large data files
* Learn about `Modin` using a separate Jupyter Notebook on how to improve performance with minimal effort 


In [None]:
import pandas as pd
path = '../../datasets/Ch2/DCSKINPRODUCT.sas7bdat'


In [None]:
df = pd.read_sas(path, chunksize=10000)
type(df)


In [None]:
results = []
for chunk in df:
    results.append(
        chunk)
len(results)

In [None]:
df = pd.concat(results)
df.shape

In [None]:
df = pd.read_sas(path, chunksize=10000)
results = []
for chunk in df:
    results.append(
        chunk.groupby('DATE')['Revenue']
             .agg(['sum', 'count']))
len(results)

In [None]:
pd.concat(results).shape

In [None]:
results[0].loc['2013-02-10']

In [None]:
results[1].loc['2013-02-10']

In [None]:
results[2].loc['2013-02-10']

In [None]:
from functools import reduce
final = reduce(lambda x1, x2: x1.add(x2, fill_value=0), results)
type(final)

In [None]:
final.loc['2013-02-10']

In [None]:
final.shape

## There is more

In [None]:
# !conda install dask # install everything
#!conda install dask-core # install only core parts of dash
#!python -m pip install "dask[complete]"    # Install everything
#!python -m pip install dask                # Install only core parts of dask

# Recipe 4 - Read data from Parquet files

The following code reads the original data from NCEI NOAA in CSV then stores the data as Parquet files partitioned by year. The recipe focuses on reading the Parquet files and demonstrating how you can filter/query a specific partition.

In [134]:
import pandas as pd
from pathlib import Path
pd.__version__

'2.1.3'

Optional: Script to read the original dataset in CSV format and convert into a partitioned Parquet files by year

In [52]:
# file = 'https://www.ncei.noaa.gov/orders/cdo/3352259.csv'
# df = pd.read_csv(file)
# df['DT'] = pd.to_datetime(df['DATE'])
# df['year'] = df['DT'].dt.year
# df.to_parquet('../../datasets/Ch2/LA_weather.parquet', engine='pyarrow', partition_cols=['year'], compression='snappy')

Reading the Parquet files

In [137]:
file = Path('../../datasets/Ch2/LA_weather.parquet/')
df = pd.read_parquet(file,
                    engine='pyarrow')

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4899 entries, 0 to 4898
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          4899 non-null   object        
 1   NAME             4899 non-null   object        
 2   DATE             4899 non-null   object        
 3   PRCP             4899 non-null   float64       
 4   PRCP_ATTRIBUTES  4899 non-null   object        
 5   SNOW             121 non-null    float64       
 6   SNOW_ATTRIBUTES  121 non-null    object        
 7   SNWD             59 non-null     float64       
 8   SNWD_ATTRIBUTES  59 non-null     object        
 9   TAVG             3713 non-null   float64       
 10  TAVG_ATTRIBUTES  3713 non-null   object        
 11  TMAX             4899 non-null   int64         
 12  TMAX_ATTRIBUTES  4899 non-null   object        
 13  TMIN             4899 non-null   int64         
 14  TMIN_ATTRIBUTES  4899 non-null   object 

Reading a specific partition for year 2012

In [139]:
filters = [('year', '==', 2012)]
df_2012 = pd.read_parquet(file, 
                          engine='pyarrow', 
                          filters=filters)

Filtering to read files for years > 2020

In [140]:
df_2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          366 non-null    object        
 1   NAME             366 non-null    object        
 2   DATE             366 non-null    object        
 3   PRCP             366 non-null    float64       
 4   PRCP_ATTRIBUTES  366 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             0 non-null      float64       
 10  TAVG_ATTRIBUTES  0 non-null      object        
 11  TMAX             366 non-null    int64         
 12  TMAX_ATTRIBUTES  366 non-null    object        
 13  TMIN             366 non-null    int64         
 14  TMIN_ATTRIBUTES  366 non-null    object   

In [141]:
filters = [('year', '>', 2020)]

df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          881 non-null    object        
 1   NAME             881 non-null    object        
 2   DATE             881 non-null    object        
 3   PRCP             881 non-null    float64       
 4   PRCP_ATTRIBUTES  881 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             881 non-null    float64       
 10  TAVG_ATTRIBUTES  881 non-null    object        
 11  TMAX             881 non-null    int64         
 12  TMAX_ATTRIBUTES  881 non-null    object        
 13  TMIN             881 non-null    int64         
 14  TMIN_ATTRIBUTES  881 non-null    object   

In [142]:
filters = [('year', '>=', 2021)]
pd.read_parquet(file, 
                engine='pyarrow', 
                filters= filters).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          881 non-null    object        
 1   NAME             881 non-null    object        
 2   DATE             881 non-null    object        
 3   PRCP             881 non-null    float64       
 4   PRCP_ATTRIBUTES  881 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             881 non-null    float64       
 10  TAVG_ATTRIBUTES  881 non-null    object        
 11  TMAX             881 non-null    int64         
 12  TMAX_ATTRIBUTES  881 non-null    object        
 13  TMIN             881 non-null    int64         
 14  TMIN_ATTRIBUTES  881 non-null    object   

In [143]:
filters = [('year', 'in', [2021, 2022, 2023])]
df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          881 non-null    object        
 1   NAME             881 non-null    object        
 2   DATE             881 non-null    object        
 3   PRCP             881 non-null    float64       
 4   PRCP_ATTRIBUTES  881 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             881 non-null    float64       
 10  TAVG_ATTRIBUTES  881 non-null    object        
 11  TMAX             881 non-null    int64         
 12  TMAX_ATTRIBUTES  881 non-null    object        
 13  TMIN             881 non-null    int64         
 14  TMIN_ATTRIBUTES  881 non-null    object   

In [144]:
df.head()

Unnamed: 0,STATION,NAME,DATE,PRCP,PRCP_ATTRIBUTES,SNOW,SNOW_ATTRIBUTES,SNWD,SNWD_ATTRIBUTES,TAVG,TAVG_ATTRIBUTES,TMAX,TMAX_ATTRIBUTES,TMIN,TMIN_ATTRIBUTES,DT,year
0,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2021-01-01,0.0,",,W,2400",,,,,57.0,"H,,S",67,",,W",43,",,W",2021-01-01,2021
1,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2021-01-02,0.0,",,W,2400",,,,,54.0,"H,,S",63,",,W",46,",,W",2021-01-02,2021
2,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2021-01-03,0.0,",,W,2400",,,,,54.0,"H,,S",62,",,W",49,",,W",2021-01-03,2021
3,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2021-01-04,0.0,",,W,2400",,,,,54.0,"H,,S",59,",,W",48,",,W",2021-01-04,2021
4,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2021-01-05,0.0,",,W,2400",,,,,54.0,"H,,S",57,",,W",50,",,W",2021-01-05,2021


In [145]:
columns = ['DATE', 'year', 'TMAX']
df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters, 
                     columns=columns)

df.head()

Unnamed: 0,DATE,year,TMAX
0,2021-01-01,2021,67
1,2021-01-02,2021,63
2,2021-01-03,2021,62
3,2021-01-04,2021,59
4,2021-01-05,2021,57


In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   DATE    881 non-null    object  
 1   year    881 non-null    category
 2   TMAX    881 non-null    int64   
dtypes: category(1), int64(1), object(1)
memory usage: 15.2+ KB


## There is more

In [147]:
import pyarrow.parquet as pq
import pyarrow as pa
from pathlib import Path

file = Path('../../datasets/Ch2/LA_weather.parquet/')
table = pq.read_table(file, filters=filters, columns=columns)

In [148]:
import pyarrow as pa
isinstance(table, pa.Table)

True

In [150]:
df = table.to_pandas()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   DATE    881 non-null    object  
 1   year    881 non-null    category
 2   TMAX    881 non-null    int64   
dtypes: category(1), int64(1), object(1)
memory usage: 15.2+ KB


In [151]:
df.head()

Unnamed: 0,DATE,year,TMAX
0,2021-01-01,2021,67
1,2021-01-02,2021,63
2,2021-01-03,2021,62
3,2021-01-04,2021,59
4,2021-01-05,2021,57


In [152]:
table.column_names

['DATE', 'year', 'TMAX']

In [153]:
table.schema

DATE: string
year: dictionary<values=int32, indices=int32, ordered=0>
TMAX: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 2236

In [154]:
table.column_names

['DATE', 'year', 'TMAX']

In [155]:
type(table)

pyarrow.lib.Table

In [156]:
# table.schema.pandas_metadata

In [75]:
pq_dataset = pq.ParquetDataset(file, filters=filters)

In [76]:
pq_dataset

<pyarrow.parquet.core._ParquetDatasetV2 at 0x169d90b90>

In [77]:
type(pq_dataset)

pyarrow.parquet.core._ParquetDatasetV2

In [78]:
pq_dataset.files

['../../datasets/Ch2/LA_weather.parquet/year=2010/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2011/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2012/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2013/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2014/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2015/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2016/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2017/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2018/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2019/0e2f2d178c2247a8814dadb9998d32e8-0.parquet',
 '../../datasets/Ch2/LA_weather.parquet/year=2020/

In [79]:
pq_dataset.schema

STATION: string
NAME: string
DATE: string
PRCP: double
PRCP_ATTRIBUTES: string
SNOW: double
SNOW_ATTRIBUTES: string
SNWD: double
SNWD_ATTRIBUTES: string
TAVG: double
TAVG_ATTRIBUTES: string
TMAX: int64
TMAX_ATTRIBUTES: string
TMIN: int64
TMIN_ATTRIBUTES: string
DT: timestamp[us]
year: dictionary<values=int32, indices=int32, ordered=0>
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 2236

In [80]:
pq_dataset.schema.pandas_metadata

{'index_columns': [{'kind': 'range',
   'name': None,
   'start': 0,
   'stop': 4899,
   'step': 1}],
 'column_indexes': [{'name': None,
   'field_name': None,
   'pandas_type': 'unicode',
   'numpy_type': 'object',
   'metadata': {'encoding': 'UTF-8'}}],
 'columns': [{'name': 'STATION',
   'field_name': 'STATION',
   'pandas_type': 'unicode',
   'numpy_type': 'object',
   'metadata': None},
  {'name': 'NAME',
   'field_name': 'NAME',
   'pandas_type': 'unicode',
   'numpy_type': 'object',
   'metadata': None},
  {'name': 'DATE',
   'field_name': 'DATE',
   'pandas_type': 'unicode',
   'numpy_type': 'object',
   'metadata': None},
  {'name': 'PRCP',
   'field_name': 'PRCP',
   'pandas_type': 'float64',
   'numpy_type': 'float64',
   'metadata': None},
  {'name': 'PRCP_ATTRIBUTES',
   'field_name': 'PRCP_ATTRIBUTES',
   'pandas_type': 'unicode',
   'numpy_type': 'object',
   'metadata': None},
  {'name': 'SNOW',
   'field_name': 'SNOW',
   'pandas_type': 'float64',
   'numpy_type': 'flo

In [81]:
pq_dataset.schema.metadata

{b'pandas': b'{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 4899, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "STATION", "field_name": "STATION", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "NAME", "field_name": "NAME", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "DATE", "field_name": "DATE", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "PRCP", "field_name": "PRCP", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "PRCP_ATTRIBUTES", "field_name": "PRCP_ATTRIBUTES", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "SNOW", "field_name": "SNOW", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "SNOW_ATTRIBUTES", "field_name": "SNOW_ATTRIBUTES", "pan

In [82]:
data = pq_dataset.read()

In [83]:
type(data)

pyarrow.lib.Table

In [84]:
data

pyarrow.Table
STATION: string
NAME: string
DATE: string
PRCP: double
PRCP_ATTRIBUTES: string
SNOW: double
SNOW_ATTRIBUTES: string
SNWD: double
SNWD_ATTRIBUTES: string
TAVG: double
TAVG_ATTRIBUTES: string
TMAX: int64
TMAX_ATTRIBUTES: string
TMIN: int64
TMIN_ATTRIBUTES: string
DT: timestamp[us]
year: dictionary<values=int32, indices=int32, ordered=0>
----
STATION: [["USW00023174","USW00023174","USW00023174","USW00023174","USW00023174",...,"USW00023174","USW00023174","USW00023174","USW00023174","USW00023174"],["USW00023174","USW00023174","USW00023174","USW00023174","USW00023174",...,"USW00023174","USW00023174","USW00023174","USW00023174","USW00023174"],["USW00023174","USW00023174","USW00023174","USW00023174","USW00023174",...,"USW00023174","USW00023174","USW00023174","USW00023174","USW00023174"]]
NAME: [["LOS ANGELES INTERNATIONAL AIRPORT, CA US","LOS ANGELES INTERNATIONAL AIRPORT, CA US","LOS ANGELES INTERNATIONAL AIRPORT, CA US","LOS ANGELES INTERNATIONAL AIRPORT, CA US","LOS ANGELES IN

In [85]:
data.num_columns

17

In [86]:
data.num_rows

881

In [87]:
data.schema

STATION: string
NAME: string
DATE: string
PRCP: double
PRCP_ATTRIBUTES: string
SNOW: double
SNOW_ATTRIBUTES: string
SNWD: double
SNWD_ATTRIBUTES: string
TAVG: double
TAVG_ATTRIBUTES: string
TMAX: int64
TMAX_ATTRIBUTES: string
TMIN: int64
TMIN_ATTRIBUTES: string
DT: timestamp[us]
year: dictionary<values=int32, indices=int32, ordered=0>
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 2236

In [88]:
file = Path('../../datasets/Ch2/LA_weather.parquet/')
table = pq.read_table(file)

In [89]:
table.num_rows

4899

In [90]:
table.to_pandas()

Unnamed: 0,STATION,NAME,DATE,PRCP,PRCP_ATTRIBUTES,SNOW,SNOW_ATTRIBUTES,SNWD,SNWD_ATTRIBUTES,TAVG,TAVG_ATTRIBUTES,TMAX,TMAX_ATTRIBUTES,TMIN,TMIN_ATTRIBUTES,DT,year
0,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2010-01-01,0.0,",,0,2400",,,,,,,69,",,0",50,",,0",2010-01-01,2010
1,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2010-01-02,0.0,",,0,2400",,,,,,,76,",,0",55,",,0",2010-01-02,2010
2,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2010-01-03,0.0,",,0,2400",,,,,,,80,",,0",52,",,0",2010-01-03,2010
3,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2010-01-04,0.0,",,0,2400",,,,,,,75,",,0",53,",,0",2010-01-04,2010
4,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2010-01-05,0.0,",,0,2400",,,,,,,76,",,0",52,",,0",2010-01-05,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4894,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2023-05-27,0.0,",,W,2400",,,,,61.0,"H,,S",67,",,W",58,",,W",2023-05-27,2023
4895,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2023-05-28,0.0,"T,,W,2400",,,,,61.0,"H,,S",64,",,W",59,",,W",2023-05-28,2023
4896,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2023-05-29,0.0,",,W,2400",,,,,62.0,"H,,S",68,",,W",61,",,W",2023-05-29,2023
4897,USW00023174,"LOS ANGELES INTERNATIONAL AIRPORT, CA US",2023-05-30,0.0,",,W,2400",,,,,62.0,"H,,S",66,",,W",60,",,W",2023-05-30,2023


In [91]:
pa.Table.from_pandas(df)

pyarrow.Table
DATE: string
year: dictionary<values=int32, indices=int8, ordered=0>
TMAX: int64
----
DATE: [["2021-01-01","2021-01-02","2021-01-03","2021-01-04","2021-01-05",...,"2023-05-27","2023-05-28","2023-05-29","2023-05-30","2023-05-31"]]
year: [  -- dictionary:
[2010,2011,2012,2013,2014,...,2019,2020,2021,2022,2023]  -- indices:
[11,11,11,11,11,...,13,13,13,13,13]]
TMAX: [[67,63,62,59,57,...,67,64,68,66,68]]

### Passing a Schema

In [92]:
columns = ['DATE','year', 'TMAX']
filters = [('year', 'in', [2021, 2022, 2023])]

tb = pq.read_table(file,
                   filters=filters, 
                   columns=columns,
                   use_pandas_metadata=True)

df_pa = tb.to_pandas()

In [93]:
df_pa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   DATE    881 non-null    object  
 1   year    881 non-null    category
 2   TMAX    881 non-null    int64   
dtypes: category(1), int64(1), object(1)
memory usage: 15.2+ KB


In [94]:
df_pd = pd.read_parquet(file, 
                        filters=filters, 
                        columns=columns,
                        use_pandas_metadata=True)

df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   DATE    881 non-null    object  
 1   year    881 non-null    category
 2   TMAX    881 non-null    int64   
dtypes: category(1), int64(1), object(1)
memory usage: 15.2+ KB


In [95]:
schema = pa.schema([
    ('DATE', pa.string()),
    ('year', pa.int64()),
    ('TMAX', pa.int64())
])


# Recipe 5 - Working with Large Data Files

In [1]:
# Script to create one large data file 

# import pandas as pd
# import glob

# def run_once():
#     # Directory path where Parquet files are located
#     directory = '../../datasets/Ch2/yellow_tripdata_2023-*.parquet'
    
#     # Get a list of all Parquet files in the directory
#     parquet_files = glob.glob(directory)
    
#     # Read all Parquet files into a single DataFrame
#     dfs = []
#     for file in parquet_files:
#         df = pd.read_parquet(file)
#         dfs.append(df)
    
#     # Concatenate all DataFrames into a single DataFrame
#     combined_df = pd.concat(dfs)
#     # combined_df.to_parquet('../../datasets/Ch2/yellow_tripdata_2023.parquet', engine='pyarrow')
#     combined_df.to_csv('../../datasets/Ch2/yellow_tripdata_2023.csv', index=False)

# run_once()

### Using other librarys
* Polars
* Dask

In [1]:
import pandas as pd
import time
import psutil
import polars as pl
import dask.dataframe as dd
from pathlib import Path

In [2]:
print(f'polars version: {pl.__version__}')
import dask
print(f'dask version: {dask.__version__}')

polars version: 0.20.5
dask version: 2024.1.0


In [3]:
#import memory_profiler 
%load_ext memory_profiler

In [5]:
# Read in the CSV file
file_path = Path('../../datasets/Ch2/yellow_tripdata_2023.csv')

## Normal Read using pandas `read_csv`

In [55]:
%%time
%%memit 

df_pd = pd.read_csv(file_path, low_memory=False)

peak memory: 10851.33 MiB, increment: 2977.45 MiB
CPU times: user 21.9 s, sys: 4.01 s, total: 25.9 s
Wall time: 27.4 s


In [56]:
# clean up
del df_pd

## Using Chunksize parameter

In [39]:
%%time
%%memit 

df_pd = pd.read_csv(file_path, low_memory=False, chunksize=10000)

peak memory: 5662.83 MiB, increment: 2.06 MiB
CPU times: user 60.9 ms, sys: 47.2 ms, total: 108 ms
Wall time: 573 ms


In [32]:
type(df_pd)

pandas.io.parsers.readers.TextFileReader

In [40]:
df_pd.chunksize

10000

### Using Next()

In [59]:
%%time
%%memit 

df_pd = pd.read_csv(file_path, low_memory=False, chunksize=10000)
chunk = next(df_pd)
chunk.head()

peak memory: 6823.64 MiB, increment: 9.14 MiB
CPU times: user 72.3 ms, sys: 40.8 ms, total: 113 ms
Wall time: 581 ms


### Using .get_chunk() method

In [50]:
%%time
%%memit 

chunk = df_pd.get_chunk()
chunk.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
60000,1,2023-05-01 17:19:11,2023-05-01 17:26:25,1.0,0.6,1.0,N,162,230,1,7.2,5.0,0.5,2.75,0.0,1.0,16.45,2.5,0.0,
60001,1,2023-05-01 17:31:32,2023-05-01 17:37:04,1.0,0.7,1.0,N,100,164,1,6.5,5.0,0.5,2.6,0.0,1.0,15.6,2.5,0.0,
60002,1,2023-05-01 17:53:11,2023-05-01 18:05:09,1.0,1.3,1.0,N,79,231,1,10.7,5.0,0.5,2.0,0.0,1.0,19.2,2.5,0.0,
60003,2,2023-05-01 17:14:40,2023-05-01 17:26:24,1.0,2.16,1.0,N,161,107,1,12.8,2.5,0.5,4.0,0.0,1.0,23.3,2.5,0.0,
60004,2,2023-05-01 17:27:20,2023-05-01 17:38:22,1.0,1.95,1.0,N,107,229,1,13.5,2.5,0.5,3.0,0.0,1.0,23.0,2.5,0.0,


### Using a loop

In [61]:
%%time
%%memit 

df_pd = pd.read_csv(file_path, low_memory=False, chunksize=10000)

final_result = pd.DataFrame()
for chunk in df_pd:
    final_result = pd.concat([final_result, chunk])

peak memory: 11327.92 MiB, increment: 4507.19 MiB
CPU times: user 3min 28s, sys: 3min 46s, total: 7min 14s
Wall time: 7min 34s


In [62]:
# clean up
del df_pd

### Using pd.concat()

In [53]:
%%time
%%memit 


df_pd = pd.read_csv(file_path, low_memory=False, chunksize=10000)

final_result = pd.concat(df_pd)

peak memory: 9145.42 MiB, increment: 697.86 MiB
CPU times: user 14.9 s, sys: 2.81 s, total: 17.7 s
Wall time: 18.8 s


In [79]:
# clean up
del df_pd

## Using Dask

In [1]:
import dask.dataframe as dd
from pathlib import Path

file_path = Path('../../datasets/Ch2/yellow_tripdata_2023.csv')

In [4]:
%%time
%%memit 

df_dk = dd.read_csv(file_path)

peak memory: 153.22 MiB, increment: 3.38 MiB
CPU times: user 44.9 ms, sys: 12.1 ms, total: 57 ms
Wall time: 389 ms


In [8]:
print(df_dk)

Dask DataFrame Structure:
               VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount    extra  mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge Airport_fee airport_fee
npartitions=26                                                                                                                                                                                                                                                                                             
                  int64               string                string         float64       float64    float64             string        int64        int64        int64     float64  float64  float64    float64      float64               float64      float64              float64     float64     float64
                    ...                  ...                   ...        

In [17]:
%%time
%%memit 

print(df_dk.shape[0].compute())

16186386
peak memory: 6346.53 MiB, increment: 1818.44 MiB
CPU times: user 19.3 s, sys: 5.29 s, total: 24.6 s
Wall time: 10.5 s


In [6]:
type(df_dk)

dask.dataframe.core.DataFrame

In [13]:
df_dk.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 20 entries, VendorID to airport_fee
dtypes: float64(13), int64(4), string(3)

In [14]:
df_dk.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
0,1,2023-05-01 00:33:13,2023-05-01 00:53:01,0.0,7.8,1.0,N,138,43,1,33.8,7.75,0.5,8.6,0.0,1.0,51.65,0.0,1.75,
1,1,2023-05-01 00:42:49,2023-05-01 01:11:18,2.0,8.1,1.0,N,138,262,1,35.9,10.25,0.5,9.5,0.0,1.0,57.15,2.5,1.75,
2,1,2023-05-01 00:56:34,2023-05-01 01:13:39,2.0,9.1,1.0,N,138,141,1,35.2,10.25,0.5,10.7,6.55,1.0,64.2,2.5,1.75,
3,2,2023-05-01 00:00:52,2023-05-01 00:20:12,1.0,8.21,1.0,N,138,140,1,33.1,6.0,0.5,2.24,0.0,1.0,47.09,2.5,1.75,
4,1,2023-05-01 00:05:50,2023-05-01 00:19:41,0.0,7.9,1.0,N,138,263,1,31.0,10.25,0.5,9.85,6.55,1.0,59.15,2.5,1.75,


In [18]:
df_pd = df_dk.compute()
type(df_pd)

pandas.core.frame.DataFrame

In [15]:
del df_dk

## Using Polars

In [66]:
import polars as pl
from pathlib import Path

file_path = Path('../../datasets/Ch2/yellow_tripdata_2023.csv')


In [67]:
%%time
%%memit

df_pl = pl.read_csv(file_path)

peak memory: 8633.58 MiB, increment: 2505.14 MiB
CPU times: user 4.85 s, sys: 3.28 s, total: 8.13 s
Wall time: 2.81 s


In [74]:
df_pl.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
i64,str,str,f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,"""2023-05-01 00:…","""2023-05-01 00:…",0.0,7.8,1.0,"""N""",138,43,1,33.8,7.75,0.5,8.6,0.0,1.0,51.65,0.0,1.75,
1,"""2023-05-01 00:…","""2023-05-01 01:…",2.0,8.1,1.0,"""N""",138,262,1,35.9,10.25,0.5,9.5,0.0,1.0,57.15,2.5,1.75,
1,"""2023-05-01 00:…","""2023-05-01 01:…",2.0,9.1,1.0,"""N""",138,141,1,35.2,10.25,0.5,10.7,6.55,1.0,64.2,2.5,1.75,
2,"""2023-05-01 00:…","""2023-05-01 00:…",1.0,8.21,1.0,"""N""",138,140,1,33.1,6.0,0.5,2.24,0.0,1.0,47.09,2.5,1.75,
1,"""2023-05-01 00:…","""2023-05-01 00:…",0.0,7.9,1.0,"""N""",138,263,1,31.0,10.25,0.5,9.85,6.55,1.0,59.15,2.5,1.75,


In [75]:
type(df_pl)

polars.dataframe.frame.DataFrame

In [76]:
df_pl.shape

(16186386, 20)

In [78]:
df_pd = df_pl.to_pandas()