# Chapter 2: Reading and Writing Files

## Reading and writing CSV files

### Getting ready

In [None]:
import polars as pl

### How to do it...

In [None]:
df = pl.read_csv('../data/customer_shopping_data.csv')
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv')
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', has_header=False)
df.head()

In [None]:
column_names = ['invoice_no', 'customer_id', 'gender', 'age', 'category', 'quantity', 'price', 'payment_method', 'invoice_date', 'shopping_mall']
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names)
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True)
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True, 
                 dtypes={'age': pl.Int8, 'quantity': pl.Int32})
df.head()

In [None]:
df.write_csv('../data/output/shopping_data_output.csv', 
             has_header=False, 
             separator=',')

### There is more...

In [None]:
lf = pl.scan_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True, 
                 dtypes={'age': pl.Int8, 'quantity': pl.Int32})
lf.fetch(5)

In [None]:
lf.sink_csv('../data/output/shopping_data_output_sink.csv')

## Reading and writing parquet files

### How to do it...

In [19]:
parquet_input_file_path = '../data/largest_us_venture_funding_deals_2023.parquet'
df = pl.read_parquet(parquet_input_file_path, 
                     columns=['Company', 'Amount', 'Valuation', 'Industry'], 
                     row_count_name='row_cnt')
df.head()

row_cnt,Company,Amount,Valuation,Industry
u32,str,str,str,str
0,"""OpenAI""","""$10,000,000,00…","""n/a""","""Artificial int…"
1,"""Stripe""","""$6,500,000,000…","""$50,000,000,00…","""Fintech"""
2,"""Inflection AI""","""$1,300,000,000…","""$4,000,000,000…","""Artificial int…"
3,"""Anthropic""","""$1,250,000,000…","""$4,000,000,000…","""Artificial int…"
4,"""Generate Capit…","""$1,030,900,000…","""n/a""","""Energy"""


In [20]:
pl.read_parquet_schema(parquet_input_file_path)

{'Company': Utf8,
 'Amount': Utf8,
 'Lead investors': Utf8,
 'Valuation': Utf8,
 'Industry': Utf8,
 'Date reported': Utf8}

In [21]:
parquet_output_file_path = '../data/output/largest_us_venture_funding_deals_2023_output.parquet'
df.write_parquet(parquet_output_file_path, compression='zstd', compression_level=10)

In [22]:
lf = pl.scan_parquet(parquet_input_file_path)
lf.collect().head()

Company,Amount,Lead investors,Valuation,Industry,Date reported
str,str,str,str,str,str
"""OpenAI""","""$10,000,000,00…","""Microsoft""","""n/a""","""Artificial int…","""1/23/23"""
"""Stripe""","""$6,500,000,000…","""n/a""","""$50,000,000,00…","""Fintech""","""3/15/23"""
"""Inflection AI""","""$1,300,000,000…","""Microsoft, Rei…","""$4,000,000,000…","""Artificial int…","""6/29/23"""
"""Anthropic""","""$1,250,000,000…","""Amazon""","""$4,000,000,000…","""Artificial int…","""9/25/23"""
"""Generate Capit…","""$1,030,900,000…","""n/a""","""n/a""","""Energy""","""1/6/23"""


In [23]:
lf.sink_parquet(parquet_output_file_path, maintain_order=False)

### There is more...

In [None]:
partitioned_parquet_input_file_path = '../data/largest_us_venture_funding_deals_2023_partitioned'
df = pl.read_parquet(
    partitioned_parquet_input_file_path, 
    use_pyarrow=True, 
    pyarrow_options={'partitioning': 'hive'}
)
df.head()

In [None]:
partitioned_parquet_output_file_path = '../data/output/largest_us_venture_funding_deals_2023_partitioned_output'
df.write_parquet(
    partitioned_parquet_output_file_path, 
    use_pyarrow=True, 
    pyarrow_options={
        'partition_cols': ['Industry'],
        'existing_data_behavior': 'overwrite_or_ignore'
        }
)

## Reading and writing Delta Lake tables

### Getting ready

In [None]:
import polars as pl

### How to do it...

In [None]:
delta_input_file_path = '../data/largest_us_venture_funding_deals_2023_delta'
df = pl.read_delta(delta_input_file_path)
df.head()

In [None]:
delta_output_file_path = '../data/output/largest_us_venture_funding_deals_2023_delta_output'
df.write_delta(delta_output_file_path)

In [None]:
delta_partitioned_output_file_path = '../data/output/largest_us_venture_funding_deals_2023_delta_partitioned_output'
delta_write_options = {'partition_by': 'Industry'}
df.write_delta(
    delta_partitioned_output_file_path, 
    mode='overwrite', 
    delta_write_options=delta_write_options
)

In [None]:
df = pl.read_delta(
    delta_partitioned_output_file_path, 
    pyarrow_options={'partitions': [('Industry', '=', 'Accounting')]}
)
df.head()

## Reading and writing JSON files

### Getting ready

### How to do it...

In [None]:
df = pl.read_json('../data/world_population.json')
df.select(df.columns[:10]).head()

In [None]:
df.write_json('../data/output/world_population_output.json')

In [None]:
df = pl.read_ndjson('../data/world_population.jsonl')
df.select(df.columns[:10]).head()

In [None]:
df.write_ndjson('../data/output/world_population_output.jsonl')

### There is more...

In [None]:
lf = pl.scan_ndjson('../data/world_population.jsonl')
lf.select(lf.columns[:10]).collect().head()

## Reading and writing Excel files

### Getting ready

In [1]:
import polars as pl

### How to do it...

In [7]:
input_file_path = '../data/financial_sample.xlsx'
df = pl.read_excel(
    input_file_path, 
    sheet_name='Sheet1',
    read_csv_options={'has_header': True, 'try_parse_dates': True}
)
df.head()

Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Month Name,Year
str,str,str,str,f64,i64,i64,f64,f64,f64,f64,f64,date,i64,str,i64
"""Government""","""Canada""","""Carretera""","""None""",1618.5,3,20,32370.0,0.0,32370.0,16185.0,16185.0,0014-01-01,1,"""January""",2014
"""Government""","""Germany""","""Carretera""","""None""",1321.0,3,20,26420.0,0.0,26420.0,13210.0,13210.0,0014-01-01,1,"""January""",2014
"""Midmarket""","""France""","""Carretera""","""None""",2178.0,3,15,32670.0,0.0,32670.0,21780.0,10890.0,0014-01-06,6,"""June""",2014
"""Midmarket""","""Germany""","""Carretera""","""None""",888.0,3,15,13320.0,0.0,13320.0,8880.0,4440.0,0014-01-06,6,"""June""",2014
"""Midmarket""","""Mexico""","""Carretera""","""None""",2470.0,3,15,37050.0,0.0,37050.0,24700.0,12350.0,0014-01-06,6,"""June""",2014


In [8]:
output_file_path = '../data/output/financial_sample_output.xlsx'
df.write_excel(
    output_file_path,
    worksheet='Output Sheet1',
    header_format={'bold': True}
)

<xlsxwriter.workbook.Workbook at 0x1070696d0>

## Reading and writing other file formats

### Getting ready

In [2]:
import polars as pl

### How to do it...

In [5]:
csv_input_file_path = '../data/customer_shopping_data.csv'
ipc_file_path = '../data/customer_shopping_data.arrow'
df = pl.read_csv(csv_input_file_path)
df.write_ipc(ipc_file_path)

In [6]:
df = pl.read_ipc(ipc_file_path)
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [12]:
avro_file_path = '../data/customer_shopping_data.avro'
df = pl.read_csv(csv_input_file_path)
df.write_avro(avro_file_path)

In [13]:
df = pl.read_avro(avro_file_path)
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [3]:
iceberg_input_file_path = '../data/lineitem_iceberg'
lf = pl.scan_iceberg(iceberg_input_file_path)
lf.collect().head()

AttributeError: module 'polars' has no attribute 'scan_iceberg'

### There is more...

In [7]:
lf = pl.scan_ipc(ipc_file_path)
lf.collect().head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [24]:
ipc_output_file_path = '../data/output/customer_shopping_data.arrow'
lf.sink_parquet(ipc_output_file_path, maintain_order=False)

In [26]:
df = pl.read_ipc(ipc_file_path)
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


## Reading and writing multiple files

### Getting ready

### How to do it...

In [56]:
data = {'Letter': ['A','B','C'], 'Value': [1,2,3]}
df = pl.DataFrame(data)

In [75]:
dfs = df.group_by('Letter')
print(dfs)

<polars.dataframe.group_by.GroupBy object at 0x153ffef90>


In [58]:
for name, df in dfs:
    df.write_csv(f'../data/output/letter_{name}.csv')

In [59]:
df = pl.read_csv('../data/output/letter_*.csv')
df.head()

Letter,Value
str,i64
"""A""",1
"""B""",2
"""C""",3


In [60]:
lf = pl.scan_csv('../data/output/letter_*.csv')
lf.collect().head()

Letter,Value
str,i64
"""A""",1
"""B""",2
"""C""",3


### There is more...

In [84]:
import glob
lfs = [pl.scan_csv(file) for file in glob.glob('../data/output/letter_*.csv')]
dfs = pl.collect_all(lfs)
display(dfs)

[shape: (1, 2)
 ┌────────┬───────┐
 │ Letter ┆ Value │
 │ ---    ┆ ---   │
 │ str    ┆ i64   │
 ╞════════╪═══════╡
 │ B      ┆ 2     │
 └────────┴───────┘,
 shape: (1, 2)
 ┌────────┬───────┐
 │ Letter ┆ Value │
 │ ---    ┆ ---   │
 │ str    ┆ i64   │
 ╞════════╪═══════╡
 │ C      ┆ 3     │
 └────────┴───────┘,
 shape: (1, 2)
 ┌────────┬───────┐
 │ Letter ┆ Value │
 │ ---    ┆ ---   │
 │ str    ┆ i64   │
 ╞════════╪═══════╡
 │ A      ┆ 1     │
 └────────┴───────┘]

## Working with databases

### Getting ready

### How to do it...

In [None]:
uri = 'postgres://postgres:postgres@127.0.0.1:5432/postgres'
query = 'SELECT * FROM orders'

pl.read_database_uri(query=query, uri=uri)

[2023-10-20T12:33:56Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:33:56Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:33:57Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:33:59Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:34:02Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:34:08Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:34:21Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"


RuntimeError: timed out waiting for connection: db error: FATAL: password authentication failed for user "postgres"

In [None]:
uri = 'postgres://postgres:5150@127.0.0.1:5432/postgres'
query = 'SELECT * FROM orders'

pl.read_database_uri(query=query, uri=uri)

[2023-10-20T12:36:49Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:36:49Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:36:50Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:36:52Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:36:55Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:02Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:14Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"


[2023-10-20T12:37:19Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:19Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:20Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:22Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:25Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:32Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"
[2023-10-20T12:37:44Z ERROR r2d2] db error: FATAL: password authentication failed for user "postgres"


KeyboardInterrupt: 