# Chapter 2: Reading and Writing Files

## Reading and writing CSV files

### Getting ready

In [None]:
import polars as pl

### How to do it...

In [None]:
df = pl.read_csv('../data/customer_shopping_data.csv')
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv')
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', has_header=False)
df.head()

In [None]:
column_names = ['invoice_no', 'customer_id', 'gender', 'age', 'category', 'quantity', 'price', 'payment_method', 'invoice_date', 'shopping_mall']
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names)
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True)
df.head()

In [None]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True, 
                 dtypes={'age': pl.Int8, 'quantity': pl.Int32})
df.head()

In [None]:
df.write_csv('../data/output/shopping_data_output.csv', 
             has_header=False, 
             separator=',')

### There is more...

In [None]:
lf = pl.scan_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True, 
                 dtypes={'age': pl.Int8, 'quantity': pl.Int32})
lf.fetch(5)

In [None]:
lf.sink_csv('../data/output/shopping_data_output_sink.csv')

## Reading and writing parquet files

### How to do it...

In [None]:
parquet_input_file_path = '../data/venture_funding_deals.parquet'
df = pl.read_parquet(parquet_input_file_path, 
                     columns=['Company', 'Amount', 'Valuation', 'Industry'], 
                     row_count_name='row_cnt')
df.head()

In [None]:
pl.read_parquet_schema(parquet_input_file_path)

In [None]:
parquet_output_file_path = '../data/output/venture_funding_deals_output.parquet'
df.write_parquet(parquet_output_file_path, compression='zstd', compression_level=10)

In [None]:
lf = pl.scan_parquet(parquet_input_file_path)
lf.collect().head()

In [None]:
lf.sink_parquet(parquet_output_file_path, maintain_order=False)

### There is more...

In [None]:
partitioned_parquet_input_file_path = '../data/venture_funding_deals_partitioned'
df = pl.read_parquet(
    partitioned_parquet_input_file_path, 
    use_pyarrow=True, 
    pyarrow_options={'partitioning': 'hive'}
)
df.head()

In [None]:
partitioned_parquet_output_file_path = '../data/output/venture_funding_deals_partitioned_output'
df.write_parquet(
    partitioned_parquet_output_file_path, 
    use_pyarrow=True, 
    pyarrow_options={
        'partition_cols': ['Industry'],
        'existing_data_behavior': 'overwrite_or_ignore'
        }
)

## Reading and writing Delta Lake tables

### How to do it...

In [None]:
delta_input_file_path = '../data/venture_funding_deals_delta'
df = pl.read_delta(delta_input_file_path)
df.head()

In [None]:
lf = pl.scan_delta(delta_input_file_path)
lf.collect().head()

In [None]:
df.write_delta('../data/output/venture_funding_deals_delta_output', mode='overwrite')

In [None]:
delta_partitioned_output_file_path = '../data/output/venture_funding_deals_delta_partitioned_output'
delta_write_options = {'partition_by': 'Industry'}
df.write_delta(
    delta_partitioned_output_file_path, 
    mode='overwrite', 
    delta_write_options=delta_write_options
)

In [None]:
df = pl.read_delta(delta_partitioned_output_file_path)
df.head()

In [None]:
df = pl.read_delta(
    delta_partitioned_output_file_path, 
    pyarrow_options={'partitions': [('Industry', '=', 'Accounting')]}
)
df.head()

### There is more...

In [None]:
from config import aws_access_key_id, aws_secret_access_key

In [None]:
table_path = 's3://sandbox-data-lake/letters_delta'
storage_options= {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'aws_region': 'us-west-1'
}

table_path = 's3://YOUR_S3BUCKET_URI/YOUR_DELTA_TABLE'
storage_options= {
    'aws_access_key_id': 'YOUR_ACCESS_KEY',
    'aws_secret_access_key': 'YOUR_SECRET_ACCESS_KEY',
    'aws_region': 'YOUR_REGION'
}

df = pl.read_delta(table_path, storage_options=storage_options)  
df.head()

## Reading and writing JSON files

### Getting ready

### How to do it...

In [None]:
df = pl.read_json('../data/world_population.json')
df.select(df.columns[:10]).head()

In [None]:
df.write_json('../data/output/world_population_output.json')

In [None]:
df = pl.read_ndjson('../data/world_population.jsonl')
df.select(df.columns[:10]).head()

In [None]:
df.write_ndjson('../data/output/world_population_output.jsonl')

### There is more...

In [None]:
lf = pl.scan_ndjson('../data/world_population.jsonl')
lf.select(lf.columns[:10]).collect().head()

## Reading and writing Excel files

### Getting ready

In [None]:
import polars as pl

### How to do it...

In [None]:
input_file_path = '../data/financial_sample.xlsx'
df = pl.read_excel(
    input_file_path, 
    sheet_name='Sheet1',
    read_csv_options={'has_header': True, 'try_parse_dates': True}
)
df.head()

In [None]:
output_file_path = '../data/output/financial_sample_output.xlsx'
df.write_excel(
    output_file_path,
    worksheet='Output Sheet1',
    header_format={'bold': True}
)

## Reading and writing other file formats

### How to do it...

In [1]:
import polars as pl

In [2]:
csv_input_file_path = '../data/customer_shopping_data.csv'
ipc_file_path = '../data/customer_shopping_data.arrow'
df = pl.read_csv(csv_input_file_path)
df.write_ipc(ipc_file_path)

In [3]:
df = pl.read_ipc(ipc_file_path)
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [4]:
avro_file_path = '../data/customer_shopping_data.avro'
df = pl.read_csv(csv_input_file_path)
df.write_avro(avro_file_path)

In [5]:
df = pl.read_avro(avro_file_path)
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [6]:
iceberg_input_file_path = '../data/iceberg_table/metadata/00001-41687cbb-3a0c-4ef3-b3fa-e7026ed2eb77.metadata.json'
lf = pl.scan_iceberg(iceberg_input_file_path)
lf.collect().head()

ComputeError: ImportError: dlopen(/Users/Yuki/Desktop/Polars-Cookbook/.venv/lib/python3.11/site-packages/pyiceberg/avro/decoder_fast.cpython-311-darwin.so, 0x0002): tried: '/Users/Yuki/Desktop/Polars-Cookbook/.venv/lib/python3.11/site-packages/pyiceberg/avro/decoder_fast.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64')), '/System/Volumes/Preboot/Cryptexes/OS/Users/Yuki/Desktop/Polars-Cookbook/.venv/lib/python3.11/site-packages/pyiceberg/avro/decoder_fast.cpython-311-darwin.so' (no such file), '/Users/Yuki/Desktop/Polars-Cookbook/.venv/lib/python3.11/site-packages/pyiceberg/avro/decoder_fast.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))

In [None]:
iceberg_input_file_path = 's3://sandbox-data-lake/iceberg-folder/metadata/00001-41687cbb-3a0c-4ef3-b3fa-e7026ed2eb77.metadata.json'
storage_options= {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'aws_region': 'us-west-1'
}

lf = pl.scan_iceberg(iceberg_input_file_path, storage_options=storage_options)  
lf.collect().head()

### There is more...

In [None]:
lf = pl.scan_ipc(ipc_file_path)
lf.collect().head()

In [None]:
lf.collect().lazy().sink_ipc('../data/output/customer_shopping_data.arrow')

## Reading and writing multiple files

### How to do it...

In [None]:
data = {'Letter': ['A','B','C'], 'Value': [1,2,3]}
df = pl.DataFrame(data)

In [None]:
dfs = df.group_by('Letter')
print(dfs)

In [None]:
for name, df in dfs:
    df.write_csv(f'../data/output/letter_{name}.csv')

In [None]:
df = pl.read_csv('../data/output/letter_*.csv')
df.head()

In [None]:
lf = pl.scan_csv('../data/output/letter_*.csv')
lf.collect().head()

### There is more...

In [None]:
import glob
lfs = [pl.scan_csv(file) for file in glob.glob('../data/output/letter_*.csv')]
dfs = pl.collect_all(lfs)
display(dfs)

## Working with databases

### Getting ready

### How to do it...

In [7]:
import polars as pl

In [None]:
from config import postgres_pass, postgres_user

In [8]:
# connectorx is required
uri = f'postgres://{postgres_user}:{postgres_pass}@localhost:5432/postgres' 
query = 'SELECT * FROM sandbox.cars'
# uri = 'postgres://username:password@server:port/database'
# query = 'SELECT * FROM schema.table'
df = pl.read_database_uri(query, uri)
df.head()

brand,model,year
str,str,i32
"""Volvo""","""p1800""",1968
"""BMW""","""M1""",1978
"""Toyota""","""Celica""",1975


In [9]:
# pip install adbc-driver-postgresql pyarrow
df = pl.read_database_uri(query, uri, engine='adbc')
df.head()

brand,model,year
str,str,i32
"""Volvo""","""p1800""",1968
"""BMW""","""M1""",1978
"""Toyota""","""Celica""",1975


In [15]:
# pip install sqlalchemy psycopg2
from sqlalchemy import create_engine

uri = f'postgres://{postgres_user}:{postgres_pass}@localhost:5432/postgres' 
engine = create_engine(uri)
conn = engine.connect()
df = pl.read_database(query, connection=conn)
df.head()

NoSuchModuleError: Can't load plugin: sqlalchemy.dialects:postgres

In [14]:
# pip install pandas sqlalchemy 
# ( 
#     df
#     .with_columns(pl.lit(100))
#     .write_database(
#         table_name='cars_3', 
#         connection=uri, 
#         if_exists='replace', 
#         engine='sqlalchemy'
#     )
# )

uri = f'postgres://{postgres_user}:{postgres_pass}@localhost:5432/postgres' 
df = pl.DataFrame({"foo": [1, 2, 3]})
df.write_database(table_name="sandbox.records", connection=uri, engine="adbc", if_exists='append')

In [None]:
database_dataset_table_str = 'sandbox-366819.sandbox.customers'
authentication_file_path = 'sandbox-366819_keys.json'     
uri = 'bigquery://' + authentication_file_path
query = f'SELECT * FROM `{database_dataset_table_str}`' 

df = pl.read_database_uri(query, uri)
df.head()

In [None]:
database_dataset_table_str = 'sandbox-366819.sandbox.customers'
authentication_file_path = 'sandbox-366819_keys.json'     
uri = 'bigquery://' + authentication_file_path
query = f'SELECT * FROM `{database_dataset_table_str}`' 

df = pl.read_database_uri(query, uri, engine='adbc')
df.head()

In [None]:
from config import sql_server_user, sql_server_pass

In [20]:
import polars as pl
uri = f'mssql://{sql_server_user}:{sql_server_pass}@localhost:1433/Sandbox' 
# conn = 'mssql://username:password@server:port/database?encrypt=true&trusted_connection=true'         # connection token
query = 'SELECT * FROM dbo.cars'
# uri = 'postgres://username:password@server:port/database'
# query = 'SELECT * FROM schema.table'
df = pl.read_database_uri(query, uri)
df.head()

[2023-10-22T21:47:13Z ERROR tiberius::tds::stream::token] Login failed for user 'sa'. code=18456
[2023-10-22T21:47:13Z ERROR tiberius::tds::stream::token] Login failed for user 'sa'. code=18456
[2023-10-22T21:47:14Z ERROR tiberius::tds::stream::token] Login failed for user 'sa'. code=18456
[2023-10-22T21:47:15Z ERROR tiberius::tds::stream::token] Login failed for user 'sa'. code=18456
[2023-10-22T21:47:19Z ERROR tiberius::tds::stream::token] Login failed for user 'sa'. code=18456
[2023-10-22T21:47:25Z ERROR tiberius::tds::stream::token] Login failed for user 'sa'. code=18456
[2023-10-22T21:47:38Z ERROR tiberius::tds::stream::token] Login failed for user 'sa'. code=18456


RuntimeError: Timed out in bb8

### There is more...

In [None]:
# connectorx is required
uri = f'postgres://{postgres_user}:{postgres_pass}@localhost:5432/postgres'
query = 'SELECT * FROM sandbox.cars'
# uri = 'postgres://username:password@server:port/database'
# query = 'SELECT * FROM schema.table'

df = pl.read_database_uri(
    query, 
    uri, 
    partition_on='year', 
    partition_num=3)
df.head()