# Chapter 2: Reading and Writing Files

## Reading and writing CSV files

### Getting ready

In [1]:
import polars as pl

### How to do it...

In [28]:
df = pl.read_csv('../data/customer_shopping_data.csv')
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [24]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv')
df.head()

I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,5/8/2022,Kanyon
str,str,str,i64,str,i64,f64,str,str,str
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""
"""I227836""","""C657758""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""24/05/2022""","""Forum Istanbul…"


In [25]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', has_header=False)
df.head()

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [30]:
column_names = ['invoice_no', 'customer_id', 'gender', 'age', 'category', 'quantity', 'price', 'payment_method', 'invoice_date', 'shopping_mall']
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names)
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""


In [31]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True)
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,date,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""",2022-08-05,"""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""",2021-12-12,"""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""",2021-11-09,"""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""",2021-05-16,"""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""",2021-10-24,"""Kanyon"""


In [35]:
df = pl.read_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True, 
                 dtypes={'age': pl.Int8, 'quantity': pl.Int32})
df.head()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i8,str,i32,f64,str,date,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""",2022-08-05,"""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""",2021-12-12,"""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""",2021-11-09,"""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""",2021-05-16,"""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""",2021-10-24,"""Kanyon"""


In [43]:
df.write_csv('../data/output/shopping_data_output.csv', 
             has_header=False, 
             separator=',')

### There is more...

In [45]:
lf = pl.scan_csv('../data/customer_shopping_data_no_header.csv', 
                 has_header=False, 
                 new_columns=column_names, 
                 try_parse_dates=True, 
                 dtypes={'age': pl.Int8, 'quantity': pl.Int32})
lf.fetch(5)

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i8,str,i32,f64,str,date,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""",2022-08-05,"""Kanyon"""
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""",2021-12-12,"""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""",2021-11-09,"""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""",2021-05-16,"""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""",2021-10-24,"""Kanyon"""


In [47]:
lf.sink_csv('../data/output/shopping_data_output_sink.csv')

## Reading and writing parquet files

## Reading and writing JSON files

### Getting ready

### How to do it...

In [122]:
df = pl.read_json('../data/world_population.json')
df.select(df.columns[:10]).head()

place,pop1980,pop2000,pop2010,pop2022,pop2023,pop2030,pop2050,country,area
i64,f64,f64,f64,f64,i64,f64,f64,str,i64
356,696828385.0,1059600000.0,1240600000.0,1417200000.0,1428627663,1515000000.0,1670500000.0,"""India""",3287590
156,982372466.0,1264100000.0,1348200000.0,1425900000.0,1425671352,1415600000.0,1312600000.0,"""China""",9706961
840,223140018.0,282398554.0,311182845.0,338289857.0,339996563,352162301.0,375391963.0,"""United States""",9372610
360,148177096.0,214072421.0,244016173.0,275501339.0,277534122,292150100.0,317225213.0,"""Indonesia""",1904569
586,80624057.0,154369924.0,194454498.0,235824862.0,240485658,274029836.0,367808468.0,"""Pakistan""",881912


In [102]:
df.write_json('../data/output/world_population_output.json')

In [121]:
df = pl.read_ndjson('../data/world_population.jsonl')
df.select(df.columns[:10]).head()

place,pop1980,pop2000,pop2010,pop2022,pop2023,pop2030,pop2050,country,area
i64,f32,f64,f64,f64,i64,f64,f64,str,i64
356,696828416.0,1059600000.0,1240600000.0,1417200000.0,1428627663,1515000000.0,1670500000.0,"""India""",3287590
156,982372480.0,1264100000.0,1348200000.0,1425900000.0,1425671352,1415600000.0,1312600000.0,"""China""",9706961
840,223140016.0,282398554.0,311182845.0,338289857.0,339996563,352162301.0,375391963.0,"""United States""",9372610
360,148177088.0,214072421.0,244016173.0,275501339.0,277534122,292150100.0,317225213.0,"""Indonesia""",1904569
586,80624056.0,154369924.0,194454498.0,235824862.0,240485658,274029836.0,367808468.0,"""Pakistan""",881912


In [104]:
df.write_ndjson('../data/output/world_population_output.jsonl')

### There is more...

In [119]:
lf = pl.scan_ndjson('../data/world_population.jsonl')
lf.select(lf.columns[:10]).collect().head()

place,pop1980,pop2000,pop2010,pop2022,pop2023,pop2030,pop2050,country,area
i64,f64,f64,f64,f64,i64,f64,f64,str,i64
356,696828385.0,1059600000.0,1240600000.0,1417200000.0,1428627663,1515000000.0,1670500000.0,"""India""",3287590
156,982372466.0,1264100000.0,1348200000.0,1425900000.0,1425671352,1415600000.0,1312600000.0,"""China""",9706961
840,223140018.0,282398554.0,311182845.0,338289857.0,339996563,352162301.0,375391963.0,"""United States""",9372610
360,148177096.0,214072421.0,244016173.0,275501339.0,277534122,292150100.0,317225213.0,"""Indonesia""",1904569
586,80624057.0,154369924.0,194454498.0,235824862.0,240485658,274029836.0,367808468.0,"""Pakistan""",881912


## Reading and writing CSV files

## Reading and writing CSV files

## Reading and writing CSV files

## Reading and writing CSV files