# Working With Common Cloud Sources

## Amazon S3

### How to do it...

In [1]:
import polars as pl

In [2]:
s3_file_path = 's3://polars-cookbook-demo-yk/titanic_dataset.csv'
df = pl.read_csv(s3_file_path)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [3]:
import s3fs 

fs = s3fs.S3FileSystem()
s3_parquet_file_path = 's3://polars-cookbook-demo-yk/titanic_dataset.parquet'

with fs.open(s3_parquet_file_path, mode='wb') as f:
    df.write_parquet(f)

In [4]:
storage_options= {
    'aws_access_key_id': 'YOUR_ACCESS_KEY_ID',
    'aws_secret_access_key': 'YOUR_SECRET_ACCESS_KEY',
    'aws_region': 'us-east-1'
}

lf = pl.scan_parquet(s3_parquet_file_path, storage_options=storage_options)
lf.fetch(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [5]:
import pyarrow.dataset as ds

dataset = ds.dataset(s3_parquet_file_path, format='parquet')
df = (
    pl.scan_pyarrow_dataset(dataset)
    .filter(pl.col('Age') <= 30)
    .collect()
)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


## Azure Blog Storage

### How to do it...

In [7]:
import polars as pl

In [22]:
storage_options={
    'account_name': 'YOUR_ACCOUNT_NAME', 
    'access_key': 'YOUR_ACCOUNT_KEY',
    'client_id': 'YOUR_CLIENT_ID',
    'client_secret': 'YOUR_CLIENT_SECRET',
    'tenant_id': 'YOUR_TENANT_ID'
}

blob_csv_file_path = 'az://demo/titanic_dataset.csv'
df = pl.read_csv(blob_csv_file_path, storage_options=storage_options) 
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [16]:
import adlfs 

fs = adlfs.AzureBlobFileSystem(
    account_name='YOUR_ACCOUNT_NAME', 
    account_key='YOUR_ACCOUNT_KEY'
)

blob_parquet_file_path = 'az://demo/titanic_dataset.parquet'

with fs.open(blob_parquet_file_path, mode='wb') as f:
    df.write_parquet(f)

In [25]:
lf = pl.scan_parquet(blob_parquet_file_path, storage_options=storage_options)
lf.head().collect()

ComputeError: unknown configuration key: encoding

In [21]:
import pyarrow.dataset as ds

dataset = ds.dataset(
    blob_parquet_file_path, 
    filesystem=fs,
    format='parquet')

df = (
    pl.scan_pyarrow_dataset(dataset)
    .filter(pl.col('Age') <= 30)
    .collect()
)

df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


In [24]:
import os

os.environ['AZURE_STORAGE_ACCOUNT_NAME'] = 'YOUR_ACCOUNT_NAME'
os.environ['AZURE_STORAGE_ACCOUNT_KEY'] = 'YOUR_ACCOUNT_KEY'

(
    pl.read_csv(blob_csv_file_path, storage_options=storage_options)
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
