# Chapter 11: Working With Common Cloud Sources

## Amazon S3

### How to do it...

In [1]:
import polars as pl

In [2]:
s3_file_path = 's3://polars-cookbook-demo-yk/titanic_dataset.csv'
df = pl.read_csv(s3_file_path)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [3]:
import s3fs 

fs = s3fs.S3FileSystem()

s3_parquet_file_path = 's3://polars-cookbook-demo-yk/titanic_dataset.parquet'

with fs.open(s3_parquet_file_path, mode='wb') as f:
    df.write_parquet(f)

In [4]:
storage_options= {
    'aws_access_key_id': 'YOUR_ACCESS_KEY_ID',
    'aws_secret_access_key': 'YOUR_SECRET_ACCESS_KEY',
    'aws_region': 'us-east-1'
}

lf = pl.scan_parquet(s3_parquet_file_path, storage_options=storage_options)
lf.fetch(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [5]:
import s3fs 

fs = s3fs.S3FileSystem(
    key='YOUR_ACCESS_KEY_ID',
    secret='YOUR_SECRET_ACCESS_KEY'
)

with fs.open(s3_parquet_file_path, mode='wb') as f:
    df.write_parquet(f)

In [6]:
import pyarrow.dataset as ds

dataset = ds.dataset(s3_parquet_file_path, format='parquet')
df = (
    pl.scan_pyarrow_dataset(dataset)
    .filter(pl.col('Age') <= 30)
    .collect()
)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


## Azure Blog Storage

### How to do it...

In [7]:
import polars as pl

In [22]:
storage_options={
    'account_name': 'YOUR_ACCOUNT_NAME', 
    'access_key': 'YOUR_ACCOUNT_KEY',
    'client_id': 'YOUR_CLIENT_ID',
    'client_secret': 'YOUR_CLIENT_SECRET',
    'tenant_id': 'YOUR_TENANT_ID'
}

blob_csv_file_path = 'az://demo/titanic_dataset.csv'
df = pl.read_csv(blob_csv_file_path, storage_options=storage_options) 
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [37]:
import adlfs 

fs = adlfs.AzureBlobFileSystem(
    account_name='YOUR_ACCOUNT_NAME', 
    account_key='YOUR_ACCOUNT_KEY'
)

blob_parquet_file_path = 'az://demo/titanic_dataset.parquet'

with fs.open(blob_parquet_file_path, mode='wb') as f:
    df.write_parquet(f)

In [38]:
lf = pl.scan_parquet(blob_parquet_file_path, storage_options=storage_options)
lf.head().collect()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [39]:
import pyarrow.dataset as ds

dataset = ds.dataset(
    blob_parquet_file_path, 
    filesystem=fs,
    format='parquet')

df = (
    pl.scan_pyarrow_dataset(dataset)
    .filter(pl.col('Age') <= 30)
    .collect()
)

df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


In [24]:
import os

os.environ['AZURE_STORAGE_ACCOUNT_NAME'] = 'YOUR_ACCOUNT_NAME'
os.environ['AZURE_STORAGE_ACCOUNT_KEY'] = 'YOUR_ACCOUNT_KEY'

(
    pl.read_csv(blob_csv_file_path, storage_options=storage_options)
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


### There is more...

In [27]:
storage_options['account_name'] = 'YOUR_ADLS_ACCOUNT_NAME'
storage_options['account_key'] = 'YOUR_ADLS_ACCOUNT_KEY'

df = pl.read_csv(blob_csv_file_path, storage_options=storage_options) 
df.head()

I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,5/8/2022,Kanyon
str,str,str,i64,str,i64,f64,str,str,str
"""I317333""","""C111565""","""Male""",21,"""Shoes""",3,1800.51,"""Debit Card""","""12/12/2021""","""Forum Istanbul…"
"""I127801""","""C266599""","""Male""",20,"""Clothing""",1,300.08,"""Cash""","""9/11/2021""","""Metrocity"""
"""I173702""","""C988172""","""Female""",66,"""Shoes""",5,3000.85,"""Credit Card""","""16/05/2021""","""Metropol AVM"""
"""I337046""","""C189076""","""Female""",53,"""Books""",4,60.6,"""Cash""","""24/10/2021""","""Kanyon"""
"""I227836""","""C657758""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""24/05/2022""","""Forum Istanbul…"


## Google Cloud Storage

### How to do it...

In [1]:
import polars as pl

In [2]:
gcs_csv_file_path = 'gs://polars_cookbook_demo_yk/titanic_dataset.csv'

df = pl.read_csv(gcs_csv_file_path)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [22]:
credentials_file_path = 'YOUR_FILE_NAME.json'
storage_options = {'token': credentials_file_path}
df = pl.read_csv(gcs_csv_file_path, storage_options=storage_options)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [23]:
import json

def read_json_into_dict(file_path):   
    try:  
        with open(file_path) as file:
            dict = json.load(file)
            return dict
    except Exception as err:
        raise err

storage_options = read_json_into_dict(credentials_file_path)

df = pl.read_csv(gcs_csv_file_path, storage_options=storage_options)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [3]:
import gcsfs

fs = gcsfs.GCSFileSystem()

gcs_parquet_file_path = 'gs://polars_cookbook_demo_yk/titanic_dataset.parquet'

with fs.open(gcs_parquet_file_path, mode='wb') as f:
    df.write_parquet(f)

In [28]:
import gcsfs

fs = gcsfs.GCSFileSystem(
    token=credentials_file_path
)

with fs.open(gcs_parquet_file_path, mode='wb') as f:
    df.write_parquet(f)

In [25]:
import pyarrow.dataset as ds

dataset = ds.dataset(
    gcs_parquet_file_path, 
    filesystem=fs,
    format='parquet'
)

df = (
    pl.scan_pyarrow_dataset(dataset)
    .filter(pl.col('Age') <= 30)
    .collect()
)

df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


## BigQuery

### How to do it...

In [35]:
import polars as pl

In [36]:
project = 'sandbox-366819'
dataset = 'polars_cookbook_demo_yk'
table = 'titanic_dataset'

query = f'''
    select *
    from {project}.{dataset}.{table}
'''
credentials_file_path = 'YOUR_CREDENTIALS_FILE_PATH' 
uri = f'bigquery://{credentials_file_path}'

df = pl.read_database_uri(query, uri, engine='connectorx')
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
180,0,3,"""Leonard, Mr. L…","""male""",36.0,0,0,"""LINE""",0.0,,"""S"""
264,0,1,"""Harrison, Mr. …","""male""",40.0,0,0,"""112059""",0.0,"""B94""","""S"""
278,0,2,"""Parkes, Mr. Fr…","""male""",,0,0,"""239853""",0.0,,"""S"""
303,0,3,"""Johnson, Mr. W…","""male""",19.0,0,0,"""LINE""",0.0,,"""S"""
414,0,2,"""Cunningham, Mr…","""male""",,0,0,"""239853""",0.0,,"""S"""


In [37]:
df = pl.read_database(query, connection=uri)
df.head()

  df = pl.read_database(query, connection=uri)


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
180,0,3,"""Leonard, Mr. L…","""male""",36.0,0,0,"""LINE""",0.0,,"""S"""
264,0,1,"""Harrison, Mr. …","""male""",40.0,0,0,"""112059""",0.0,"""B94""","""S"""
278,0,2,"""Parkes, Mr. Fr…","""male""",,0,0,"""239853""",0.0,,"""S"""
303,0,3,"""Johnson, Mr. W…","""male""",19.0,0,0,"""LINE""",0.0,,"""S"""
414,0,2,"""Cunningham, Mr…","""male""",,0,0,"""239853""",0.0,,"""S"""


In [16]:
import polars as pl
from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query(query)
rows = query_job.result()

df = pl.from_arrow(rows.to_arrow())
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
180,0,3,"""Leonard, Mr. L…","""male""",36.0,0,0,"""LINE""",0.0,,"""S"""
264,0,1,"""Harrison, Mr. …","""male""",40.0,0,0,"""112059""",0.0,"""B94""","""S"""
278,0,2,"""Parkes, Mr. Fr…","""male""",,0,0,"""239853""",0.0,,"""S"""
303,0,3,"""Johnson, Mr. W…","""male""",19.0,0,0,"""LINE""",0.0,,"""S"""
414,0,2,"""Cunningham, Mr…","""male""",,0,0,"""239853""",0.0,,"""S"""


In [32]:
import io 

with io.BytesIO() as stream:
    df.write_csv(stream)
    stream.seek(0)
    job = client.load_table_from_file(
        stream,
        destination=f'{project}.{dataset}.titanic_dataset_v2',
        project=project,
        job_config=bigquery.LoadJobConfig(
            autodetect=True,
            source_format=bigquery.SourceFormat.CSV,
        ),
    )
job.result() 

LoadJob<project=sandbox-366819, location=US, id=5c08f1f1-d163-4535-9867-b7a82cc5fba2>

In [33]:
table_id = f'{project}.{dataset}.titanic_dataset_v3'
source_file_path = '../data/titanic_dataset.csv'

job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.CSV
)

with open(source_file_path, 'rb') as file:
    job = client.load_table_from_file(
        file, table_id, job_config=job_config
    )

job.result()

LoadJob<project=sandbox-366819, location=US, id=365a20a0-8e27-4524-a4c3-832f93c58e32>

## Snowflake

### How to do it...

In [1]:
import polars as pl

In [20]:
username = 'YOUR_USERNAME' 
password = 'YOUR_PASS' 
account = 'ndjydua-tdb43776'
database = 'POLARS_COOKBOOK_DEMO_YK' 
warehouse = 'COMPUTE_WH'
role = 'ACCOUNTADMIN'
schema = 'SANDBOX' 
table = 'TITANIC_DATASET'

query = f'select * from {table}'
uri = f'snowflake://{username}:{password}@{account}/{database}/{schema}?warehouse={warehouse}&role={role}'

df = pl.read_database_uri(
    query,
    uri,
    engine='adbc'
) 
df.head()

PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED
f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
1.0,0.0,3.0,"""Braund, Mr. Ow…","""male""",22.0,1.0,0.0,"""A/5 21171""",7.25,,"""S"""
2.0,1.0,1.0,"""Cumings, Mrs. …","""female""",38.0,1.0,0.0,"""PC 17599""",71.2833,"""C85""","""C"""
3.0,1.0,3.0,"""Heikkinen, Mis…","""female""",26.0,0.0,0.0,"""STON/O2. 31012…",7.925,,"""S"""
4.0,1.0,1.0,"""Futrelle, Mrs.…","""female""",35.0,1.0,0.0,"""113803""",53.1,"""C123""","""S"""
5.0,0.0,3.0,"""Allen, Mr. Wil…","""male""",35.0,0.0,0.0,"""373450""",8.05,,"""S"""


In [21]:
import snowflake.connector

conn = snowflake.connector.connect(
    user=username,
    password=password,
    account=account,
    warehouse=warehouse,
    database=database,
    schema=schema
    )

(
    pl.from_arrow(conn.cursor().execute(query).fetch_arrow_all())
    .head()
)

PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED
i16,i8,i8,str,str,f64,i8,i8,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [19]:
df = pl.read_database(query, connection=conn)
df.head()

PASSENGERID,SURVIVED,PCLASS,NAME,SEX,AGE,SIBSP,PARCH,TICKET,FARE,CABIN,EMBARKED
i16,i8,i8,str,str,f64,i8,i8,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
