In [1]:
import pandas as pd

In [2]:
# Tiny demo frame: one integer column and one string column.
df = pd.DataFrame(
    {
        "nums": list(range(1, 5)),
        "letters": list("abcd"),
    }
)

In [3]:
# Show the small pandas frame built above.
print(df)

   nums letters
0     1       a
1     2       b
2     3       c
3     4       d


In [4]:
# Per-column dtypes pandas assigned to the frame built from plain Python lists.
print(df.dtypes)

nums        int64
letters    object
dtype: object


## Dask dtypes

In [5]:
import dask.dataframe as dd

In [6]:
# Turn the pandas frame into a Dask DataFrame split into two partitions.
ddf = dd.from_pandas(df, npartitions=2)

In [7]:
# The Dask DataFrame reports the same column dtypes as the pandas frame it was built from.
ddf.dtypes

nums        int64
letters    object
dtype: object

## Change column type

In [8]:
# Downcast `nums` to int8; the values 1..4 fit comfortably in that width.
ddf = ddf.assign(nums=ddf["nums"].astype("int8"))

In [9]:
# Materialise the Dask DataFrame as pandas to check the values after the cast.
print(ddf.compute())

   nums letters
0     1       a
1     2       b
2     3       c
3     4       d


In [10]:
# `nums` should now report int8 while `letters` stays object.
ddf.dtypes

nums         int8
letters    object
dtype: object

## Read cloud files with Coiled

### Setup

In [11]:
import coiled
# Request a Coiled cluster named "demo-cluster" with 10 workers.
cluster = coiled.Cluster(name="demo-cluster", n_workers=10)

Output()

Your 'blog-notebooks' and 'coiled-examples' accounts are using the ECS backend.
After September 16th, accounts using the ECS backend will be migrated to the default AWS VM backend. For more information, refer to the backend documentation and the FAQ:

https://docs.coiled.io/user_guide/backends
https://docs.coiled.io/user_guide/faq.html#backends
Found software environment build
Created FW rules: coiled-dask-rrpelgr71-41473-firewall


In [13]:
# `import dask` on its own does not load the `dask.distributed` submodule, so
# `dask.distributed.Client` would only resolve if some other package had
# already imported it. Import the submodule explicitly; this still binds the
# top-level name `dask`.
import dask.distributed

# Connect this notebook session to the Coiled cluster created above.
client = dask.distributed.Client(cluster)
# Last expression: display the client summary.
client


+-------------+-----------+-----------+-----------+
| Package     | client    | scheduler | workers   |
+-------------+-----------+-----------+-----------+
| blosc       | None      | 1.10.2    | 1.10.2    |
| dask        | 2021.08.0 | 2021.08.1 | 2021.08.1 |
| distributed | 2021.08.0 | 2021.08.1 | 2021.08.1 |
| numpy       | 1.20.3    | 1.21.2    | 1.21.2    |
+-------------+-----------+-----------+-----------+


0,1
Connection method: Cluster object,Cluster type: Cluster
Dashboard: http://3.238.69.8:8787,

0,1
Dashboard: http://3.238.69.8:8787,Workers: 10
Total threads: 20,Total memory: 76.72 GiB

0,1
Comm: tls://10.4.5.167:8786,Workers: 10
Dashboard: http://10.4.5.167:8787/status,Total threads: 20
Started: 7 minutes ago,Total memory: 76.72 GiB

0,1
Comm: tls://10.4.30.27:36683,Total threads: 2
Dashboard: http://10.4.30.27:41803/status,Memory: 7.67 GiB
Nanny: tls://10.4.30.27:33547,
Local directory: /dask-worker-space/worker-8xrp2idd,Local directory: /dask-worker-space/worker-8xrp2idd

0,1
Comm: tls://10.4.24.149:46187,Total threads: 2
Dashboard: http://10.4.24.149:44803/status,Memory: 7.67 GiB
Nanny: tls://10.4.24.149:38891,
Local directory: /dask-worker-space/worker-azdtqa54,Local directory: /dask-worker-space/worker-azdtqa54

0,1
Comm: tls://10.4.23.16:45119,Total threads: 2
Dashboard: http://10.4.23.16:36583/status,Memory: 7.67 GiB
Nanny: tls://10.4.23.16:34941,
Local directory: /dask-worker-space/worker-rjgqqh8p,Local directory: /dask-worker-space/worker-rjgqqh8p

0,1
Comm: tls://10.4.29.9:40783,Total threads: 2
Dashboard: http://10.4.29.9:33667/status,Memory: 7.67 GiB
Nanny: tls://10.4.29.9:35259,
Local directory: /dask-worker-space/worker-mrfubak5,Local directory: /dask-worker-space/worker-mrfubak5

0,1
Comm: tls://10.4.22.46:45091,Total threads: 2
Dashboard: http://10.4.22.46:35375/status,Memory: 7.67 GiB
Nanny: tls://10.4.22.46:38527,
Local directory: /dask-worker-space/worker-keaicp6r,Local directory: /dask-worker-space/worker-keaicp6r

0,1
Comm: tls://10.4.30.149:45081,Total threads: 2
Dashboard: http://10.4.30.149:44367/status,Memory: 7.67 GiB
Nanny: tls://10.4.30.149:36023,
Local directory: /dask-worker-space/worker-5kxzen4s,Local directory: /dask-worker-space/worker-5kxzen4s

0,1
Comm: tls://10.4.22.230:45853,Total threads: 2
Dashboard: http://10.4.22.230:41959/status,Memory: 7.67 GiB
Nanny: tls://10.4.22.230:39169,
Local directory: /dask-worker-space/worker-lwv5i0nm,Local directory: /dask-worker-space/worker-lwv5i0nm

0,1
Comm: tls://10.4.19.201:44875,Total threads: 2
Dashboard: http://10.4.19.201:43745/status,Memory: 7.67 GiB
Nanny: tls://10.4.19.201:36783,
Local directory: /dask-worker-space/worker-tulg387i,Local directory: /dask-worker-space/worker-tulg387i

0,1
Comm: tls://10.4.28.43:35483,Total threads: 2
Dashboard: http://10.4.28.43:35437/status,Memory: 7.67 GiB
Nanny: tls://10.4.28.43:33477,
Local directory: /dask-worker-space/worker-qg04rwx1,Local directory: /dask-worker-space/worker-qg04rwx1

0,1
Comm: tls://10.4.27.97:45157,Total threads: 2
Dashboard: http://10.4.27.97:33851/status,Memory: 7.67 GiB
Nanny: tls://10.4.27.97:35867,
Local directory: /dask-worker-space/worker-hhrer626,Local directory: /dask-worker-space/worker-hhrer626


### Schema inference

In [14]:
# Read every CSV part of the 20-year timeseries dataset from S3, leaving the
# column dtypes to dask's inference.
ddf = dd.read_csv(
    "s3://coiled-datasets/timeseries/20-years/csv/*.part",
    storage_options=dict(anon=True, use_ssl=True),
)

In [15]:
# Dtypes dask inferred for the timeseries CSVs (no schema was supplied).
print(ddf.dtypes)

timestamp     object
id             int64
name          object
x            float64
y            float64
dtype: object


### Specify schema

In [19]:
# Per-column memory footprint with the inferred dtypes; deep=True also
# measures the contents of object columns rather than just their pointers.
ddf.memory_usage(deep=True).compute()

Index             140160
id            5298048000
name         41289103692
timestamp    50331456000
x             5298048000
y             5298048000
dtype: int64

In [16]:
# Smallest `id` value: lower bound when choosing a narrower integer dtype.
ddf.id.min().compute()

815

In [17]:
# Largest `id` value: upper bound when choosing a narrower integer dtype.
ddf.id.max().compute()

1193

In [18]:
# Peek at the first few rows of the timeseries data.
ddf.head()

Unnamed: 0,timestamp,id,name,x,y
0,2000-01-01 00:00:00,1008,Dan,-0.259374,-0.118314
1,2000-01-01 00:00:01,987,Patricia,0.069601,0.755351
2,2000-01-01 00:00:02,980,Zelda,-0.281843,-0.510507
3,2000-01-01 00:00:03,1020,Ursula,-0.569904,0.523132
4,2000-01-01 00:00:04,967,Michael,-0.25146,0.81093


In [20]:
# Same dataset as before, but pin `id` to int16 instead of the inferred int64
# (the ids observed above span 815..1193, well inside the int16 range).
ddf = dd.read_csv(
    "s3://coiled-datasets/timeseries/20-years/csv/*.part",
    storage_options=dict(anon=True, use_ssl=True),
    dtype=dict(id="int16"),
)

In [21]:
# Re-measure memory with `id` read as int16; compare the `id` row with the
# earlier measurement taken with the inferred dtypes.
ddf.memory_usage(deep=True).compute()

Index             140160
id            1324512000
name         41289103692
timestamp    50331456000
x             5298048000
y             5298048000
dtype: int64

In [22]:
# Confirm `id` is int16 while the other columns keep their inferred dtypes.
print(ddf.dtypes)

timestamp     object
id             int16
name          object
x            float64
y            float64
dtype: object


## Inference errors

In [23]:
# Read the NYC parking-ticket CSVs from S3 without a schema, so every column
# dtype comes from dask's inference.
ddf = dd.read_csv(
    "s3://coiled-datasets/nyc-parking-tickets/csv/*.csv",
    storage_options=dict(anon=True, use_ssl=True),
)

In [24]:
len(ddf) # counting rows succeeds; the dtype problems only surface in the head() call below

31536410

In [25]:
ddf.head() # raises ValueError: the inferred dtypes do not match the data actually read

ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+-----------------------+---------+----------+
| Column                | Found   | Expected |
+-----------------------+---------+----------+
| House Number          | object  | float64  |
| Issuer Command        | object  | int64    |
| Issuer Squad          | object  | int64    |
| Time First Observed   | object  | float64  |
| Unregistered Vehicle? | float64 | int64    |
| Violation Description | object  | float64  |
| Violation Legal Code  | object  | float64  |
| Violation Location    | float64 | int64    |
| Violation Post Code   | object  | float64  |
+-----------------------+---------+----------+

The following columns also raised exceptions on conversion:

- House Number
  ValueError("could not convert string to float: '67-21'")
- Issuer Command
  ValueError("invalid literal for int() with base 10: 'T730'")
- Issuer Squad
  ValueError('cannot convert float NaN to integer')
- Time First Observed
  ValueError("could not convert string to float: '1134P'")
- Violation Description
  ValueError("could not convert string to float: 'BUS LANE VIOLATION'")
- Violation Legal Code
  ValueError("could not convert string to float: 'T'")
- Violation Post Code
  ValueError("could not convert string to float: 'H -'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'House Number': 'object',
       'Issuer Command': 'object',
       'Issuer Squad': 'object',
       'Time First Observed': 'object',
       'Unregistered Vehicle?': 'float64',
       'Violation Description': 'object',
       'Violation Legal Code': 'object',
       'Violation Location': 'float64',
       'Violation Post Code': 'object'}

to the call to `read_csv`/`read_table`.

In [27]:
# Dtypes dask inferred for the parking-ticket CSVs; several of them disagree
# with the "Found" column of the ValueError above (e.g. Issuer Command).
ddf.dtypes

Summons Number                         int64
Plate ID                              object
Registration State                    object
Plate Type                            object
Issue Date                            object
Violation Code                         int64
Vehicle Body Type                     object
Vehicle Make                          object
Issuing Agency                        object
Street Code1                           int64
Street Code2                           int64
Street Code3                           int64
Vehicle Expiration Date                int64
Violation Location                     int64
Violation Precinct                     int64
Issuer Precinct                        int64
Issuer Code                            int64
Issuer Command                         int64
Issuer Squad                           int64
Violation Time                        object
Time First Observed                  float64
Violation County                      object
Violation 

In [28]:
# Re-read the parking-ticket CSVs with the dtype overrides suggested by the
# ValueError above. storage_options matches the first read of this dataset so
# the public bucket is accessed anonymously instead of depending on whatever
# AWS credentials happen to be configured locally.
ddf = dd.read_csv(
    "s3://coiled-datasets/nyc-parking-tickets/csv/*.csv",
    storage_options={"anon": True, "use_ssl": True},
    dtype={
        "House Number": "object",
        "Issuer Command": "object",
        "Issuer Squad": "object",
        "Time First Observed": "object",
        "Unregistered Vehicle?": "float64",
        "Violation Description": "object",
        "Violation Legal Code": "object",
        "Violation Location": "float64",
        "Violation Post Code": "object",
    },
)

In [29]:
# With the dtype overrides in place, head() now succeeds.
ddf.head()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Hydrant Violation,Double Parking Violation,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,1283294138,GBB9093,NY,PAS,08/04/2013,46,SUBN,AUDI,P,37250,...,,,,,,,,,,
1,1283294151,62416MB,NY,COM,08/04/2013,46,VAN,FORD,P,37290,...,,,,,,,,,,
2,1283294163,78755JZ,NY,COM,08/05/2013,46,P-U,CHEVR,P,37030,...,,,,,,,,,,
3,1283294175,63009MA,NY,COM,08/05/2013,46,VAN,FORD,P,37270,...,,,,,,,,,,
4,1283294187,91648MC,NY,COM,08/08/2013,41,TRLR,GMC,P,37240,...,,,,,,,,,,


In [30]:
# describe() touches more of the data than head() did, so it can still hit
# dtype mismatches that the first rows did not reveal.
ddf.describe().compute()

ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+-------------------------+--------+----------+
| Column                  | Found  | Expected |
+-------------------------+--------+----------+
| Date First Observed     | object | int64    |
| Vehicle Expiration Date | object | int64    |
+-------------------------+--------+----------+

The following columns also raised exceptions on conversion:

- Date First Observed
  ValueError("invalid literal for int() with base 10: '01/05/0001 12:00:00 PM'")
- Vehicle Expiration Date
  ValueError("invalid literal for int() with base 10: '01/05/0001 12:00:00 PM'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'Date First Observed': 'object',
       'Vehicle Expiration Date': 'object'}

to the call to `read_csv`/`read_table`.

In [31]:
# Same read as the previous attempt, plus the two date columns that the last
# ValueError reported as object. storage_options matches the first read of
# this dataset so the public bucket is accessed anonymously instead of
# depending on whatever AWS credentials happen to be configured locally.
ddf = dd.read_csv(
    "s3://coiled-datasets/nyc-parking-tickets/csv/*.csv",
    storage_options={"anon": True, "use_ssl": True},
    dtype={
        "House Number": "object",
        "Issuer Command": "object",
        "Issuer Squad": "object",
        "Time First Observed": "object",
        "Unregistered Vehicle?": "float64",
        "Violation Description": "object",
        "Violation Legal Code": "object",
        "Violation Location": "float64",
        "Violation Post Code": "object",
        "Date First Observed": "object",
        "Vehicle Expiration Date": "object",
    },
)

In [32]:
# Retry describe() now that the two date columns are read as object.
ddf.describe().compute()

ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+--------------------+---------+----------+
| Column             | Found   | Expected |
+--------------------+---------+----------+
| Feet From Curb     | float64 | int64    |
| Issuer Code        | float64 | int64    |
| Issuer Precinct    | float64 | int64    |
| Law Section        | float64 | int64    |
| Vehicle Year       | float64 | int64    |
| Violation Precinct | float64 | int64    |
+--------------------+---------+----------+

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'Feet From Curb': 'float64',
       'Issuer Code': 'float64',
       'Issuer Precinct': 'float64',
       'Law Section': 'float64',
       'Vehicle Year': 'float64',
       'Violation Precinct': 'float64'}

to the call to `read_csv`/`read_table`.

Alternatively, provide `assume_missing=True` to interpret
all unspecified integer columns as floats.