# Ingest

Ingest all guids from AAPB into Chowda


In [None]:
from sonyci import SonyCi
from os import environ

# Point DB_URL at the local development SQLite file.
# NOTE(review): this is assigned before importing tests.factories, presumably
# because the project reads DB_URL at import time — confirm before reordering.
environ['DB_URL'] = 'sqlite:///../../chowda.development.sqlite'
from tests.factories import MediaFileFactory

# Sony Ci client loaded from ../../ci.toml; used by every request below.
ci = SonyCi.load_from_toml('../../ci.toml')


In [None]:
def get_batch(n):
    """Return the 'items' list for page `n` of the workspace's assets.

    Pages hold 100 assets each, so page `n` is requested at offset `n * 100`.
    Only the id, name, type, size and thumbnails fields are requested.
    """
    offset = n * 100
    endpoint = (
        f'workspaces/{ci.workspace_id}/contents'
        f'?kind=asset&limit=100&fields=id,name,type,size,thumbnails&offset={offset}'
    )
    response = ci.get(endpoint)
    return response['items']


# The 'count' field of the workspace asset listing, used below to work out how
# many 100-item batches to request. limit=1 is passed because only 'count' is
# read from the response.
count = ci.get(f'workspaces/{ci.workspace_id}/contents?kind=asset&limit=1')['count']

In [None]:
def ingest():
    """Walk every batch in order and call MediaFileFactory.create once per asset.

    Each asset's 'id' is passed as `id` and its 'name' as `guid`.
    """
    num_batches = count // 100 + 1
    # Lazy generator: each batch is fetched only once the previous one has been
    # fully consumed, so requests and creates interleave batch by batch.
    all_assets = (
        item for batch_index in range(num_batches) for item in get_batch(batch_index)
    )
    for item in all_assets:
        MediaFileFactory.create(id=item['id'], guid=item['name'])


`ingest()` took 68 minutes to ingest 174,170 guids!

42 guids/second


## But we can do better!

First, we'll group the `MediaFile`s into batches so that each batch can be added in a single database commit.


In [None]:
from sqlmodel import Session

from chowda.models import SonyCiAsset
from chowda.db import engine


def batch_ingest_session(n):
    """Fetch batch `n` and store it as SonyCiAsset objects with one commit.

    Each call opens its own Session on the shared engine, adds every asset
    from the batch, and commits once.
    """
    with Session(engine) as db:
        # Each asset dict is expanded directly into SonyCiAsset keyword arguments.
        records = [SonyCiAsset(**fields) for fields in get_batch(n)]
        db.add_all(records)
        db.commit()


# Fetch batch 1 and build SonyCiAsset objects from it in memory; nothing is
# added to a session or committed here.
batch = get_batch(1)
media = [SonyCiAsset(**asset) for asset in batch]

Each batch is currently fetched and committed in series, so we can use a `ThreadPool` from `multiprocessing.pool` to run the batches concurrently.


In [None]:
from multiprocessing.pool import ThreadPool


def batch_ingest_with_threadpool():
    """Run batch_ingest_session for every batch index on a ThreadPool.

    The pool is created without a `processes` argument, so it uses ThreadPool's
    default size.
    """
    with ThreadPool() as workers:
        batch_indices = range(count // 100 + 1)
        # map() blocks until every batch has finished; the returned list of
        # results is not needed and is discarded.
        workers.map(batch_ingest_session, batch_indices)


11 minutes!

That's a 6x speedup!

264 guids/second

## But we can do even better!

Since this operation is network bound, we can use more worker threads than we have cores.


In [None]:
# NOTE(review): as written, no `processes` argument is passed to ThreadPool and
# only batch index 0 is mapped (range(1)). The surrounding text describes using
# more workers than cores over the full ingest — presumably the timed run used a
# larger pool and the full batch range; confirm and restore before re-running.
with ThreadPool() as pool:
    for _ in pool.map(batch_ingest_session, range(1)):
        pass


## Fastest!

1m 49s!

That's a 37x speedup!

1600 guids/second!!!
