# @mhaligowski 's notes on chapter 2

> 🧠 TIL:
> 
>  `str` and `bytes` were added in Python 3???

There are two flavors of sequence types in Python's standard library:

* container sequences can hold different types of data (`list`, `tuple`, `collections.deque`)
* flat sequences can only hold one type of data (`str`, `bytes`, `bytearray`, `memoryview`, `array.array`)

Container sequences hold *references*, while flat sequences hold *values*.

In [None]:
# A container sequence stores a *reference* to the inner list, not a copy of it.
subsequence = [1, 2]
print(f"subsequence is {subsequence}")

container = ['a', 'b']
container.append(subsequence)
print(f"container is {container}")

# Mutating the inner list in place is therefore visible through the container too.
subsequence += [3]
print(f"subsequence is {subsequence}")
print(f"container is {container}")


> 🎓 TIL:
>
> Every Python object has headers, for example for `float`:
> * `ob_refcnt` - reference count
> * `ob_type` - type of the object (pointer)
> * `ob_fval` - C `double` holding the `float` value
>
> Each field takes 8 bytes (on a 64-bit CPython build)

Another distinction:
* mutable (`list`, `bytearray`, `array.array`, `collections.deque`, `memoryview`),
* immutable (`tuple`, `str`, `bytes`)

## `list`: mutable container

_listcomps_ FTW:

In [None]:
symbols = '$¢£¥€¤'

# Imperative version: start from an empty list and grow it one code point
# at a time.
codes = []
for symbol in symbols:
    codes.append(ord(symbol))

# Listcomp version: the same list, built by a single expression.
codes = [ord(char) for char in symbols]


> Tip:
> Line breaks are ignored inside pairs of `[]`, `{}` or `()`. What that means: an expression inside brackets can be split across several lines without the `\` line-continuation escape.


Walrus operator??? `:=` - assignment expression. It assigns a value to a variable as part of a larger expression. It is also known as the *named expression*.

In [None]:
x = [1, 2, 3, 4, 5]
# Use a distinct loop variable so the list `x` is never shadowed or rebound.
squares = [n**2 for n in x]
print(squares)

# A walrus target inside a comprehension is bound in the *enclosing* scope,
# so `last` is still available after the listcomp finishes.
cubes = [(last := n**3) for n in x]
print(cubes, last)

for n in x:
    # `:=` binds looser than `!=`: without the inner parentheses `last` would
    # receive the boolean result of the comparison instead of the cube.
    # A plain `if` replaces the original `while`, whose condition never
    # changed inside the body and therefore looped forever.
    if (last := n**3) != 9:
        print("hi")

print(last)

## Cartesian products

In [None]:
# Cartesian product: every file letter combined with every rank digit
chessboard = [
    f'{letter}{digit}'
    for letter in 'ABCDEFGH'
    for digit in range(1, 9)
]
print(chessboard)

# BONUS: Splitting with listcomps! Stepping the offset by 8 slices the flat
# list into one sub-list per letter.
print([chessboard[offset:offset + 8] for offset in range(0, len(chessboard), 8)])



_genexps_ (generator expressions) are like _listcomps_, written with `()` instead of `[]`. They are generators, so they don't build lists, but instead produce values on demand.

In [None]:
colors = ['black', 'white']
sizes = ['S', 'M', 'L']
# The genexp hands the loop one combination at a time; the full list of
# colour/size pairs is never materialised.
for tshirt in (f'{c} {s}' for c in colors for s in sizes):
    print(tshirt)

In [None]:
import tarfile
from typing import Iterator, List
import os

def generate_tar_bytes(
    *, root_file_path: str, files: List[str], part_size_threshold: int
) -> Iterator[bytes]:
    """Stream a tar archive of ``files`` as a sequence of byte chunks.

    The archive layout (header, data padded to ``tarfile.BLOCKSIZE``, two
    terminating NUL blocks, padding to ``tarfile.RECORDSIZE``) mostly follows
    what the stdlib ``tarfile`` module writes.

    Args:
        root_file_path: Directory that every entry of ``files`` is joined to.
        files: Paths relative to ``root_file_path``; each is also used as the
            member name inside the archive.
        part_size_threshold: Size in bytes of every yielded chunk except
            possibly the last one. Must be positive.

    Yields:
        Consecutive pieces of the archive. Every piece is exactly
        ``part_size_threshold`` bytes long, except the final one, which may
        be shorter.

    Raises:
        ValueError: If ``part_size_threshold`` is not positive. Because this
            is a generator, the error surfaces on the first iteration rather
            than at call time.
    """
    # A threshold <= 0 would make the drain loops below yield forever.
    if part_size_threshold <= 0:
        raise ValueError("part_size_threshold must be a positive integer")

    total_bytes_written = 0
    # bytearray supports in-place append and front deletion, so the pending
    # data is not re-copied into a new bytes object on every step.
    buffer = bytearray()

    for file in files:
        file_path = os.path.join(root_file_path, file)
        with open(file_path, "rb") as file_reader:
            # this is mostly from stdlib tarfile
            tar_info = tarfile.TarInfo(file)
            tar_info.size = os.path.getsize(file_path)
            header_bytes = tar_info.tobuf()
            total_bytes_written += len(header_bytes)
            buffer += header_bytes
            # peek() returning b"" signals that the file is exhausted.
            while file_reader.peek(1) != b"":
                # Emit every full part before reading more data.
                while len(buffer) >= part_size_threshold:
                    yield bytes(buffer[:part_size_threshold])
                    del buffer[:part_size_threshold]
                # Top the buffer up to at most one full part.
                buffer += file_reader.read(part_size_threshold - len(buffer))
            # Member data is padded with NULs to a whole number of blocks.
            blocks, remainder = divmod(tar_info.size, tarfile.BLOCKSIZE)
            if remainder > 0:
                buffer += tarfile.NUL * (tarfile.BLOCKSIZE - remainder)
                blocks += 1
            total_bytes_written += blocks * tarfile.BLOCKSIZE

    # End-of-archive marker: two NUL blocks, then pad to a full record.
    finishing_bytes = tarfile.NUL * (tarfile.BLOCKSIZE * 2)
    buffer += finishing_bytes
    total_bytes_written += len(finishing_bytes)
    remainder = total_bytes_written % tarfile.RECORDSIZE
    if remainder > 0:
        buffer += tarfile.NUL * (tarfile.RECORDSIZE - remainder)

    # Flush whatever is left; only the last chunk may be short.
    while buffer:
        yield bytes(buffer[:part_size_threshold])
        del buffer[:part_size_threshold]

In [None]:
# embeddings_service/embeddings_service/main.py

@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """
    Context manager that prepares the service on startup and cleans up on shutdown.

    Startup (everything before the ``yield``), in order:

    1. Configures logging via ``setup_google_cloud_logging`` or ``setup_logging``,
       depending on ``ENV_MANAGER.enable_cloud_logging``.
    2. Calls ``create_model_directory`` for ``ENV_MANAGER.models_root_path``.
    3. Sets up metrics via ``setup_otlp`` or ``setup_console``; neither is
       called when both corresponding flags are off.
    4. Stores a ``ModelDownloaderProvider`` on ``app.state`` and awaits the
       download of the configured model name / revision.
    5. Stores an ``EmbedderProvider`` for the downloaded model on ``app.state``.

    Shutdown (after the ``yield``): closes ``app.state.embedder_provider``.
    """
    if ENV_MANAGER.enable_cloud_logging:
        setup_google_cloud_logging(
            logging_level=ENV_MANAGER.logging_level,
            model_name=ENV_MANAGER.models_hf_hub_name,
            model_revision=ENV_MANAGER.models_hf_hub_revision,
        )
    else:
        setup_logging(
            logging_level=ENV_MANAGER.logging_level,
        )
    create_model_directory(Path(ENV_MANAGER.models_root_path))

    # OTLP takes precedence over console metrics when both flags are set.
    if ENV_MANAGER.enable_open_telemetry_metrics:
        setup_otlp(
            service_name=ENV_MANAGER.service_name,
            endpoint=ENV_MANAGER.open_telemetry_endpoint,
        )
    elif ENV_MANAGER.enable_console_metrics:
        setup_console(service_name=ENV_MANAGER.service_name)

    app.state.model_downloader_provider = ModelDownloaderProvider(
        models_root_path=ENV_MANAGER.models_root_path
    )
    model_downloader = app.state.model_downloader_provider.get_downloader()
    model_path = await model_downloader.download(
        model_name=ENV_MANAGER.models_hf_hub_name,
        model_revision=ENV_MANAGER.models_hf_hub_revision,
    )

    # TODO(matehal@gradient.ai, 09/14/2023): This creates the Embedder for the model too. It can get
    # _a little_ slow, and having all the embedders can get expensive. It might make sense to
    # have an instance of the service per model and or revision, and pass the model name and revision
    # through the app argument.
    app.state.embedder_provider = EmbedderProvider(
        device=ENV_MANAGER.device_type,
        enable_ctranslate2_compilation=ENV_MANAGER.enable_ctranslate2_compilation,
        enable_pytorch_compilation=ENV_MANAGER.enable_pytorch_compilation,
        model_path=model_path,
        num_workers=ENV_MANAGER.workers_count,
    )

    logger.info("Embedding service ready to start")

    try:
        yield
    finally:
        # This is being executed on shutdown. `finally` guarantees the embedder
        # is closed even when an exception or cancellation is thrown into the
        # generator at the `yield`.
        app.state.embedder_provider.close()


In [None]:
def sum_with_pattern_matching(seq):
  match seq:
    case (): return 0
    case (head,): return head
    case (head, *tail): return head + sum_with_pattern_matching(tail)
    case _:
      raise ValueError("Not a sequence")
    
pattern_sum = sum_with_pattern_matching((10,))
print(pattern_sum)
