In [None]:
from pathlib import Path

from datasets import load_dataset
from dotenv import load_dotenv
from tqdm import tqdm

from pelican_data_loader.config import SYSTEM_CONFIG
from pelican_data_loader.data import get_default_s3_client

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably these are the S3 settings/credentials consumed by
# pelican_data_loader (SYSTEM_CONFIG / get_default_s3_client) — confirm.
load_dotenv()

In [None]:
# Access the dataset directly from the Hugging Face Hub repo in streaming mode
# (no full download is requested up front).
dataset_hf = load_dataset(
    path="bigcode/the-stack-dedup",
    streaming=True,
)
dataset_hf  # last expression of the cell: show the returned object

In [None]:
# Same repo, but without streaming: the dataset is pulled to local storage
# before the call returns.
dataset_hf = load_dataset(
    path="bigcode/the-stack-dedup",
)
dataset_hf  # last expression of the cell: show the returned object

### Upload to S3 (Slow, don't use it for large files)

In [None]:
# Disabled cell: uploads every file under the local Hugging Face cache directory
# to S3 one object at a time, skipping objects that already exist.
#
# DATASET_DIR = Path("/home/clo36/hf/hub/datasets--bigcode--the-stack-dedup")
# s3_client = get_default_s3_client()

# for f in tqdm(list(DATASET_DIR.rglob("*"))):
#     if f.is_file():
#         # Key mirrors the path below the hub cache root, i.e.
#         # "data/datasets--bigcode--the-stack-dedup/...".
#         # relative_to() removes the prefix exactly; str.lstrip() strips a *set of
#         # characters* and would eat leading characters of other directory names.
#         s3_key = f"data/{f.relative_to(DATASET_DIR.parent).as_posix()}"

#         # Skip if file already exists
#         try:
#             s3_client.stat_object(SYSTEM_CONFIG.s3_bucket_name, s3_key)
#         except Exception:
#             # NOTE(review): any stat failure (not only "object missing") triggers
#             # an upload attempt; narrow the exception type if this is re-enabled.
#             s3_client.fput_object(SYSTEM_CONFIG.s3_bucket_name, s3_key, f.as_posix())

### Upload to research drive

Run on Windows with a mapped `R:` drive. Not recommended: the connection always drops.
```sh
scp -r clo36@bear-dev:/home/clo36/hf/hub/datasets--bigcode--the-stack-dedup "R:/clo36/pelican-data-loader/data/datasets--bigcode--the-stack-dedup"
```

Run on Linux with the research drive mounted at `/mnt/research`. Still somewhat prone to disconnection, but resumable.
```sh
sudo rsync -av --no-times --progress --ignore-existing clo36@bear-dev:/home/clo36/hf/hub/datasets--bigcode--the-stack-dedup /mnt/research/clo36/pelican-data-loader/data/datasets--bigcode--the-stack-dedup/
```


### Upload to S3 with `rclone` (~10x faster than the Python minio client, roughly 1-2 TB/hr)

Run on Linux with `rclone`

Prerequisite: configure the S3 endpoint in `rclone` as a remote named `dsi-s3`.
```sh
rclone config
```

Sync to s3

```sh
rclone sync ./datasets--bigcode--the-stack-dedup dsi-s3:pelican-data-loader/data/datasets--bigcode--the-stack-dedup \
    --copy-links \
    --progress \
    --transfers=8 \
    --checkers=16 \
    --multi-thread-streams=2 \
    --fast-list
```