From f2c5090ddac4db8c44f2724f4fbde1d7bab84342 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 18 Mar 2026 11:59:28 -0700 Subject: [PATCH 1/4] fix: support ckpt conversion from a different location than repo root --- src/opentau/scripts/convert_checkpoint.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/opentau/scripts/convert_checkpoint.sh b/src/opentau/scripts/convert_checkpoint.sh index 40f47003..a40be10e 100755 --- a/src/opentau/scripts/convert_checkpoint.sh +++ b/src/opentau/scripts/convert_checkpoint.sh @@ -40,11 +40,11 @@ echo "Converting checkpoint in directory: $CHECKPOINT_DIR" # Step 1: Convert sharded checkpoint to full state dict echo "Step 1: Converting sharded checkpoint to full state dict..." -python src/opentau/scripts/zero_to_fp32.py "$CHECKPOINT_DIR" "$CHECKPOINT_DIR/full_state_dict" --max_shard_size 1000GB +python -m opentau.scripts.zero_to_fp32 "$CHECKPOINT_DIR" "$CHECKPOINT_DIR/full_state_dict" --max_shard_size 1000GB # Step 2: Convert pytorch_model.bin to model.safetensors echo "Step 2: Converting pytorch_model.bin to model.safetensors..." -python src/opentau/scripts/bin_to_safetensors.py "$CHECKPOINT_DIR/full_state_dict/pytorch_model.bin" --output_file "$CHECKPOINT_DIR/model.safetensors" +python -m opentau.scripts.bin_to_safetensors "$CHECKPOINT_DIR/full_state_dict/pytorch_model.bin" --output_file "$CHECKPOINT_DIR/model.safetensors" echo "Conversion completed successfully!" echo "Model saved as: $CHECKPOINT_DIR/model.safetensors" From 944eab5cc4083dd996b4b83fda9f18af26500436 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 18 Mar 2026 12:00:01 -0700 Subject: [PATCH 2/4] feat: skip conversion if model.safetensors is already present --- src/opentau/scripts/convert_checkpoint.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/opentau/scripts/convert_checkpoint.sh b/src/opentau/scripts/convert_checkpoint.sh index a40be10e..2487dbd3 100755 --- a/src/opentau/scripts/convert_checkpoint.sh +++ b/src/opentau/scripts/convert_checkpoint.sh @@ -36,6 +36,11 @@ if [ ! -d "$CHECKPOINT_DIR" ]; then exit 1 fi +if [ -f $CHECKPOINT_DIR/model.safetensors ]; then + echo "Error: model.safetensors already exists in the checkpoint directory. Please remove it before running this script." + exit 1 +fi + echo "Converting checkpoint in directory: $CHECKPOINT_DIR" # Step 1: Convert sharded checkpoint to full state dict From cfd806a9e028af286cdb4a8e0068d0681217da03 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Mon, 23 Mar 2026 12:47:42 -0700 Subject: [PATCH 3/4] Update src/opentau/scripts/convert_checkpoint.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/opentau/scripts/convert_checkpoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/opentau/scripts/convert_checkpoint.sh b/src/opentau/scripts/convert_checkpoint.sh index 2487dbd3..e2a6a06e 100755 --- a/src/opentau/scripts/convert_checkpoint.sh +++ b/src/opentau/scripts/convert_checkpoint.sh @@ -36,7 +36,7 @@ if [ ! -d "$CHECKPOINT_DIR" ]; then exit 1 fi -if [ -f $CHECKPOINT_DIR/model.safetensors ]; then +if [ -f "$CHECKPOINT_DIR/model.safetensors" ]; then echo "Error: model.safetensors already exists in the checkpoint directory. Please remove it before running this script." exit 1 fi From 6f5e4fd2870ec56c18a424a837fa748cff93c10a Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 25 Mar 2026 13:27:26 -0700 Subject: [PATCH 4/4] chore: cursor review --- src/opentau/scripts/convert_checkpoint.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/opentau/scripts/convert_checkpoint.sh b/src/opentau/scripts/convert_checkpoint.sh index 2487dbd3..59bf39ab 100755 --- a/src/opentau/scripts/convert_checkpoint.sh +++ b/src/opentau/scripts/convert_checkpoint.sh @@ -36,9 +36,9 @@ if [ ! -d "$CHECKPOINT_DIR" ]; then exit 1 fi -if [ -f $CHECKPOINT_DIR/model.safetensors ]; then - echo "Error: model.safetensors already exists in the checkpoint directory. Please remove it before running this script." - exit 1 +if [ -f "$CHECKPOINT_DIR/model.safetensors" ]; then + echo "model.safetensors already exists in '$CHECKPOINT_DIR'. Skipping conversion." + exit 0 fi echo "Converting checkpoint in directory: $CHECKPOINT_DIR" @@ -51,5 +51,8 @@ python -m opentau.scripts.zero_to_fp32 "$CHECKPOINT_DIR" "$CHECKPOINT_DIR/full_s echo "Step 2: Converting pytorch_model.bin to model.safetensors..." python -m opentau.scripts.bin_to_safetensors "$CHECKPOINT_DIR/full_state_dict/pytorch_model.bin" --output_file "$CHECKPOINT_DIR/model.safetensors" +echo "Step 3: Cleaning up intermediate files..." +rm -rf "$CHECKPOINT_DIR/full_state_dict" + echo "Conversion completed successfully!" echo "Model saved as: $CHECKPOINT_DIR/model.safetensors"