From aaa50525c9743965cd3404af8a36f81778806bfe Mon Sep 17 00:00:00 2001 From: adil-a Date: Fri, 19 Jan 2024 17:56:13 -0500 Subject: [PATCH 01/10] added peft req --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b0da93b..bdf0065 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ transformers sentencepiece tqdm wandb -python-box \ No newline at end of file +python-box +peft \ No newline at end of file From 4fe62e5e0c02fe011a6ff9af28f7c8d35b86e6a5 Mon Sep 17 00:00:00 2001 From: adil-a Date: Fri, 19 Jan 2024 17:56:48 -0500 Subject: [PATCH 02/10] added a dev-requirements file and fixed formatting --- dev-requirements.txt | 2 +- vectorlm/utils/model_utils.py | 46 +++++++++++++++++++++++++------ vectorlm/utils/optimizer_utils.py | 3 ++ 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 82b7387..3a19038 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,3 +1,3 @@ # Ruff linter for fast and opinionated linting -ruff>=0.1.13 \ No newline at end of file +ruff>=0.1.13 diff --git a/vectorlm/utils/model_utils.py b/vectorlm/utils/model_utils.py index 6a3a5b8..09d5c27 100644 --- a/vectorlm/utils/model_utils.py +++ b/vectorlm/utils/model_utils.py @@ -34,12 +34,12 @@ def load_peft_model_and_tokenizer( adapter_name: str = "default", is_trainable: bool = False, config: PeftConfig | None = None, - **kwargs: Any, ) -> tuple[PeftModel, PreTrainedTokenizer]: - """Loads a trained PEFT adapter (e.g. using LORA) to the base model and returns the PeftModel - E.g., a base llama-2-13b-chat-hf w/ adapter named nifty - ├── adapters_lora - ├── llama-2-13b-chat-hf+nifty + """Load a trained PEFT adapter to the base model and return the PeftModel. + + E.g., a base llama-2-13b-chat-hf w/ adapter named nifty + ├── adapters_lora + ├── llama-2-13b-chat-hf+nifty Args: ---- @@ -47,14 +47,30 @@ def load_peft_model_and_tokenizer( use_mp: Whether to use mixed-precision. use_fa: Whether to use Flash Attention 2. max_seq_len: The maximum sequence length. - peft_adapter_path: path to the adapter model, e.g. adapters_lora/llama-2-13b-chat-hf+nifty + peft_adapter_path: path to the adapter model, e.g. + adapters_lora/llama-2-13b-chat-hf+nifty adapter_name: e.g. nifty is_trainable: train or inference mode config: additional configs + + Returns: + ------- + The PEFT model and tokenizer. """ - model, tokenizer = load_model_and_tokenizer(path, use_mp, use_fa, max_seq_len) - peft_model = PeftModel.from_pretrained(model, peft_adapter_path, adapter_name, is_trainable, config=config, **kwargs) - return peft_model + model, tokenizer = load_model_and_tokenizer( + path, + use_mp, + use_fa, + max_seq_len, + ) + peft_model = PeftModel.from_pretrained( + model, + peft_adapter_path, + adapter_name, + is_trainable, + config, + ) + return peft_model, tokenizer def load_model_and_tokenizer( path: str, @@ -70,6 +86,10 @@ def load_model_and_tokenizer( use_mp: Whether to use mixed-precision. use_fa: Whether to use Flash Attention 2. max_seq_len: The maximum sequence length. + + Returns: + ------- + The model and tokenizer. """ # load model model_args = {"use_cache": False} @@ -113,6 +133,10 @@ def fsdp_config( use_mp: Whether to use mixed-precision. layer_to_wrap: The layer we are wrapping using FSDP. strategy: The sharding strategy to use. + + Returns: + ------- + A dictionary containing the configurations. 
""" strategy_exists = hasattr(ShardingStrategy, strategy) if not strategy_exists: @@ -156,6 +180,10 @@ def shard_model( use_mp: Whether to use mixed-precision. use_activation_checkpointing: Whether to use activation checkpointing. strategy: The sharding strategy to use. + + Returns: + ------- + The sharded module with the requested configurations. """ fsdp_cfg = fsdp_config(use_mp, layer_to_wrap, strategy) if dist.get_rank() == 0: diff --git a/vectorlm/utils/optimizer_utils.py b/vectorlm/utils/optimizer_utils.py index 46badf8..ff479a9 100644 --- a/vectorlm/utils/optimizer_utils.py +++ b/vectorlm/utils/optimizer_utils.py @@ -159,6 +159,9 @@ def get_custom_scheduler( name: The name of the scheduler args: The scheduler specific args. kwargs: The scheduler specific kwargs. + + Returns: + The scheduler. """ if name == "plataeu-with-warmup": scheduler = PlateaeuWithWarmup(*args, **kwargs) From 1690a522caa4cd72d1112c9a5c665f9131230f3d Mon Sep 17 00:00:00 2001 From: adil-a Date: Fri, 19 Jan 2024 18:30:07 -0500 Subject: [PATCH 03/10] added functionality to reset dataloader after epoch finishes --- examples/llama_example.py | 21 +++++++++++---------- vectorlm/dataset.py | 1 - 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/llama_example.py b/examples/llama_example.py index d9c4caa..9e9f4eb 100644 --- a/examples/llama_example.py +++ b/examples/llama_example.py @@ -124,16 +124,17 @@ def main(config: Config) -> None: batch = next(train_dl_iterator) trainer.step(batch, epoch) - if epoch == training_args.epochs - 1: - hf_save_dir = os.path.join(training_args.output_dir, "final-model") - else: - hf_save_dir = os.path.join( - training_args.output_dir, - "checkpoints", - f"epoch_{epoch}", - "end-epoch-model", - ) - save_consolidated_model(trainer.model, hf_save_dir, rank) + if epoch == training_args.epochs - 1: + hf_save_dir = os.path.join(training_args.output_dir, "final-model") + else: + hf_save_dir = os.path.join( + training_args.output_dir, + "checkpoints", + f"epoch_{epoch}", + "end-epoch-model", + ) + save_consolidated_model(trainer.model, hf_save_dir, rank) + dataset.reset_dataloaders() if __name__ == "__main__": args = parse_args() diff --git a/vectorlm/dataset.py b/vectorlm/dataset.py index 3a4a225..768ef8f 100644 --- a/vectorlm/dataset.py +++ b/vectorlm/dataset.py @@ -57,7 +57,6 @@ def __init__( def reset_dataloaders(self) -> None: """Reset dataloaders.""" self._processed_ids = torch.tensor([]).to(torch.cuda.current_device()) - self.setup_dataloaders() def update_processed_ids(self, new_ids: torch.Tensor) -> None: """Update processed ids with an incoming stream of ids.""" From 1db4d0e9d70cb2da23639ebf73e5c1cd92cb95c1 Mon Sep 17 00:00:00 2001 From: adil-a Date: Fri, 19 Jan 2024 20:50:19 -0500 Subject: [PATCH 04/10] Mistral training now works (padding token was incorrectly set) --- vectorlm/utils/data_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vectorlm/utils/data_utils.py b/vectorlm/utils/data_utils.py index 8a75ef1..4e14dd4 100644 --- a/vectorlm/utils/data_utils.py +++ b/vectorlm/utils/data_utils.py @@ -129,7 +129,6 @@ def __call__( batch["input_ids"] = input_ids batch["labels"] = labels batch["attention_mask"] = batch["input_ids"].ne(self.pad_token_id) - # print(batch["attention_mask"]) return batch def _reverse_tensor( From 03a25f583afae9c10a36a67e646ffe3a58d47d43 Mon Sep 17 00:00:00 2001 From: adil-a Date: Fri, 19 Jan 2024 21:04:31 -0500 Subject: [PATCH 05/10] added config ignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff 
--git a/.gitignore b/.gitignore index 4f554fb..03e800e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *.egg-info __pycache__/ wandb/ -build/ \ No newline at end of file +build/ +configs/ \ No newline at end of file From c275f5d4c440db14c13114f6126d0a1daab1b789 Mon Sep 17 00:00:00 2001 From: adil-a Date: Fri, 19 Jan 2024 21:09:58 -0500 Subject: [PATCH 06/10] reverting gitignore --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 03e800e..4f554fb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,4 @@ *.egg-info __pycache__/ wandb/ -build/ -configs/ \ No newline at end of file +build/ \ No newline at end of file From 4d3d85aa574191392a9df90d0e3e8551bf4c6d43 Mon Sep 17 00:00:00 2001 From: adil-a Date: Mon, 22 Jan 2024 20:36:45 -0500 Subject: [PATCH 07/10] two bugs. one, labels for bos/eos tokens weren't added properly if there was a sequence separator. two, the attention mask had the eos/bos token id instead of a [1] for those tokens specifically --- .gitignore | 1 + preprocess_data.py | 74 ++++++++++++++++++++++++++++++++-------------- setup.py | 2 +- 3 files changed, 54 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 4f554fb..fdf00fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.out *.err *.egg-info +**/*.sh __pycache__/ wandb/ build/ \ No newline at end of file diff --git a/preprocess_data.py b/preprocess_data.py index adc72e3..239bf60 100644 --- a/preprocess_data.py +++ b/preprocess_data.py @@ -104,10 +104,12 @@ def tokenize_dataset( prompt = f"{bos}{example}{eos}" if not separator: tokenized = tokenizer.encode(prompt, add_special_tokens=False) - if truncate and len(tokenized) > tokenizer.max_model_length: - tokenized = tokenized[:tokenizer.max_model_length] + if truncate and len(tokenized) > tokenizer.model_max_length: + tokenized = tokenized[:tokenizer.model_max_length] all_labels.append(deepcopy(tokenized)) else: + if separator not in prompt: + continue separation_idx = prompt.find(separator) + len(separator) prefix, postfix = prompt[:separation_idx], prompt[separation_idx:] tokenized_prefix = tokenizer.encode( @@ -117,12 +119,22 @@ def tokenize_dataset( postfix, add_special_tokens=False, ) tokenized = tokenized_prefix + tokenized_postfix - if truncate and len(tokenized) > tokenizer.max_model_length: - tokenized = tokenized[:tokenizer.max_model_length] - tokenized = tokenized_prefix + tokenized_postfix - all_labels.append( - [-100] * len(tokenized_prefix) + deepcopy(tokenized_postfix), - ) + if truncate and len(tokenized) > tokenizer.model_max_length: + tokenized = tokenized[:tokenizer.model_max_length] + if add_bos_eos: + label = ( + [tokenizer.bos_token_id] + ( + [-100] * (len(tokenized_prefix) - 1) + ) + deepcopy(tokenized_postfix) + ) + else: + label = ( + [-100] * len( + tokenized_prefix + ) + deepcopy(tokenized_postfix) + ) + # If truncated, labels should be the same. + all_labels.append(label[:len(tokenized)]) all_input_ids.append(tokenized) all_attention_mask.append([1] * len(tokenized)) @@ -160,7 +172,8 @@ def pack_examples( """ chunk_size = tokenizer.model_max_length if add_bos_eos: - chunk_size -= 2 # For BOS and EOS tokens. + # For BOS and EOS tokens. 
+ chunk_size -= 2 bos, eos = [tokenizer.bos_token_id], [tokenizer.eos_token_id] else: bos, eos = [], [] @@ -169,25 +182,42 @@ def pack_examples( if packing_type == "full": joined_examples = {k: sum(examples[k], []) for k in all_keys} total_length = len(joined_examples["input_ids"]) - result = { - k: [ - bos + v[i:i + chunk_size] + eos for i in range( - 0, total_length, stride, - ) - ] for k, v in joined_examples.items() - } + result = {} + for k, v in joined_examples.items(): + value_chunked_lst = [] + for i in range(0, total_length, stride): + if k != "attention_mask": + value_chunked_lst.append(bos + v[i:i + chunk_size] + eos) + else: + if add_bos_eos: + # Need to do this explicitly because attention mask + # is just 1s or 0s. + value_chunked_lst.append( + [1] + v[i:i + chunk_size] + [1] + ) + else: + value_chunked_lst.append(v[i:i + chunk_size]) elif packing_type == "partial": result = {k:[] for k in examples} _key = all_keys[0] for idx in range(len(examples[_key])): total_length = len(examples[_key][idx]) for key in all_keys: - sliced_example = [ - ( - bos + examples[key][idx][i:i + chunk_size] + eos - ) for i in range(0, total_length, stride) - ] - result[key].extend(sliced_example) + for i in range(0, total_length, stride): + if key != "attention_mask": + sliced_example = [ + bos + examples[key][idx][i:i + chunk_size] + eos + ] + else: + if add_bos_eos: + sliced_example = [ + [1] + examples[key][idx][i:i + chunk_size] + [1] + ] + else: + sliced_example = [ + examples[key][idx][i:i + chunk_size] + ] + result[key].extend(sliced_example) else: msg = "`packing_type` needs to either be `full` or `partial`." raise ValueError(msg) diff --git a/setup.py b/setup.py index cc43024..3e1a4d9 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="vectorlm", - version="1.0", + version="0.1.0", packages=find_packages(), install_requires=requirements, python_requires=">=3.10", From 4944bb4ee48373f6dcc2f8bc639d26649745acde Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 23 Jan 2024 11:06:20 -0500 Subject: [PATCH 08/10] added gotcha for separator in data preprocessing --- docs/config.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/config.md b/docs/config.md index d31646e..db56806 100644 --- a/docs/config.md +++ b/docs/config.md @@ -65,7 +65,7 @@ Similar to the wandb config above, these keyword parameters are fed directly int * `overlap`: When we chunk a data point during packing, we can choose to have some overlap between the current chunk and the next chunk. This might help the model understand surrounding context during training (although this isn't something we have empirically investigated, we keep this option available to users). * `add_bos_eos_tokens`: Whether to add `BOS` and `EOS` tokens as defined by the respective HuggingFace tokenizer. If using packing, these will be added after packing is done, so that each chunk of size `max_seq_len` has these tokens. * `from_disk`: Whether we are going to be loading the dataset to preprocess from disk (the other option is to download straight from HuggingFace). -* `seperator`: If using conditional finetuning (i.e. in a given data point, everything before `separator` will not be used for calculating the loss and its labels will be `ignore_index`). +* `seperator`: If using conditional finetuning (i.e. in a given data point, everything before `separator` will not be used for calculating the loss and its labels will be `ignore_index`). 
**Note:** if `separator` is not found in a given sequence, the default behavior is that datapoint will be skipped and not be a part of the final set. * `load_path`: The directory containing the HuggingFace dataset we are loading to preprocess. * `split`: If `load_path` is a dataset dictionary, `split` specifies which key in this dictionary contains the dataset we are preprocessing. * `save_path`: The directory we will be saving the processed dataset to. From f8f47ded5d8d84138ccd087dcf36e49ca129bc94 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 23 Jan 2024 11:43:15 -0500 Subject: [PATCH 09/10] fixed small bug in truncation where eos token should be added after truncation --- preprocess_data.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/preprocess_data.py b/preprocess_data.py index 239bf60..9acbf63 100644 --- a/preprocess_data.py +++ b/preprocess_data.py @@ -93,15 +93,18 @@ def tokenize_dataset( all_input_ids = [] all_attention_mask = [] all_labels = [] + # Adding bos/eos if add_bos_eos: bos, eos = tokenizer.bos_token, tokenizer.eos_token else: bos, eos = "", "" for example in examples[data_field]: + # If we want to include a prepended prompt to each datapoint if pre_pend: prompt = f"{bos}{pre_pend}{example}{eos}" else: prompt = f"{bos}{example}{eos}" + # If we've specified a separator present in each sequence if not separator: tokenized = tokenizer.encode(prompt, add_special_tokens=False) if truncate and len(tokenized) > tokenizer.model_max_length: @@ -110,6 +113,7 @@ def tokenize_dataset( else: if separator not in prompt: continue + # Perform tokenization separately to allow for conditional prompting separation_idx = prompt.find(separator) + len(separator) prefix, postfix = prompt[:separation_idx], prompt[separation_idx:] tokenized_prefix = tokenizer.encode( @@ -120,21 +124,26 @@ def tokenize_dataset( ) tokenized = tokenized_prefix + tokenized_postfix if truncate and len(tokenized) > tokenizer.model_max_length: - tokenized = tokenized[:tokenizer.model_max_length] + tokenized = tokenized[:tokenizer.model_max_length - 1] + tokenized.append(tokenizer.eos_token_id) + # We need to address this separately, because labels need to + # backprop on bos/eos tokens if add_bos_eos: label = ( - [tokenizer.bos_token_id] + ( - [-100] * (len(tokenized_prefix) - 1) - ) + deepcopy(tokenized_postfix) + [tokenizer.bos_token_id] + + ([-100] * (len(tokenized_prefix) - 1)) + + deepcopy(tokenized_postfix) ) else: label = ( - [-100] * len( - tokenized_prefix - ) + deepcopy(tokenized_postfix) + [-100] * len(tokenized_prefix) + + deepcopy(tokenized_postfix) ) # If truncated, labels should be the same. 
- all_labels.append(label[:len(tokenized)]) + if truncate and len(label) > tokenizer.model_max_length: + label = label[:tokenizer.model_max_length - 1] + label.append(tokenizer.eos_token_id) + all_labels.append(label) all_input_ids.append(tokenized) all_attention_mask.append([1] * len(tokenized)) From 742a9b1748099fc03529d3ebeb4f6d4418432e47 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 23 Jan 2024 11:44:38 -0500 Subject: [PATCH 10/10] fixed small bug in truncation where eos token should be added after truncation --- preprocess_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/preprocess_data.py b/preprocess_data.py index 9acbf63..7d6e3c7 100644 --- a/preprocess_data.py +++ b/preprocess_data.py @@ -108,7 +108,8 @@ def tokenize_dataset( if not separator: tokenized = tokenizer.encode(prompt, add_special_tokens=False) if truncate and len(tokenized) > tokenizer.model_max_length: - tokenized = tokenized[:tokenizer.model_max_length] + tokenized = tokenized[:tokenizer.model_max_length - 1] + tokenized.append(tokenizer.eos_token_id) all_labels.append(deepcopy(tokenized)) else: if separator not in prompt:
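
To summarize the behaviour that patches 07, 09, and 10 converge on for conditional finetuning, here is a minimal, self-contained sketch. It is illustrative only, not the repository's implementation: the function name `build_example` and the raw token ids are invented for the example, and bos/eos handling is simplified to plain id lists rather than the string-level prepending the patches use. The idea it demonstrates is the one described in the commits: everything up to and including the separator is masked out of the loss with `-100` (the BOS token keeps its real id), truncation cuts to `max_len - 1` and re-appends EOS so every example still ends with EOS, and the attention mask is plain 1s rather than token ids.

    # Illustrative sketch of the label/truncation scheme; names and ids are hypothetical.
    from copy import deepcopy

    IGNORE_INDEX = -100


    def build_example(
        prefix_ids: list[int],
        postfix_ids: list[int],
        bos_id: int,
        eos_id: int,
        max_len: int,
    ) -> dict[str, list[int]]:
        """Assemble input_ids, labels, and attention_mask for one separated example.

        The prefix (everything up to and including the separator) is masked out of
        the loss with IGNORE_INDEX, except for BOS, which keeps its real id. If the
        sequence is too long, cut to max_len - 1 and re-append EOS so the example
        still terminates with the EOS token.
        """
        input_ids = [bos_id, *prefix_ids, *postfix_ids, eos_id]
        labels = (
            [bos_id]
            + [IGNORE_INDEX] * len(prefix_ids)
            + deepcopy(postfix_ids)
            + [eos_id]
        )
        if len(input_ids) > max_len:
            input_ids = input_ids[: max_len - 1] + [eos_id]
            labels = labels[: max_len - 1] + [eos_id]
        # Attention mask is plain 1s over real tokens, not bos/eos token ids.
        attention_mask = [1] * len(input_ids)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }


    if __name__ == "__main__":
        example = build_example(
            prefix_ids=[11, 12, 13],      # tokens up to and including the separator
            postfix_ids=[21, 22, 23, 24],
            bos_id=1,
            eos_id=2,
            max_len=8,
        )
        print(example["input_ids"])       # [1, 11, 12, 13, 21, 22, 23, 2]
        print(example["labels"])          # [1, -100, -100, -100, 21, 22, 23, 2]
        print(example["attention_mask"])  # [1, 1, 1, 1, 1, 1, 1, 1]

Under these assumptions, a datapoint whose text does not contain the separator would simply be skipped before reaching this step, matching the note added to docs/config.md in patch 08.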