diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 8b5954078..bc7dd7d52 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -31,7 +31,7 @@ Make sure the following tasks are completed before submitting the PR:
 
 ### General
 
-- [ ] 📜 I have read and followed the [contributing guidelines](https://servicenow.github.io/Fast-LLM/developers/contributing).
+- [ ] 📜 I have read and followed the [contributing guidelines](https://servicenow.github.io/Fast-LLM/contributing/contributing).
 - [ ] 🏷️ I am using a clear and descriptive PR title that summarizes the key change or feature introduced.
 - [ ] 🎉 The functionality is complete, and I have tested the changes.
 - [ ] 📝 I have updated the documentation if needed.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 16580f7d1..b64ae1ea7 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,3 @@
 # Contributing to Fast-LLM
 
-Please refer to the [contributing guidelines](https://servicenow.github.io/Fast-LLM/developers/contributing) for more information on how to contribute to Fast-LLM.
+Please refer to the [contributing guidelines](https://servicenow.github.io/Fast-LLM/contributing/contributing/) for more information on how to contribute to Fast-LLM.
diff --git a/fast_llm/config.py b/fast_llm/config.py
index c8555e448..6cebdc1c6 100644
--- a/fast_llm/config.py
+++ b/fast_llm/config.py
@@ -41,7 +41,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 
 class UpdateType(str, enum.Enum):
-    # Override entries no matter what they contais.
+    # Override entries no matter what they contain.
     override = "override"
     # Override atomic entries and lists, but update dicts recursively by setting or overriding only the specified entries.
     update = "update"
diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py
index e679cfd6f..554da8cd1 100644
--- a/fast_llm/engine/inference/huggingface.py
+++ b/fast_llm/engine/inference/huggingface.py
@@ -78,7 +78,7 @@ def from_pretrained(
         )
 
         # Create the model
-        # always set up model and crate distributed instance internally for now
+        # always set up model and create distributed instance internally for now
         fast_llm_model = cls.runner_class.model_class.from_pretrained(
             pretrained_model_name_or_path,
             *updates,
diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py
index 49dacb914..bf0128c89 100644
--- a/fast_llm/layers/ssm/discrete_mamba2.py
+++ b/fast_llm/layers/ssm/discrete_mamba2.py
@@ -11,7 +11,7 @@
 from fast_llm.tensor import ParameterMeta, init_ones_, init_uniform_, init_zeros_, kaiming_init_
 
 """
-This code is adapted fropm https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
+This code is adapted from https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
 """
 
 
@@ -65,7 +65,7 @@ def __init__(
         self.act = config.activation_type.activation_fn
         self.activation_name = config.activation_type.name
 
-        # TODO: double check innitializations
+        # TODO: double check initializations
         # Projections
         self.in_proj = Linear(td_model, td_inner_proj, bias=bias, weight_init_method=kaiming_init_(td_model.size))
         self.z_bias = (
diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py
index 4704b5228..3d9cc05b8 100644
--- a/fast_llm/layers/ssm/mamba_layer.py
+++ b/fast_llm/layers/ssm/mamba_layer.py
@@ -11,7 +11,7 @@
 from fast_llm.tensor import ParameterMeta, init_ones_, kaiming_init_
 
 """
-Note: this is mostly addapted from https://github.com/Zyphra/Zamba2, similar code is aslo in https://github.com/state-spaces/mamba.
+Note: this is mostly adapted from https://github.com/Zyphra/Zamba2, similar code is also in https://github.com/state-spaces/mamba.
 For now it only supports training and not inference.
 This works with triton 3.1.0
 """
@@ -20,7 +20,7 @@
 def init_A(d_state, d_inner) -> Callable[[ParameterMeta, torch.Tensor, torch.Generator], torch.Tensor]:
     def init_(meta: ParameterMeta, tensor: torch.Tensor, generator: torch.Generator):  # noqa
         # S4D real initialization
-        # TODO: adopt this innitialization to work for tensor parallel setting!
+        # TODO: adopt this initialization to work for tensor parallel setting!
         A = einops.repeat(torch.arange(1, d_state + 1, dtype=torch.float32), "n -> d n", d=d_inner).contiguous()
         A_log = torch.log(A)  # Keep A_log in fp32
         if tensor.shape != A_log.shape:
@@ -106,7 +106,7 @@ def __init__(
         )
         self.x_proj.weight.auto_grad_accumulation = True
 
-        # TODO: the weights are innitialized a bit differently here https://github.com/state-spaces/mamba/blob/0cce0fa645f100f00620ddf2333c2b7712abfdec/mamba_ssm/modules/mamba_simple.py#L82
+        # TODO: the weights are initialized a bit differently here https://github.com/state-spaces/mamba/blob/0cce0fa645f100f00620ddf2333c2b7712abfdec/mamba_ssm/modules/mamba_simple.py#L82
         self.dt_proj_weight = ParameterMeta.from_dims(
             (td_inner, tdt_rank),
             init_method=kaiming_init_(tdt_rank.size),
diff --git a/fast_llm/layers/transformer/preprocessing.py b/fast_llm/layers/transformer/preprocessing.py
index 0697bd216..bedab9f63 100644
--- a/fast_llm/layers/transformer/preprocessing.py
+++ b/fast_llm/layers/transformer/preprocessing.py
@@ -109,7 +109,7 @@ def get_rotary_frequencies(
     # `exp(i * n * a) = cos(n * a) + i sin(n * a)`,
     # `a = theta ** - (2 * (channel // 2) / kv_channels)`,
     # where n is the position in the sequence.
-    # We preform the calculation in high precision because it matters for rotary embeddings.
+    # We perform the calculation in high precision because it matters for rotary embeddings.
     positions = torch.arange(sequence_length, device=device, dtype=torch.float64)
     frequencies = config.theta ** -torch.arange(0, 1, 2 / kv_channels, device=device, dtype=torch.float64)
     # Apply scaling
diff --git a/fast_llm/utils.py b/fast_llm/utils.py
index d89c9d760..61ac1014c 100644
--- a/fast_llm/utils.py
+++ b/fast_llm/utils.py
@@ -290,7 +290,7 @@ def __call__(self, *args, **kwargs):
 
 
 def try_decorate(get_decorator: Callable, _return_decorator: bool = True) -> Callable:
     """
-    Try to decorate an object, but ignore the error until the object is actualy used.
+    Try to decorate an object, but ignore the error until the object is actually used.
     The wrapped decorator should always be instantiated before calling, i.e.. called as `@decorator()` rather than `@decorator`.
     """
diff --git a/setup.cfg b/setup.cfg
index 67c5093d2..bf9416678 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,7 @@ OPTIONAL =
     # Hydra
     hydra-core>=1.3.2
     omegaconf>=2.3.0
-    # Miscellanous
+    # Miscellaneous
     requests>=2.32.3
     tqdm>=4.66.3