2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -31,7 +31,7 @@ Make sure the following tasks are completed before submitting the PR:

### General

-- [ ] πŸ“œ I have read and followed the [contributing guidelines](https://servicenow.github.io/Fast-LLM/developers/contributing).
+- [ ] πŸ“œ I have read and followed the [contributing guidelines](https://servicenow.github.io/Fast-LLM/contributing/contributing).
- [ ] 🏷️ I am using a clear and descriptive PR title that summarizes the key change or feature introduced.
- [ ] πŸŽ‰ The functionality is complete, and I have tested the changes.
- [ ] πŸ“ I have updated the documentation if needed.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -1,3 +1,3 @@
# Contributing to Fast-LLM

-Please refer to the [contributing guidelines](https://servicenow.github.io/Fast-LLM/developers/contributing) for more information on how to contribute to Fast-LLM.
+Please refer to the [contributing guidelines](https://servicenow.github.io/Fast-LLM/contributing/contributing/) for more information on how to contribute to Fast-LLM.
2 changes: 1 addition & 1 deletion fast_llm/config.py
@@ -41,7 +41,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):


class UpdateType(str, enum.Enum):
-# Override entries no matter what they contais.
+# Override entries no matter what they contain.
override = "override"
# Override atomic entries and lists, but update dicts recursively by setting or overriding only the specified entries.
update = "update"
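
For context on the `UpdateType` hunk above: "override" replaces an entry wholesale, while "update" recurses into nested dicts and only touches the keys that are explicitly given. A minimal, hypothetical sketch of that distinction on plain dicts (not Fast-LLM's actual config machinery):

```python
def merge(base: dict, new: dict, mode: str = "update") -> dict:
    # "override": replace each entry no matter what it contains.
    # "update": replace atomic entries and lists, but recurse into dicts so only
    # the specified keys are set or overridden.
    result = dict(base)
    for key, value in new.items():
        if mode == "update" and isinstance(value, dict) and isinstance(result.get(key), dict):
            result[key] = merge(result[key], value, mode)
        else:
            result[key] = value
    return result


base = {"optimizer": {"lr": 1e-3, "beta1": 0.9}}
print(merge(base, {"optimizer": {"lr": 3e-4}}, mode="override"))  # {'optimizer': {'lr': 0.0003}}
print(merge(base, {"optimizer": {"lr": 3e-4}}, mode="update"))    # {'optimizer': {'lr': 0.0003, 'beta1': 0.9}}
```
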
2 changes: 1 addition & 1 deletion fast_llm/engine/inference/huggingface.py
@@ -78,7 +78,7 @@ def from_pretrained(
)

# Create the model
-# always set up model and crate distributed instance internally for now
+# always set up model and create distributed instance internally for now
fast_llm_model = cls.runner_class.model_class.from_pretrained(
pretrained_model_name_or_path,
*updates,
4 changes: 2 additions & 2 deletions fast_llm/layers/ssm/discrete_mamba2.py
@@ -11,7 +11,7 @@
from fast_llm.tensor import ParameterMeta, init_ones_, init_uniform_, init_zeros_, kaiming_init_

"""
-This code is adapted fropm https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
+This code is adapted from https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
"""


@@ -65,7 +65,7 @@ def __init__(
self.act = config.activation_type.activation_fn
self.activation_name = config.activation_type.name

-# TODO: double check innitializations
+# TODO: double check initializations
# Projections
self.in_proj = Linear(td_model, td_inner_proj, bias=bias, weight_init_method=kaiming_init_(td_model.size))
self.z_bias = (
6 changes: 3 additions & 3 deletions fast_llm/layers/ssm/mamba_layer.py
@@ -11,7 +11,7 @@
from fast_llm.tensor import ParameterMeta, init_ones_, kaiming_init_

"""
-Note: this is mostly addapted from https://github.com/Zyphra/Zamba2, similar code is aslo in https://github.com/state-spaces/mamba.
+Note: this is mostly adapted from https://github.com/Zyphra/Zamba2, similar code is also in https://github.com/state-spaces/mamba.
For now it only supports training and not inference.
This works with triton 3.1.0
"""
@@ -20,7 +20,7 @@
def init_A(d_state, d_inner) -> Callable[[ParameterMeta, torch.Tensor, torch.Generator], torch.Tensor]:
def init_(meta: ParameterMeta, tensor: torch.Tensor, generator: torch.Generator): # noqa
# S4D real initialization
-# TODO: adopt this innitialization to work for tensor parallel setting!
+# TODO: adopt this initialization to work for tensor parallel setting!
A = einops.repeat(torch.arange(1, d_state + 1, dtype=torch.float32), "n -> d n", d=d_inner).contiguous()
A_log = torch.log(A) # Keep A_log in fp32
if tensor.shape != A_log.shape:
@@ -106,7 +106,7 @@ def __init__(
)
self.x_proj.weight.auto_grad_accumulation = True

-# TODO: the weights are innitialized a bit differently here https://github.com/state-spaces/mamba/blob/0cce0fa645f100f00620ddf2333c2b7712abfdec/mamba_ssm/modules/mamba_simple.py#L82
+# TODO: the weights are initialized a bit differently here https://github.com/state-spaces/mamba/blob/0cce0fa645f100f00620ddf2333c2b7712abfdec/mamba_ssm/modules/mamba_simple.py#L82
self.dt_proj_weight = ParameterMeta.from_dims(
(td_inner, tdt_rank),
init_method=kaiming_init_(tdt_rank.size),
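
As a side note on the `init_A` hunk above: the S4D real initialization builds a `(d_inner, d_state)` matrix whose every row is `1, 2, ..., d_state` and stores its log in fp32. A minimal sketch of that computation outside Fast-LLM's `ParameterMeta` machinery, with toy sizes assumed for illustration:

```python
import einops
import torch

d_state, d_inner = 4, 3  # toy sizes for illustration
# Every row of A is [1, 2, ..., d_state]; the stored parameter is log(A), kept in fp32.
A = einops.repeat(torch.arange(1, d_state + 1, dtype=torch.float32), "n -> d n", d=d_inner).contiguous()
A_log = torch.log(A)
print(A_log.shape)  # torch.Size([3, 4])
print(A_log[0])     # tensor([0.0000, 0.6931, 1.0986, 1.3863])
```
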
2 changes: 1 addition & 1 deletion fast_llm/layers/transformer/preprocessing.py
@@ -109,7 +109,7 @@ def get_rotary_frequencies(
# `exp(i * n * a) = cos(n * a) + i sin(n * a)`,
# `a = theta ** - (2 * (channel // 2) / kv_channels)`,
# where n is the position in the sequence.
-# We preform the calculation in high precision because it matters for rotary embeddings.
+# We perform the calculation in high precision because it matters for rotary embeddings.
positions = torch.arange(sequence_length, device=device, dtype=torch.float64)
frequencies = config.theta ** -torch.arange(0, 1, 2 / kv_channels, device=device, dtype=torch.float64)
# Apply scaling
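
For context on the hunk above (the change itself is only "preform" β†’ "perform"): the surrounding code builds the rotary angle table, and computing it in float64 keeps the angles `n * a` accurate at large positions before casting back down. A minimal standalone sketch of that calculation, assuming standard RoPE with `theta = 10000` (not the module's actual preprocessing code):

```python
import torch

sequence_length, kv_channels, theta = 2048, 128, 10000.0
# a = theta ** -(2 * (channel // 2) / kv_channels); the angles n * a are computed in float64.
positions = torch.arange(sequence_length, dtype=torch.float64)
frequencies = theta ** -torch.arange(0, 1, 2 / kv_channels, dtype=torch.float64)
angles = torch.outer(positions, frequencies)           # (sequence_length, kv_channels // 2)
rotary = torch.polar(torch.ones_like(angles), angles)  # exp(i * n * a) = cos(n * a) + i * sin(n * a)
print(rotary.shape, rotary.dtype)  # torch.Size([2048, 64]) torch.complex128
```
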
2 changes: 1 addition & 1 deletion fast_llm/utils.py
@@ -290,7 +290,7 @@ def __call__(self, *args, **kwargs):

def try_decorate(get_decorator: Callable, _return_decorator: bool = True) -> Callable:
"""
-Try to decorate an object, but ignore the error until the object is actualy used.
+Try to decorate an object, but ignore the error until the object is actually used.
The wrapped decorator should always be instantiated before calling,
i.e.. called as `@decorator()` rather than `@decorator`.
"""
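
The docstring touched above describes a deferred-failure pattern: try to apply a decorator, and if obtaining it fails (for example because an optional dependency is missing), only raise once the decorated object is actually used. A rough, hypothetical sketch of that idea, not Fast-LLM's implementation:

```python
from typing import Callable


def try_decorate(get_decorator: Callable[[], Callable]) -> Callable:
    # Hypothetical illustration; the wrapped decorator is instantiated lazily inside
    # `get_decorator`, e.g. used as `@try_decorate(lambda: some_decorator())`.
    def decorator(func: Callable) -> Callable:
        try:
            return get_decorator()(func)
        except Exception as error:  # e.g. an optional dependency is missing
            captured = error

            def raise_on_use(*args, **kwargs):
                # The failure only surfaces when the object is actually called.
                raise captured

            return raise_on_use

    return decorator
```
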
2 changes: 1 addition & 1 deletion setup.cfg
@@ -41,7 +41,7 @@ OPTIONAL =
# Hydra
hydra-core>=1.3.2
omegaconf>=2.3.0
-# Miscellanous
+# Miscellaneous
requests>=2.32.3
tqdm>=4.66.3
