diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 8b5954078..bc7dd7d52 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -31,7 +31,7 @@ Make sure the following tasks are completed before submitting the PR:
 
 ### General
 
-- [ ] 📜 I have read and followed the [contributing guidelines](https://servicenow.github.io/Fast-LLM/developers/contributing).
+- [ ] 📜 I have read and followed the [contributing guidelines](https://servicenow.github.io/Fast-LLM/contributing/contributing).
 - [ ] 🏷️ I am using a clear and descriptive PR title that summarizes the key change or feature introduced.
 - [ ] 🎉 The functionality is complete, and I have tested the changes.
 - [ ] 📝 I have updated the documentation if needed.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 16580f7d1..b64ae1ea7 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,3 @@
 # Contributing to Fast-LLM
 
-Please refer to the [contributing guidelines](https://servicenow.github.io/Fast-LLM/developers/contributing) for more information on how to contribute to Fast-LLM.
+Please refer to the [contributing guidelines](https://servicenow.github.io/Fast-LLM/contributing/contributing/) for more information on how to contribute to Fast-LLM.
diff --git a/fast_llm/config.py b/fast_llm/config.py
index c8555e448..6cebdc1c6 100644
--- a/fast_llm/config.py
+++ b/fast_llm/config.py
@@ -41,7 +41,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 
 class UpdateType(str, enum.Enum):
-    # Override entries no matter what they contais.
+    # Override entries no matter what they contain.
     override = "override"
     # Override atomic entries and lists, but update dicts recursively by setting or overriding only the specified entries.
     update = "update"
diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py
index e679cfd6f..554da8cd1 100644
--- a/fast_llm/engine/inference/huggingface.py
+++ b/fast_llm/engine/inference/huggingface.py
@@ -78,7 +78,7 @@ def from_pretrained(
         )
 
         # Create the model
-        # always set up model and crate distributed instance internally for now
+        # always set up model and create distributed instance internally for now
         fast_llm_model = cls.runner_class.model_class.from_pretrained(
             pretrained_model_name_or_path,
             *updates,
diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py
index 49dacb914..bf0128c89 100644
--- a/fast_llm/layers/ssm/discrete_mamba2.py
+++ b/fast_llm/layers/ssm/discrete_mamba2.py
@@ -11,7 +11,7 @@
 from fast_llm.tensor import ParameterMeta, init_ones_, init_uniform_, init_zeros_, kaiming_init_
 
 """
-This code is adapted fropm https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
+This code is adapted from https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py
 """
 
 
@@ -65,7 +65,7 @@ def __init__(
         self.act = config.activation_type.activation_fn
         self.activation_name = config.activation_type.name
 
-        # TODO: double check innitializations
+        # TODO: double check initializations
         # Projections
         self.in_proj = Linear(td_model, td_inner_proj, bias=bias, weight_init_method=kaiming_init_(td_model.size))
         self.z_bias = (
diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py
index 4704b5228..3d9cc05b8 100644
--- a/fast_llm/layers/ssm/mamba_layer.py
+++ b/fast_llm/layers/ssm/mamba_layer.py
@@ -11,7 +11,7 @@
 from fast_llm.tensor import ParameterMeta, init_ones_, kaiming_init_
 
 """
-Note: this is mostly addapted from https://github.com/Zyphra/Zamba2, similar code is aslo in https://github.com/state-spaces/mamba.
+Note: this is mostly adapted from https://github.com/Zyphra/Zamba2, similar code is also in https://github.com/state-spaces/mamba.
 For now it only supports training and not inference.
 This works with triton 3.1.0
 """
@@ -20,7 +20,7 @@
 def init_A(d_state, d_inner) -> Callable[[ParameterMeta, torch.Tensor, torch.Generator], torch.Tensor]:
     def init_(meta: ParameterMeta, tensor: torch.Tensor, generator: torch.Generator):  # noqa
         # S4D real initialization
-        # TODO: adopt this innitialization to work for tensor parallel setting!
+        # TODO: adopt this initialization to work for tensor parallel setting!
         A = einops.repeat(torch.arange(1, d_state + 1, dtype=torch.float32), "n -> d n", d=d_inner).contiguous()
         A_log = torch.log(A)  # Keep A_log in fp32
         if tensor.shape != A_log.shape:
@@ -106,7 +106,7 @@ def __init__(
         )
         self.x_proj.weight.auto_grad_accumulation = True
 
-        # TODO: the weights are innitialized a bit differently here https://github.com/state-spaces/mamba/blob/0cce0fa645f100f00620ddf2333c2b7712abfdec/mamba_ssm/modules/mamba_simple.py#L82
+        # TODO: the weights are initialized a bit differently here https://github.com/state-spaces/mamba/blob/0cce0fa645f100f00620ddf2333c2b7712abfdec/mamba_ssm/modules/mamba_simple.py#L82
         self.dt_proj_weight = ParameterMeta.from_dims(
             (td_inner, tdt_rank),
             init_method=kaiming_init_(tdt_rank.size),
diff --git a/fast_llm/layers/transformer/preprocessing.py b/fast_llm/layers/transformer/preprocessing.py
index 0697bd216..bedab9f63 100644
--- a/fast_llm/layers/transformer/preprocessing.py
+++ b/fast_llm/layers/transformer/preprocessing.py
@@ -109,7 +109,7 @@ def get_rotary_frequencies(
     # `exp(i * n * a) = cos(n * a) + i sin(n * a)`,
     # `a = theta ** - (2 * (channel // 2) / kv_channels)`,
     # where n is the position in the sequence.
-    # We preform the calculation in high precision because it matters for rotary embeddings.
+    # We perform the calculation in high precision because it matters for rotary embeddings.
     positions = torch.arange(sequence_length, device=device, dtype=torch.float64)
     frequencies = config.theta ** -torch.arange(0, 1, 2 / kv_channels, device=device, dtype=torch.float64)
     # Apply scaling
diff --git a/fast_llm/utils.py b/fast_llm/utils.py
index d89c9d760..61ac1014c 100644
--- a/fast_llm/utils.py
+++ b/fast_llm/utils.py
@@ -290,7 +290,7 @@ def __call__(self, *args, **kwargs):
 
 
 def try_decorate(get_decorator: Callable, _return_decorator: bool = True) -> Callable:
     """
-    Try to decorate an object, but ignore the error until the object is actualy used.
+    Try to decorate an object, but ignore the error until the object is actually used.
     The wrapped decorator should always be instantiated before calling, i.e.. called as `@decorator()` rather than `@decorator`.
     """
diff --git a/setup.cfg b/setup.cfg
index 67c5093d2..bf9416678 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,7 @@ OPTIONAL =
     # Hydra
     hydra-core>=1.3.2
     omegaconf>=2.3.0
-    # Miscellanous
+    # Miscellaneous
     requests>=2.32.3
     tqdm>=4.66.3