diff --git a/server/tests/utils/test_layers.py b/server/tests/utils/test_layers.py
index 93a0e98275..9a8da0d62d 100644
--- a/server/tests/utils/test_layers.py
+++ b/server/tests/utils/test_layers.py
@@ -1,5 +1,5 @@
 import torch
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelEmbedding,
 )
 
diff --git a/server/text_generation_server/layers/__init__.py b/server/text_generation_server/layers/__init__.py
new file mode 100644
index 0000000000..c3a6c921f9
--- /dev/null
+++ b/server/text_generation_server/layers/__init__.py
@@ -0,0 +1,14 @@
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+    TensorParallelEmbedding,
+)
+from text_generation_server.layers.speculative import SpeculativeHead
+from text_generation_server.layers.linear import (
+    get_linear,
+    FastLinear,
+)
+
+# Just to add the `load` methods.
+from text_generation_server.layers.layernorm import load_layer_norm
+from text_generation_server.layers.conv import load_conv2d
diff --git a/server/text_generation_server/utils/awq/conversion_utils.py b/server/text_generation_server/layers/awq/conversion_utils.py
similarity index 100%
rename from server/text_generation_server/utils/awq/conversion_utils.py
rename to server/text_generation_server/layers/awq/conversion_utils.py
diff --git a/server/text_generation_server/utils/awq/quantize/qmodule.py b/server/text_generation_server/layers/awq/quantize/qmodule.py
similarity index 100%
rename from server/text_generation_server/utils/awq/quantize/qmodule.py
rename to server/text_generation_server/layers/awq/quantize/qmodule.py
diff --git a/server/text_generation_server/layers/bnb.py b/server/text_generation_server/layers/bnb.py
new file mode 100644
index 0000000000..d27a33a13e
--- /dev/null
+++ b/server/text_generation_server/layers/bnb.py
@@ -0,0 +1,106 @@
+import torch
+from loguru import logger
+from functools import lru_cache
+import bitsandbytes as bnb
+from bitsandbytes.nn import Int8Params, Params4bit
+
+
+@lru_cache(1)
+def warn_deprecate_bnb():
+    logger.warning(
+        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce"
+    )
+
+
+class Linear8bitLt(torch.nn.Module):
+    def __init__(
+        self,
+        weight,
+        bias,
+        has_fp16_weights=True,
+        memory_efficient_backward=False,
+        threshold=0.0,
+        index=None,
+    ):
+        super().__init__()
+        assert (
+            not memory_efficient_backward
+        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
+        self.state = bnb.MatmulLtState()
+        self.index = index
+
+        # Necessary for stacked layers
+        self.state.threshold = threshold
+        self.state.has_fp16_weights = has_fp16_weights
+        self.state.memory_efficient_backward = memory_efficient_backward
+        if threshold > 0.0 and not has_fp16_weights:
+            self.state.use_pool = True
+
+        self.weight = Int8Params(
+            weight.data,
+            has_fp16_weights=has_fp16_weights,
+            requires_grad=has_fp16_weights,
+        )
+        self.weight.cuda(weight.device)
+        self.bias = bias
+
+    def init_8bit_state(self):
+        self.state.CB = self.weight.CB
+        self.state.SCB = self.weight.SCB
+        self.weight.CB = None
+        self.weight.SCB = None
+
+    def forward(self, x: torch.Tensor):
+        self.state.is_training = self.training
+        if self.weight.CB is not None:
+            self.init_8bit_state()
+
+        # weights are cast automatically as Int8Params, but the bias has to be cast manually
+        if self.bias is not None and self.bias.dtype != x.dtype:
+            self.bias.data = self.bias.data.to(x.dtype)
+
+        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
+
+        if not self.state.has_fp16_weights:
+            if self.state.CB is not None and self.state.CxB is not None:
+                # we converted 8-bit row major to turing/ampere format in the first inference pass
+                # we no longer need the row-major weight
+                del self.state.CB
+                self.weight.data = self.state.CxB
+        return out
+
+
+class Linear4bit(nn.Module):
+    def __init__(self, weight, bias, quant_type):
+        super().__init__()
+        self.weight = Params4bit(
+            weight.data,
+            requires_grad=False,
+            compress_statistics=True,
+            quant_type=quant_type,
+        )
+        self.compute_dtype = None
+        self.weight.cuda(weight.device)
+        self.bias = bias
+
+    def forward(self, x: torch.Tensor):
+        # weights are cast automatically as Int8Params, but the bias has to be cast manually
+        if self.bias is not None and self.bias.dtype != x.dtype:
+            self.bias.data = self.bias.data.to(x.dtype)
+
+        if getattr(self.weight, "quant_state", None) is None:
+            print(
+                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
+            )
+        inp_dtype = x.dtype
+        if self.compute_dtype is not None:
+            x = x.to(self.compute_dtype)
+
+        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+        out = bnb.matmul_4bit(
+            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
+        )
+
+        out = out.to(inp_dtype)
+
+        return out
diff --git a/server/text_generation_server/layers/conv.py b/server/text_generation_server/layers/conv.py
new file mode 100644
index 0000000000..7fb18ab3f0
--- /dev/null
+++ b/server/text_generation_server/layers/conv.py
@@ -0,0 +1,41 @@
+from accelerate import init_empty_weights
+import torch
+
+
+@classmethod
+def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    bias = weights.get_tensor(f"{prefix}.bias")
+    with init_empty_weights():
+        conv2d = cls(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+        )
+
+    conv2d.weight = torch.nn.Parameter(weight)
+    conv2d.bias = torch.nn.Parameter(bias)
+    return conv2d
+
+
+@classmethod
+def load_conv2d_no_bias(
+    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
+):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    with init_empty_weights():
+        conv2d = cls(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+        )
+
+    conv2d.weight = torch.nn.Parameter(weight)
+    conv2d.bias = None
+    return conv2d
+
+
+torch.nn.Conv2d.load = load_conv2d
+torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py
new file mode 100644
index 0000000000..fd22b5c679
--- /dev/null
+++ b/server/text_generation_server/layers/eetq.py
@@ -0,0 +1,25 @@
+import torch
+from EETQ import quant_weights, w8_a16_gemm
+
+
+class EETQLinear(torch.nn.Module):
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        device = weight.device
+        if weight.dtype != torch.float16:
+            weight = weight.to(dtype=torch.float16)
+        weight = torch.t(weight).contiguous().cpu()
+        weight, scale = quant_weights(weight, torch.int8, False)
+
+        self.weight = weight.cuda(device)
+        self.scale = scale.cuda(device)
+        self.bias = bias.cuda(device) if bias is not None else None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        output = w8_a16_gemm(input, self.weight, self.scale)
+        output = output + self.bias if self.bias is not None else output
+        return output
diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py
new file mode 100644
index 0000000000..dd61d08190
--- /dev/null
+++ b/server/text_generation_server/layers/fp8.py
@@ -0,0 +1,43 @@
+import torch
+
+
+def fp8_quantize(weight, qdtype=torch.float8_e4m3fn):
+    device = weight.device
+    # weight, scale = quant_weights(weight, torch.int8, False)
+    finfo = torch.finfo(qdtype)
+    # Calculate the scale as dtype max divided by absmax
+    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
+    # scale and clamp the tensor to bring it to
+    # the representative range of float8 data type
+    # (as default cast is unsaturated)
+    qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max)
+    # Return both float8 data and the inverse scale (as float),
+    # as both required as inputs to torch._scaled_mm
+    qweight = qweight.to(qdtype)
+    scale = scale.float().reciprocal()
+    return qweight, scale
+
+
+class Fp8Linear(torch.nn.Module):
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        self.dtype = weight.dtype
+        self.qweight, self.scale = fp8_quantize(weight)
+
+        self.bias = bias if bias is not None else None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        qinput, scale = fp8_quantize(input)
+        output, _ = torch._scaled_mm(
+            qinput,
+            self.qweight.t(),
+            out_dtype=self.dtype,
+            scale_a=scale,
+            scale_b=self.scale,
+            bias=self.bias,
+        )
+        return output
diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py
new file mode 100644
index 0000000000..1c46f49322
--- /dev/null
+++ b/server/text_generation_server/layers/gptq/__init__.py
@@ -0,0 +1,39 @@
+import os
+import torch
+from text_generation_server.utils.import_utils import (
+    SYSTEM,
+)
+
+try:
+    major, _minor = torch.cuda.get_device_capability()
+except Exception:
+    major = 1
+
+HAS_EXLLAMA = False
+CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
+V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
+if os.getenv("DISABLE_EXLLAMA") == "True":
+    HAS_EXLLAMA = False
+elif CAN_EXLLAMA:
+    try:
+        if V2:
+            from text_generation_server.layers.gptq.exllamav2 import (
+                QuantLinear as ExllamaQuantLinear,
+                create_exllama_buffers,
+                set_device,
+            )
+
+            HAS_EXLLAMA = "2"
+        else:
+            from text_generation_server.layers.gptq.exllama import (
+                Ex4bitLinear as ExllamaQuantLinear,
+                create_exllama_buffers,
+                set_device,
+            )
+
+            HAS_EXLLAMA = "1"
+
+    except ImportError:
+        pass
+
+from text_generation_server.layers.gptq.quant_linear import QuantLinear
diff --git a/server/text_generation_server/utils/gptq/custom_autotune.py b/server/text_generation_server/layers/gptq/custom_autotune.py
similarity index 100%
rename from server/text_generation_server/utils/gptq/custom_autotune.py
rename to server/text_generation_server/layers/gptq/custom_autotune.py
diff --git a/server/text_generation_server/utils/gptq/exllama.py b/server/text_generation_server/layers/gptq/exllama.py
similarity index 100%
rename from server/text_generation_server/utils/gptq/exllama.py
rename to server/text_generation_server/layers/gptq/exllama.py
diff --git a/server/text_generation_server/utils/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py
similarity index 96%
rename from server/text_generation_server/utils/gptq/exllamav2.py
rename to server/text_generation_server/layers/gptq/exllamav2.py
index 80836a9597..321ced97f7 100644
--- a/server/text_generation_server/utils/gptq/exllamav2.py
+++ b/server/text_generation_server/layers/gptq/exllamav2.py
@@ -119,6 +119,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
                 none_tensor,
                 temp_dq,
             )
+    else:
+        RuntimeError("Cannot create handle")
 
 
 DEVICE = None
diff --git a/server/text_generation_server/layers/gptq/quant_linear.py b/server/text_generation_server/layers/gptq/quant_linear.py
new file mode 100644
index 0000000000..f60758b614
--- /dev/null
+++ b/server/text_generation_server/layers/gptq/quant_linear.py
@@ -0,0 +1,356 @@
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.cuda.amp import custom_fwd
+
+import triton
+import triton.language as tl
+from . import custom_autotune
+
+
+# code based https://github.com/fpgaminer/GPTQ-triton
+@custom_autotune.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=2,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 32,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=2,
+            num_warps=4,
+        ),
+    ],
+    key=["M", "N", "K"],
+    nearest_power_of_two=True,
+    prune_configs_by={
+        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
+        "perf_model": None,
+        "top_k": None,
+    },
+)
+@triton.jit
+def matmul_248_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    scales_ptr,
+    zeros_ptr,
+    g_ptr,
+    M,
+    N,
+    K,
+    bits,
+    maxq,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_scales,
+    stride_zeros,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """
+    Compute the matrix multiplication C = A x B.
+    A is of shape (M, K) float16
+    B is of shape (K//8, N) int32
+    C is of shape (M, N) float16
+    scales is of shape (G, N) float16
+    zeros is of shape (G, N) float16
+    g_ptr is of shape (K) int32
+    """
+    infearure_per_bits = 32 // bits
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (
+        offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
+    )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+    a_mask = offs_am[:, None] < M
+    # b_ptrs is set up such that it repeats elements along the K axis 8 times
+    b_ptrs = b_ptr + (
+        (offs_k[:, None] // infearure_per_bits) * stride_bk
+        + offs_bn[None, :] * stride_bn
+    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
+    g_ptrs = g_ptr + offs_k
+    # shifter is used to extract the N bits of each element in the 32-bit word from B
+    scales_ptrs = scales_ptr + offs_bn[None, :]
+    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
+
+    shifter = (offs_k % infearure_per_bits) * bits
+    zeros_shifter = (offs_bn % infearure_per_bits) * bits
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    for k in range(0, num_pid_k):
+        g_idx = tl.load(g_ptrs)
+
+        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
+        scales = tl.load(
+            scales_ptrs + g_idx[:, None] * stride_scales
+        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+        zeros = tl.load(
+            zeros_ptrs + g_idx[:, None] * stride_zeros
+        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+
+        zeros = (zeros >> zeros_shifter[None, :]) & maxq
+        zeros = (zeros + 1) & maxq  # eventually avoid overflow
+
+        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
+
+        # Now we need to unpack b (which is N-bit values) into 32-bit values
+        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
+        b = (b - zeros) * scales  # Scale and shift
+
+        accumulator += tl.dot(a, b)
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
+        g_ptrs += BLOCK_SIZE_K
+
+    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
+    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
+    with torch.cuda.device(input.device):
+        output = torch.empty(
+            (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
+        )
+        grid = lambda META: (
+            triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
+            * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]),
+        )
+        matmul_248_kernel[grid](
+            input,
+            qweight,
+            output,
+            scales,
+            qzeros,
+            g_idx,
+            input.shape[0],
+            qweight.shape[1],
+            input.shape[1],
+            bits,
+            maxq,
+            input.stride(0),
+            input.stride(1),
+            qweight.stride(0),
+            qweight.stride(1),
+            output.stride(0),
+            output.stride(1),
+            scales.stride(0),
+            qzeros.stride(0),
+        )
+        return output
+
+
+class QuantLinearFunction(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
+    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
+        output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
+        return output
+
+
+class QuantLinear(nn.Module):
+    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
+        super().__init__()
+        self.register_buffer("qweight", qweight)
+        self.register_buffer("qzeros", qzeros)
+        self.register_buffer("scales", scales)
+        self.register_buffer("g_idx", g_idx)
+        if bias is not None:
+            self.register_buffer("bias", bias)
+        else:
+            self.bias = None
+        if bits not in [2, 4, 8]:
+            raise NotImplementedError("Only 2,4,8 bits are supported.")
+        self.bits = bits
+        self.maxq = 2**self.bits - 1
+        self.groupsize = groupsize
+
+        self.outfeatures = qweight.shape[1]
+        self.infeatures = qweight.shape[0] * 32 // bits
+
+    @classmethod
+    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
+        if bits not in [2, 4, 8]:
+            raise NotImplementedError("Only 2,4,8 bits are supported.")
+
+        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
+        qzeros = torch.zeros(
+            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
+            dtype=torch.int32,
+        )
+        scales = torch.zeros(
+            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
+        )
+        g_idx = torch.tensor(
+            [i // groupsize for i in range(infeatures)], dtype=torch.int32
+        )
+        if bias:
+            bias = torch.zeros((outfeatures), dtype=torch.float16)
+        else:
+            bias = None
+        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
+
+    def pack(self, linear, scales, zeros, g_idx=None):
+        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
+
+        scales = scales.t().contiguous()
+        zeros = zeros.t().contiguous()
+        scale_zeros = zeros * scales
+        self.scales = scales.clone().half()
+        if linear.bias is not None:
+            self.bias = linear.bias.clone().half()
+
+        intweight = []
+        for idx in range(self.infeatures):
+            intweight.append(
+                torch.round(
+                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
+                    / self.scales[self.g_idx[idx]]
+                ).to(torch.int)[:, None]
+            )
+        intweight = torch.cat(intweight, dim=1)
+        intweight = intweight.t().contiguous()
+        intweight = intweight.numpy().astype(np.uint32)
+        qweight = np.zeros(
+            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
+        )
+        i = 0
+        row = 0
+        while row < qweight.shape[0]:
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qweight[row] |= intweight[j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                row += 1
+            else:
+                raise NotImplementedError("Only 2,4,8 bits are supported.")
+
+        qweight = qweight.astype(np.int32)
+        self.qweight = torch.from_numpy(qweight)
+
+        zeros -= 1
+        zeros = zeros.numpy().astype(np.uint32)
+        qzeros = np.zeros(
+            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
+        )
+        i = 0
+        col = 0
+        while col < qzeros.shape[1]:
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                col += 1
+            else:
+                raise NotImplementedError("Only 2,4,8 bits are supported.")
+
+        qzeros = qzeros.astype(np.int32)
+        self.qzeros = torch.from_numpy(qzeros)
+
+    def forward(self, x):
+        out_shape = x.shape[:-1] + (self.outfeatures,)
+        out = QuantLinearFunction.apply(
+            x.reshape(-1, x.shape[-1]),
+            self.qweight,
+            self.scales,
+            self.qzeros,
+            self.g_idx,
+            self.bits,
+            self.maxq,
+        )
+        out = out + self.bias if self.bias is not None else out
+        return out.reshape(out_shape)
diff --git a/server/text_generation_server/utils/gptq/quantize.py b/server/text_generation_server/layers/gptq/quantize.py
similarity index 100%
rename from server/text_generation_server/utils/gptq/quantize.py
rename to server/text_generation_server/layers/gptq/quantize.py
diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py
new file mode 100644
index 0000000000..15d24e80ee
--- /dev/null
+++ b/server/text_generation_server/layers/layernorm.py
@@ -0,0 +1,185 @@
+import torch
+from torch import nn
+from accelerate import init_empty_weights
+from text_generation_server.utils.import_utils import (
+    SYSTEM,
+)
+
+
+# Monkey patching
+@classmethod
+def load_layer_norm(cls, prefix, weights, eps):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    bias = weights.get_tensor(f"{prefix}.bias")
+    with init_empty_weights():
+        ln = cls(weight.shape, eps=eps)
+
+    ln.weight = torch.nn.Parameter(weight)
+    ln.bias = torch.nn.Parameter(bias)
+    return ln
+
+
+@classmethod
+def load_layer_norm_no_bias(cls, prefix, weights, eps):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    with init_empty_weights():
+        ln = cls(weight.shape, eps=eps)
+
+    ln.weight = torch.nn.Parameter(weight)
+    ln.bias = None
+    return ln
+
+
+torch.nn.LayerNorm.load = load_layer_norm
+torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
+
+if SYSTEM == "cuda":
+    import dropout_layer_norm
+
+    class FastLayerNorm(nn.LayerNorm):
+        def forward(self, hidden_states, residual=None):
+            if hidden_states.shape[-1] > 8192:
+                if residual is not None:
+                    hidden_states += residual
+                residual = hidden_states
+
+                return super(FastLayerNorm, self).forward(hidden_states), residual
+            else:
+                (
+                    normed_hidden_states,
+                    residual,
+                    *rest,
+                ) = dropout_layer_norm.dropout_add_ln_fwd(
+                    hidden_states,
+                    residual,
+                    self.weight,
+                    self.bias,
+                    None,
+                    None,
+                    None,
+                    None,
+                    0.0,
+                    self.eps,
+                    1.0,
+                    0,
+                    None,
+                    False,
+                    False,
+                )
+                if residual is None:
+                    residual = hidden_states
+
+                return normed_hidden_states, residual
+
+elif SYSTEM == "rocm":
+    from vllm import layernorm_ops
+
+    class FastLayerNorm(nn.LayerNorm):
+        def forward(self, hidden_states, residual=None):
+            if residual is not None:
+                hidden_states += residual
+            residual = hidden_states
+
+            return super().forward(hidden_states), residual
+
+elif SYSTEM == "xpu":
+    import intel_extension_for_pytorch as ipex
+
+    class FastLayerNorm(nn.LayerNorm):
+        def forward(self, hidden_states, residual=None):
+            res_out = hidden_states
+            out = ipex.llm.functional.add_layer_norm(
+                residual, hidden_states, self.weight, self.bias, self.eps, True
+            )
+            if residual is not None:
+                res_out = residual
+            return out, res_out
+
+
+class FastRMSNorm(nn.Module):
+    def __init__(self, weight: torch.Tensor, eps: float):
+        super().__init__()
+
+        self.weight = nn.Parameter(weight)
+        self.variance_epsilon = eps
+
+    @classmethod
+    def load(cls, prefix, weights, eps=1e-6):
+        weight = weights.get_tensor(f"{prefix}.weight")
+        return cls(weight, eps)
+
+    def forward(self, hidden_states, residual=None):
+        if SYSTEM == "xpu":
+            residual_out = hidden_states
+            out = ipex.llm.functional.add_rms_norm(
+                residual,
+                hidden_states,
+                self.weight,
+                None,
+                self.variance_epsilon,
+                True,
+            )
+            if residual is not None:
+                residual_out = residual
+            return out, residual_out
+        elif hidden_states.shape[-1] > 8192:
+            if residual is not None:
+                hidden_states += residual
+            residual = hidden_states
+
+            hidden_states = hidden_states.to(torch.float32)
+            variance = hidden_states.pow(2).mean(-1, keepdim=True)
+            hidden_states = hidden_states * torch.rsqrt(
+                variance + self.variance_epsilon
+            )
+
+            # convert into half-precision if necessary
+            if self.weight.dtype in [torch.float16, torch.bfloat16]:
+                hidden_states = hidden_states.to(self.weight.dtype)
+
+            return self.weight * hidden_states, residual
+        elif SYSTEM == "cuda":
+            # faster post attention rms norm
+            (
+                normed_hidden_states,
+                res,
+                *rest,
+            ) = dropout_layer_norm.dropout_add_ln_fwd(
+                hidden_states,
+                residual,
+                self.weight,
+                None,
+                None,
+                None,
+                None,
+                None,
+                0.0,
+                self.variance_epsilon,
+                1.0,
+                0,
+                None,
+                False,
+                True,  # Activate RMSNorm
+            )
+            if res is None:
+                res = hidden_states
+
+            return normed_hidden_states, res
+        elif SYSTEM == "rocm":
+            # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
+            if residual is not None:
+                hidden_states += residual
+            residual = hidden_states
+
+            out = torch.empty_like(hidden_states)
+            layernorm_ops.rms_norm(
+                out,
+                hidden_states,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return out, residual
+        else:
+            raise ValueError(
+                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py
new file mode 100644
index 0000000000..d137a500df
--- /dev/null
+++ b/server/text_generation_server/layers/linear.py
@@ -0,0 +1,153 @@
+import torch
+from torch.nn import functional as F
+from text_generation_server.utils.import_utils import SYSTEM
+
+
+class FastLinear(torch.nn.Module):
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        self.weight = torch.nn.Parameter(weight)
+        if bias is not None:
+            self.bias = torch.nn.Parameter(bias)
+        else:
+            self.bias = None
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        weight = weights.get_tensor(f"{prefix}.weight")
+        if bias:
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(weight, bias)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.linear(input, self.weight, self.bias)
+
+
+def get_linear(weight, bias, quantize):
+    if quantize is None:
+        linear = FastLinear(weight, bias)
+    elif quantize == "eetq":
+        try:
+            from text_generation_server.layers.eetq import EETQLinear
+
+            linear = EETQLinear(weight, bias)
+        except ImportError:
+            raise ImportError(
+                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
+            )
+    elif quantize == "fp8":
+        from text_generation_server.layers.fp8 import Fp8Linear
+
+        linear = Fp8Linear(weight, bias)
+    elif quantize == "bitsandbytes":
+        try:
+            from text_generation_server.layers.bnb import (
+                warn_deprecate_bnb,
+                Linear8bitLt,
+            )
+        except ImportError:
+            raise NotImplementedError(
+                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
+            )
+        warn_deprecate_bnb()
+        linear = Linear8bitLt(
+            weight,
+            bias,
+            has_fp16_weights=False,
+            threshold=6.0,
+        )
+        if bias is not None:
+            linear.bias = nn.Parameter(bias)
+    elif quantize == "bitsandbytes-fp4":
+        try:
+            from text_generation_server.layers.bnb import Linear4bit
+        except ImportError:
+            raise NotImplementedError(
+                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
+            )
+        linear = Linear4bit(
+            weight,
+            bias,
+            quant_type="fp4",
+        )
+    elif quantize == "bitsandbytes-nf4":
+        try:
+            from text_generation_server.layers.bnb import Linear4bit
+        except ImportError:
+            raise NotImplementedError(
+                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
+            )
+        linear = Linear4bit(
+            weight,
+            bias,
+            quant_type="nf4",
+        )
+    elif quantize == "gptq":
+        try:
+            qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight
+        except Exception:
+            raise NotImplementedError(
+                f"The passed weight is not `gptq` compatible, loader needs to be updated."
+            )
+
+        if use_exllama:
+            try:
+                from text_generation_server.layers.gptq import (
+                    ExllamaQuantLinear,
+                )
+            except ImportError:
+                raise NotImplementedError(
+                    f"Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`"
+                )
+
+            linear = ExllamaQuantLinear(
+                qweight, qzeros, scales, g_idx, bias, bits, groupsize
+            )
+        else:
+            from text_generation_server.layers.gptq.quant_linear import QuantLinear
+
+            linear = QuantLinear(
+                qweight,
+                qzeros,
+                scales,
+                g_idx,
+                bias,
+                bits,
+                groupsize,
+            )
+    elif quantize == "awq":
+        try:
+            qweight, qzeros, scales, _, bits, groupsize, _ = weight
+        except Exception:
+            raise NotImplementedError(
+                f"The passed weight is not `awq` compatible, loader needs to be updated."
+            )
+        if SYSTEM == "rocm":
+            raise NotImplementedError(
+                "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
+                "to use Exllama/GPTQ kernels for AWQ inference."
+            )
+        try:
+            from text_generation_server.layers.awq.quantize.qmodule import WQLinear
+
+            linear = WQLinear(
+                w_bit=bits,
+                group_size=groupsize,
+                qweight=qweight,
+                qzeros=qzeros,
+                scales=scales,
+                bias=bias is not None,
+            )
+        except ImportError:
+            raise NotImplementedError(
+                "You do not seem to have awq installed, either install it (cd server &&  make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
+            )
+    else:
+        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
+    return linear
diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py
new file mode 100644
index 0000000000..4ac869785a
--- /dev/null
+++ b/server/text_generation_server/layers/medusa.py
@@ -0,0 +1,186 @@
+import torch
+from torch import nn
+from typing import Tuple, Optional
+from text_generation_server.utils.speculate import get_speculate
+from text_generation_server.layers.linear import FastLinear
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelHead,
+    TensorParallelColumnLinear,
+)
+
+
+class ResBlock(torch.nn.Module):
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        self.linear = FastLinear.load(
+            config, prefix=f"{prefix}.linear", weights=weights, bias=True
+        )
+        self.act = torch.nn.SiLU()
+
+    def forward(self, x):
+        return x + self.act(self.linear(x))
+
+
+class MedusaModel(torch.nn.Module):
+    def __init__(self, config, medusa_config, weights):
+        super().__init__()
+        self.heads = torch.nn.ModuleList(
+            [
+                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
+                for i in range(get_speculate())
+            ]
+        )
+
+    def forward(self, x):
+        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
+        return speculative_logits
+
+
+class MedusaHead(torch.nn.Module):
+    def __init__(self, config, medusa_config, prefix, weights):
+        super().__init__()
+        self.blocks = torch.nn.ModuleList(
+            [
+                ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
+                for i in range(medusa_config["medusa_num_layers"])
+            ]
+        )
+        n = len(self.blocks)
+        self.out = FastLinear.load(
+            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
+        )
+
+    def forward(self, x):
+        for block in self.blocks:
+            x = block(x)
+        x = self.out(x)
+        return x
+
+
+class MedusaHeadV1(nn.Module):
+    def __init__(self, lm_head, medusa):
+        super().__init__()
+        self.lm_head = lm_head
+        self.medusa = medusa
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        from pathlib import Path
+        from safetensors import safe_open
+        import json
+
+        use_medusa = config.use_medusa
+
+        medusa_config = str(Path(use_medusa) / "config.json")
+        filename = str(Path(use_medusa) / "medusa_lm_head.safetensors")
+
+        with open(medusa_config, "r") as f:
+            medusa_config = json.load(f)
+        routing = weights.routing
+        with safe_open(filename, framework="pytorch") as f:
+            for k in f.keys():
+                if k in routing and routing[k] != filename:
+                    raise RuntimeError(
+                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
+                    )
+                routing[k] = filename
+
+        medusa = MedusaModel(config, medusa_config, weights)
+        lm_head = TensorParallelHead.load(config, prefix, weights)
+        return MedusaHeadV1(lm_head, medusa)
+
+    def forward(
+        self, input: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        if input.shape[0] > 128:
+            return logits, None
+
+        speculative_logits = self.medusa(input)
+        return logits, speculative_logits
+
+
+class MedusaHeadV2(nn.Module):
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        from pathlib import Path
+        from safetensors import safe_open
+        import json
+
+        use_medusa = config.use_medusa
+
+        medusa_config = str(Path(use_medusa) / "config.json")
+        filename = str(Path(use_medusa) / "medusa_lm_head.safetensors")
+
+        with open(medusa_config, "r") as f:
+            medusa_config = json.load(f)
+        routing = weights.routing
+        with safe_open(filename, framework="pytorch") as f:
+            for k in f.keys():
+                if k in routing and routing[k] != filename:
+                    raise RuntimeError(
+                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
+                    )
+                routing[k] = filename
+
+        self.n_medusa_heads = get_speculate()
+
+        assert medusa_config["medusa_num_layers"] == 1
+        self.linear = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)],
+            dim=0,
+            weights=weights,
+            bias=True,
+        )
+        self.process_group = weights.process_group
+        self.world_size = self.process_group.size()
+        self.rank = self.process_group.rank()
+
+        self.act = torch.nn.SiLU()
+
+        self.lm_head = TensorParallelHead.load(config, prefix, weights)
+
+    def forward(self, x):
+        # If we have too many tokens, we skip speculative logits
+        if x.shape[0] > 128:
+            logits = self.lm_head(x)
+            return logits, None
+
+        size = x.shape[-1]
+        block_size = (size + self.world_size - 1) // self.world_size
+        start = self.rank * block_size
+        stop = (self.rank + 1) * block_size
+
+        x_block = x[:, start:stop]
+
+        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
+        medusa_res = self.act(self.linear(x)).reshape(
+            *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1]
+        )
+
+        # Apply all residual medusa heads
+        output = x[:, start:stop].unsqueeze(-2) + medusa_res
+
+        # Gather medusa heads
+        world_output = [
+            torch.empty_like(output) for _ in range(self.process_group.size())
+        ]
+        torch.distributed.all_gather(world_output, output, group=self.process_group)
+        world_output = torch.cat(world_output, dim=-1)
+
+        # Stack x and medusa residual x
+        stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2)
+
+        # Compute lm head on x + medusa residual x
+        logits = self.lm_head(stacked_x)
+
+        # Finally, split logits from speculative logits
+        logits, speculative_logits = torch.split(
+            logits, [1, self.n_medusa_heads], dim=-2
+        )
+        # Squeeze added dimension
+        logits = logits.squeeze(-2)
+
+        return logits, speculative_logits
diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py
new file mode 100644
index 0000000000..503dd5543b
--- /dev/null
+++ b/server/text_generation_server/layers/rotary.py
@@ -0,0 +1,419 @@
+import os
+import torch
+from torch import nn
+
+from text_generation_server.utils.import_utils import SYSTEM
+
+if SYSTEM == "cuda":
+    from flash_attn.layers.rotary import RotaryEmbedding
+    import rotary_emb
+elif SYSTEM == "rocm":
+    from vllm import pos_encoding_ops
+
+
+def _create_inv_freq(dim, base, device):
+    inv_freq = 1.0 / (
+        base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
+    )
+    return inv_freq
+
+
+def _get_rope_config(config):
+    if os.getenv("ROPE_SCALING", None) is not None:
+        rope_scaling = {
+            "type": os.environ["ROPE_SCALING"],
+            "factor": float(os.environ["ROPE_FACTOR"]),
+        }
+        return rope_scaling
+    return getattr(config, "rope_scaling", None)
+
+
+class PositionRotaryEmbedding(nn.Module):
+    def __init__(self, inv_freq, scaling_factor):
+        super().__init__()
+        self.inv_freq = inv_freq
+        self._seq_len_cached = 0
+        self._cos_cached = None
+        self._sin_cached = None
+        self._cos_k_cached = None
+        self._sin_k_cached = None
+        self.scaling_factor = scaling_factor
+        self.dynamic_args = None
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ):
+        # Such controlflows may add some overhead.
+        if SYSTEM == "cuda":
+            rotary_dim = cos.shape[-1]
+            q1 = query[..., :rotary_dim]
+            q2 = query[..., rotary_dim : 2 * rotary_dim]
+
+            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
+
+            k1 = key[..., :rotary_dim]
+            k2 = key[..., rotary_dim : 2 * rotary_dim]
+
+            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
+        elif SYSTEM == "rocm":
+            # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
+            # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
+
+            head_size = query.shape[-1]
+
+            # Inplace operation, updating query and key.
+            pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True)
+        elif SYSTEM == "xpu":
+            ipex.llm.functional.rotary_embedding(
+                query, key, sin, cos, query.size(-1), True
+            )
+        else:
+            raise ValueError(
+                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
+
+    @classmethod
+    def static(cls, config, dim, base, device):
+        inv_freq = _create_inv_freq(dim, base, device)
+        scaling_factor = None
+        rope_scaling = _get_rope_config(config)
+        if rope_scaling is not None:
+            if rope_scaling["type"] == "linear":
+                pass
+            elif rope_scaling["type"] == "dynamic":
+                scaling_factor = rope_scaling["factor"]
+                return DynamicPositionRotaryEmbedding(
+                    dim=dim,
+                    max_position_embeddings=config.max_position_embeddings,
+                    base=base,
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                )
+            elif rope_scaling["type"] == "yarn":
+                scaling_factor = rope_scaling["factor"]
+                return YarnPositionRotaryEmbedding(
+                    dim=2 * inv_freq.shape[0],
+                    max_position_embeddings=rope_scaling[
+                        "original_max_position_embeddings"
+                    ],
+                    base=10000.0,
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                    extrapolation_factor=1,
+                    attn_factor=1,
+                    beta_fast=32,
+                    beta_slow=1,
+                )
+            elif rope_scaling["type"] == "su":
+                short_factor = torch.tensor(
+                    rope_scaling["short_factor"], dtype=torch.float32, device=device
+                )
+                short_inv_freq = 1.0 / (
+                    short_factor
+                    * base
+                    ** (
+                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
+                        / dim
+                    )
+                )
+                long_factor = torch.tensor(
+                    rope_scaling["long_factor"], dtype=torch.float32, device=device
+                )
+                long_inv_freq = 1.0 / (
+                    long_factor
+                    * base
+                    ** (
+                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
+                        / dim
+                    )
+                )
+
+                original_max_position_embeddings = (
+                    config.original_max_position_embeddings
+                )
+                max_position_embeddings = config.max_position_embeddings
+                if max_position_embeddings <= original_max_position_embeddings:
+                    scaling_factor = 1.0
+                else:
+                    scale = max_position_embeddings / original_max_position_embeddings
+                    scaling_factor = math.sqrt(
+                        1 + math.log(scale) / math.log(original_max_position_embeddings)
+                    )
+
+                return SuRotaryEmbedding(
+                    short_inv_freq=short_inv_freq,
+                    long_inv_freq=long_inv_freq,
+                    scaling_factor=scaling_factor,
+                    original_max_position_embeddings=original_max_position_embeddings,
+                )
+            else:
+                raise NotImplementedError(
+                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
+                )
+        return cls(inv_freq, scaling_factor)
+
+    @classmethod
+    def load(cls, config, prefix, weights):
+        # XXX: Always load this in float32 !
+        dtype = weights.dtype
+        weights.dtype = torch.float32
+        inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
+        weights.dtype = dtype
+
+        scaling_factor = None
+        rope_scaling = _get_rope_config(config)
+        if rope_scaling is not None:
+            scaling_factor = rope_scaling["factor"]
+            if rope_scaling["type"] == "linear":
+                pass
+            elif rope_scaling["type"] == "dynamic":
+                return DynamicPositionRotaryEmbedding(
+                    dim=2 * inv_freq.shape[0],
+                    max_position_embeddings=config.max_position_embeddings,
+                    base=10000.0,
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                )
+            elif rope_scaling["type"] == "yarn":
+                return YarnPositionRotaryEmbedding(
+                    dim=2 * inv_freq.shape[0],
+                    max_position_embeddings=rope_scaling[
+                        "original_max_position_embeddings"
+                    ],
+                    base=10000.0,
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                    extrapolation_factor=1,
+                    attn_factor=1,
+                    beta_fast=32,
+                    beta_slow=1,
+                )
+            else:
+                raise NotImplementedError(
+                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
+                )
+        return cls(inv_freq, scaling_factor)
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            self._seq_len_cached = seqlen
+            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+            if self.scaling_factor is not None:
+                t /= self.scaling_factor
+            # Don't do einsum, it converts fp32 to fp16
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
+            self._cos_cached = torch.cos(freqs).to(dtype)
+            self._sin_cached = torch.sin(freqs).to(dtype)
+
+    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
+        """
+        Return cos and sin for the asked position ids
+        """
+        if SYSTEM == "rocm":
+            # For RoCm, we always use float cos/sin to avoid a cast.
+            # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
+            # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
+            dtype = torch.float32
+
+        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
+
+        cos = torch.index_select(self._cos_cached, 0, position_ids)
+        sin = torch.index_select(self._sin_cached, 0, position_ids)
+
+        # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
+        return cos.unsqueeze(1), sin.unsqueeze(1)
+
+
+class SuRotaryEmbedding(PositionRotaryEmbedding):
+    def __init__(
+        self,
+        short_inv_freq,
+        long_inv_freq,
+        scaling_factor,
+        original_max_position_embeddings,
+    ):
+        super(PositionRotaryEmbedding, self).__init__()
+        self.short_inv_freq = short_inv_freq
+        self.long_inv_freq = long_inv_freq
+        self.scaling_factor = scaling_factor
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self._seq_len_cached = 0
+        self._cos_cached = None
+        self._sin_cached = None
+        self._cos_k_cached = None
+        self._sin_k_cached = None
+        self.dynamic_args = None
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            self._seq_len_cached = seqlen
+            if seqlen > self.original_max_position_embeddings:
+                inv_freq = self.long_inv_freq
+            else:
+                inv_freq = self.short_inv_freq
+            t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype)
+            if self.scaling_factor is not None:
+                t /= self.scaling_factor
+            # Don't do einsum, it converts fp32 to fp16
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+            freqs = torch.outer(t, inv_freq.to(device=t.device))
+            self._cos_cached = torch.cos(freqs).to(dtype)
+            self._sin_cached = torch.sin(freqs).to(dtype)
+
+
+class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
+    def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
+        inv_freq = _create_inv_freq(dim, base, device)
+        super().__init__(inv_freq, scaling_factor)
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            if seqlen > self.max_position_embeddings:
+                newbase = self.base * (
+                    (self.scaling_factor * seqlen / self.max_position_embeddings)
+                    - (self.scaling_factor - 1)
+                ) ** (self.dim / (self.dim - 2))
+                self.inv_freq = _create_inv_freq(
+                    self.dim, newbase, self.inv_freq.device
+                )
+            self._seq_len_cached = seqlen
+            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+            # Don't do einsum, it converts fp32 to fp16
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
+            self._cos_cached = torch.cos(freqs).to(dtype)
+            self._sin_cached = torch.sin(freqs).to(dtype)
+
+
+# Inverse dim formula to find dim based on number of rotations
+import math
+
+
+def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
+    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
+        2 * math.log(base)
+    )
+
+
+# Find dim range bounds based on rotations
+def find_correction_range(
+    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
+):
+    low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
+    high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+    return max(low, 0), min(high, dim - 1)  # Clamp values just in case
+
+
+def linear_ramp_mask(min, max, dim):
+    if min == max:
+        max += 0.001  # Prevent singularity
+
+    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+
+
+def get_mscale(scale=1):
+    if scale <= 1:
+        return 1.0
+    return 0.1 * math.log(scale) + 1.0
+
+
+class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
+    def __init__(
+        self,
+        dim,
+        max_position_embeddings,
+        base,
+        device,
+        scaling_factor,
+        *,
+        extrapolation_factor,
+        attn_factor,
+        beta_fast,
+        beta_slow,
+    ):
+        inv_freq = _create_inv_freq(dim, base, device)
+        super().__init__(inv_freq, scaling_factor)
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.mscale = float(
+            get_mscale(self.scaling_factor) * self.attn_factor
+        )  # Get n-d magnitude scaling corrected for interpolation
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            if seqlen > self.max_position_embeddings:
+                inv_freq_extrapolation = _create_inv_freq(
+                    self.dim, self.base, self.inv_freq.device
+                )
+                freqs = 1.0 / inv_freq_extrapolation
+                inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
+                low, high = find_correction_range(
+                    self.beta_fast,
+                    self.beta_slow,
+                    self.dim,
+                    self.base,
+                    self.max_position_embeddings,
+                )
+                inv_freq_mask = (
+                    1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device)
+                ) * self.extrapolation_factor  # Get n-d rotational scaling corrected for extrapolation
+                inv_freq = (
+                    inv_freq_interpolation * (1 - inv_freq_mask)
+                    + inv_freq_extrapolation * inv_freq_mask
+                )
+
+                self.inv_freq = inv_freq
+                self.mscale = float(
+                    get_mscale(self.scaling_factor) * self.attn_factor
+                )  # Get n-d magnitude scaling corrected for interpolation
+
+            self._seq_len_cached = seqlen
+            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+            # Don't do einsum, it converts fp32 to fp16
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
+            self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
+            self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)
diff --git a/server/text_generation_server/layers/speculative.py b/server/text_generation_server/layers/speculative.py
new file mode 100644
index 0000000000..663f8c2e47
--- /dev/null
+++ b/server/text_generation_server/layers/speculative.py
@@ -0,0 +1,35 @@
+import torch
+from typing import Tuple, Optional
+from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2
+from text_generation_server.layers.tensor_parallel import TensorParallelHead
+
+
+class SpeculativeHead(torch.nn.Module):
+    def __init__(self, lm_head, medusa):
+        super().__init__()
+        self.head = lm_head
+        self.medusa = medusa
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        use_medusa = config.use_medusa
+        if use_medusa:
+            lm_head = None
+            try:
+                medusa = MedusaHeadV1.load(config, prefix, weights)
+            except:
+                medusa = MedusaHeadV2(config, prefix, weights)
+        else:
+            lm_head = TensorParallelHead.load(config, prefix, weights)
+            medusa = None
+        return SpeculativeHead(lm_head, medusa)
+
+    def forward(
+        self, input: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        if self.medusa is not None:
+            return self.medusa(input)
+
+        assert self.head is not None
+        logits = self.head(input)
+        return logits, None
diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py
new file mode 100644
index 0000000000..34b9c51e82
--- /dev/null
+++ b/server/text_generation_server/layers/tensor_parallel.py
@@ -0,0 +1,188 @@
+import torch
+from torch.nn import functional as F
+from typing import List
+from text_generation_server.layers.linear import get_linear, FastLinear
+
+
+class SuperLayer(torch.nn.Module):
+    def __init__(self, linear):
+        super().__init__()
+        self.linear = linear
+
+    def forward(self, x):
+        return self.linear.forward(x)
+
+
+class TensorParallelHead(SuperLayer):
+    def __init__(self, linear, process_group, should_gather: bool):
+        super().__init__(linear)
+        self.process_group = process_group
+        self.should_gather = should_gather
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        if weights.process_group.size() > 1:
+            try:
+                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
+                should_gather = True
+            except AssertionError:
+                # If the vocab size is not divisible by number of shards
+                # just load the entire thing.
+                weight = weights.get_tensor(f"{prefix}.weight")
+                should_gather = False
+        else:
+            weight = weights.get_tensor(f"{prefix}.weight")
+            should_gather = False
+
+        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
+        if config.quantize in ["gptq", "awq", "eetq"]:
+            quantize = None
+        else:
+            quantize = config.quantize
+        return TensorParallelHead(
+            get_linear(weight, bias=None, quantize=quantize),
+            process_group=weights.process_group,
+            should_gather=should_gather,
+        )
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if not self.should_gather:
+            return super().forward(input)
+
+        world_size = self.process_group.size()
+        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
+            out_dim = self.linear.weight.shape[0]
+
+            if input.shape[0] == 1:
+                world_out = input.new_empty(1, out_dim * world_size)
+                local_out = input.new_empty(1, out_dim)
+                gather_input = local_out
+            else:
+                world_out = input.new_empty(out_dim * world_size, input.shape[0])
+                gather_input = input.new_empty(out_dim, input.shape[0])
+                local_out = gather_input.T
+
+            torch.mm(input, self.linear.weight.T, out=local_out)
+
+            torch.distributed.all_gather_into_tensor(
+                world_out, gather_input, group=self.process_group
+            )
+
+            if input.shape[0] == 1:
+                return world_out
+            return world_out.T
+
+        output = super().forward(input)
+        world_output = [
+            torch.empty_like(output) for _ in range(self.process_group.size())
+        ]
+        torch.distributed.all_gather(world_output, output, group=self.process_group)
+        world_output = torch.cat(world_output, dim=-1)
+        return world_output
+
+
+class TensorParallelColumnLinear(SuperLayer):
+    @classmethod
+    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
+        """Specific method when the QKV was joined after the fact"""
+        weight = weights.get_weights_col_packed_gate_up(
+            prefix, quantize=config.quantize
+        )
+        if bias:
+            raise NotImplementedError("packed_gate_up only implemented without bias")
+        else:
+            bias = None
+        linear = get_linear(weight, bias, config.quantize)
+        return cls(linear)
+
+    @classmethod
+    def load_qkv(cls, config, prefix: str, weights, bias: bool):
+        """Specific method when the QKV was joined after the fact"""
+        weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize)
+        if bias:
+            raise NotImplementedError("packed_qkv only implemented for baichuan")
+        else:
+            bias = None
+        linear = get_linear(weight, bias, config.quantize)
+        return cls(linear)
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        return cls.load_multi(config, [prefix], weights, bias, dim=0)
+
+    @classmethod
+    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
+        weight = weights.get_multi_weights_col(
+            prefixes, quantize=config.quantize, dim=dim
+        )
+
+        if bias:
+            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
+            bias = torch.cat(b, dim=dim)
+        else:
+            bias = None
+        linear = get_linear(weight, bias, config.quantize)
+        return cls(linear)
+
+
+class TensorParallelRowLinear(SuperLayer):
+    def __init__(self, linear, process_group):
+        super().__init__(linear)
+        self.process_group = process_group
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
+
+        if bias and weights.process_group.rank() == 0:
+            # Rank is only on the first rank process
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(
+            get_linear(weight, bias, config.quantize),
+            process_group=weights.process_group,
+        )
+
+    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
+        out = super().forward(input)
+        if self.process_group.size() > 1 and reduce:
+            torch.distributed.all_reduce(out, group=self.process_group)
+        return out
+
+
+class TensorParallelEmbedding(torch.nn.Module):
+    def __init__(self, prefix: str, weights, reduce=True):
+        super().__init__()
+        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
+        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
+
+        process_group = weights.process_group
+
+        world_size = process_group.size()
+        rank = process_group.rank()
+
+        block_size = (num_embeddings + world_size - 1) // world_size
+        self.min_id = rank * block_size
+        self.max_id = min(num_embeddings, (rank + 1) * block_size)
+        self.null_idx = weight.shape[
+            0
+        ]  # Usually block_size, might be less in non even vocab_size.
+        self.process_group = weights.process_group
+        self.reduce = reduce
+
+        """Additional 0 entry used for masking"""
+        self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
+        # translate for [0, self.max_id - self.min_id[
+        input = torch.where(
+            (self.min_id > input) | (input >= self.max_id),
+            self.null_idx,
+            input - self.min_id,
+        )
+        out = torch.nn.functional.embedding(input, self.weight)
+        if self.reduce and self.process_group.size() > 1:
+            torch.distributed.all_reduce(out, group=self.process_group)
+        return out
diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py
index 4c65e2dd71..c7705fe868 100644
--- a/server/text_generation_server/models/cache_manager.py
+++ b/server/text_generation_server/models/cache_manager.py
@@ -2,7 +2,7 @@
 import torch
 
 from typing import Optional, List, Tuple
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
 BLOCK_SIZE: int = 16
 # Will be set in warmup
@@ -25,7 +25,7 @@ def __init__(
         self.repeat_slots = repeat_slots
 
         element_size = torch.tensor([], dtype=dtype).element_size()
-        if IS_XPU_SYSTEM:
+        if SYSTEM == "xpu":
             x = 1
         else:
             x = self.block_size // element_size
diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
index c8f02bca0c..0d8a1b590e 100644
--- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@@ -32,7 +32,7 @@
 )
 from transformers import BloomConfig, PreTrainedModel
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
diff --git a/server/text_generation_server/models/custom_modeling/clip.py b/server/text_generation_server/models/custom_modeling/clip.py
index c4917733a4..56618bf16d 100644
--- a/server/text_generation_server/models/custom_modeling/clip.py
+++ b/server/text_generation_server/models/custom_modeling/clip.py
@@ -15,7 +15,7 @@
 )
 from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelEmbedding,
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
index 56d9a96642..8c423eaf05 100644
--- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
@@ -26,18 +26,22 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.import_utils import IS_ROCM_SYSTEM, IS_CUDA_SYSTEM
-from text_generation_server.utils.layers import (
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.layernorm import (
     FastLayerNorm,
 )
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
-if IS_CUDA_SYSTEM:
+if SYSTEM == "cuda":
     import dropout_layer_norm
 else:
     dropout_layer_norm = None
@@ -52,7 +56,7 @@ def forward(
         sin: torch.Tensor,
     ):
         # Such controlflows may add some overhead.
-        if IS_CUDA_SYSTEM:
+        if SYSTEM == "cuda":
             import rotary_emb
 
             q1 = query[..., ::2]
@@ -64,7 +68,7 @@ def forward(
             k2 = key[..., 1::2]
 
             rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
-        elif IS_ROCM_SYSTEM:
+        elif SYSTEM == "rocm":
             from vllm import pos_encoding_ops
 
             # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
@@ -90,7 +94,7 @@ def __init__(self, prefix, weights, eps):
         self.eps = eps
 
     def forward(self, hidden_states):
-        if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
+        if hidden_states.shape[-1] > 8192 or SYSTEM == "rocm":
             hidden_states = hidden_states.reshape(
                 -1, self.weight.shape[0], self.weight.shape[1]
             )
diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
index d0978bef61..9d652b67bf 100644
--- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@@ -21,21 +21,26 @@
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple, Any
 from loguru import logger
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IS_XPU_SYSTEM:
+if SYSTEM != "xpu":
     from vllm.model_executor.layers.fused_moe import fused_moe
+
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     FastLinear,
-    FastLayerNorm,
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
 from text_generation_server.utils.log import log_once
 
 
@@ -216,7 +221,7 @@ def _load_gqa(config, prefix: str, weights):
 
         bits, groupsize, desc_act, quant_method = weights._get_gptq_params()
 
-        from text_generation_server.utils.layers import HAS_EXLLAMA
+        from text_generation_server.layers import HAS_EXLLAMA
 
         use_exllama = (
             bits == 4 and HAS_EXLLAMA and config.quantize == "gptq" and not desc_act
@@ -236,7 +241,7 @@ def _load_gqa(config, prefix: str, weights):
             log_once(
                 logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
             )
-            from text_generation_server.utils.awq.conversion_utils import (
+            from text_generation_server.layers.awq.conveersion_utils import (
                 fast_awq_to_gptq,
             )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
index bd7596db01..43b90bdd85 100644
--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -27,13 +27,15 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 6fa85d4e6c..a7969494c2 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -27,13 +27,15 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
index c2445cdad7..3e13c26d58 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -27,13 +27,15 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index 3f6c8e036a..be2d6c451e 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -24,9 +24,9 @@
 import numpy as np
 
 from torch import nn
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IS_XPU_SYSTEM:
+if SYSTEM != "xpu":
     from vllm.model_executor.layers.fused_moe import fused_moe
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
@@ -34,16 +34,20 @@
 from loguru import logger
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     FastLinear,
-    FastRMSNorm,
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
 
 class MixtralConfig(PretrainedConfig):
diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
index ee062d3ded..d45cab2e05 100644
--- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
@@ -29,14 +29,18 @@
 
 from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.layernorm import (
     FastLayerNorm,
+)
+from text_generation_server.layers.rotary import (
     PositionRotaryEmbedding,
-    get_linear,
 )
 
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
index cfe447a781..f2efb53867 100644
--- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
@@ -7,15 +7,19 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.layernorm import (
     FastLayerNorm,
 )
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
 
 class PhiConfig(PretrainedConfig):
diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
index 94023b33cf..3a6d2db524 100644
--- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -6,13 +6,15 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
index 205fa39317..7d252a0e34 100644
--- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
@@ -6,16 +6,21 @@
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_utils import PreTrainedModel
 
-from text_generation_server.utils import flash_attn, paged_attention
-from text_generation_server.utils.layers import (
-    FastLayerNorm,
-    PositionRotaryEmbedding,
-    SpeculativeHead,
+from text_generation_server.utils import paged_attention, flash_attn
+from text_generation_server.utils.flash_attn import attention
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    TensorParallelRowLinear,
+    SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
 
 def load_row(config, prefix: str, weights, bias: bool):
diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
index bbb603a70b..d2f6d9aff7 100644
--- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
@@ -6,14 +6,16 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     SpeculativeHead,
     TensorParallelEmbedding,
-    FastLayerNorm,
     get_linear,
 )
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
 
 
 def load_multi_mqa(
@@ -80,13 +82,13 @@ def _load_multi_mqa_gptq(
             g_idx = g_idx.to(device=weights.device)
         elif quant_method == "awq":
             g_idx = None
-            from text_generation_server.utils.awq.conversion_utils import (
+            from text_generation_server.layers.awq.conversion_utils import (
                 fast_awq_to_gptq,
             )
 
             qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
 
-        from text_generation_server.utils.layers import HAS_EXLLAMA
+        from text_generation_server.layers.gptq import HAS_EXLLAMA
 
         use_exllama = HAS_EXLLAMA
         weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
index ed77af787f..3e2ce4f97a 100644
--- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
@@ -27,15 +27,19 @@
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
-    FastRMSNorm,
+)
+from text_generation_server.layers.layernorm import (
     FastLayerNorm,
+    FastRMSNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
 )
 
 
diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py
index cb2ee7db33..935f049b0b 100644
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@@ -29,7 +29,7 @@
 )
 from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index ee4cdb089b..ec3f900ba2 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -47,20 +47,22 @@
 from text_generation_server.models.custom_modeling.idefics_perceiver import (
     IdeficsPerceiverResampler,
 )
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
     SpeculativeHead,
-    PositionRotaryEmbedding,
     FastLinear,
 )
-from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.utils.import_utils import SYSTEM
 
-if IS_CUDA_SYSTEM:
+if SYSTEM == "cuda":
     import dropout_layer_norm
-elif IS_ROCM_SYSTEM:
+elif SYSTEM == "rocm":
     from vllm import layernorm_ops
+else:
+    raise RuntimeError(f"Unsupported system {SYSTEM}")
 
 
 @dataclass
@@ -373,7 +375,7 @@ def forward(self, hidden_states, residual=None):
                 hidden_states = hidden_states.to(self.weight.dtype)
 
             return self.weight * hidden_states
-        elif IS_CUDA_SYSTEM:
+        elif SYSTEM == "cuda":
             # faster post attention rms norm
             unwrap = False
             if len(hidden_states.shape) > 2:
@@ -405,7 +407,7 @@ def forward(self, hidden_states, residual=None):
                 normed_hidden_states = normed_hidden_states.view(*shape)
 
             return normed_hidden_states
-        elif IS_ROCM_SYSTEM:
+        elif SYSTEM == "rocm":
             # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
             if residual is not None:
                 hidden_states += residual
diff --git a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
index 477d4d70e2..af44490b39 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
@@ -41,7 +41,7 @@
 import torch
 import torch.nn as nn
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
 )
diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py
index c521dd0adb..30c5997fe5 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_vision.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py
@@ -28,7 +28,7 @@
     ModelOutput,
     logging,
 )
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
     TensorParallelEmbedding,
diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py
index 0d93791f82..a049f7569d 100644
--- a/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -27,7 +27,7 @@
     load_text_model,
     load_vision_model,
 )
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
 )
diff --git a/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
index c58a617f52..293051c2bf 100644
--- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
@@ -8,12 +8,12 @@
 from transformers.configuration_utils import PretrainedConfig
 import torch.nn.functional as F
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     SpeculativeHead,
     TensorParallelEmbedding,
-    FastRMSNorm,
     FastLinear,
 )
+from text_generation_server.layers.layernorm import FastRMSNorm
 
 from einops import rearrange
 from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
diff --git a/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
index 9b0f8b9271..f7981bf531 100644
--- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
@@ -17,7 +17,7 @@
 )
 from einops import rearrange
 from packaging import version
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelEmbedding,
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py
index f060ec0e42..fcad32fa79 100644
--- a/server/text_generation_server/models/custom_modeling/neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py
@@ -40,7 +40,7 @@
 from transformers.modeling_utils import PreTrainedModel
 from transformers import GPTNeoXConfig
 from loguru import logger
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py
index 7a5cf917db..83d62dea2e 100644
--- a/server/text_generation_server/models/custom_modeling/opt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py
@@ -27,7 +27,7 @@
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers import OPTConfig
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     FastLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
diff --git a/server/text_generation_server/models/custom_modeling/phi_modeling.py b/server/text_generation_server/models/custom_modeling/phi_modeling.py
index 1571f9fd82..04b470eb7d 100644
--- a/server/text_generation_server/models/custom_modeling/phi_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/phi_modeling.py
@@ -9,7 +9,7 @@
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py
index 2773fb15cf..0b899fba1b 100644
--- a/server/text_generation_server/models/custom_modeling/t5_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py
@@ -38,7 +38,7 @@
     is_torch_fx_proxy,
 )
 from transformers import T5Config
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index a6d0204fd5..f567bea94c 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -12,7 +12,6 @@
 from opentelemetry import trace
 from transformers import PreTrainedTokenizerBase
 from typing import Optional, Tuple, List, Type, Dict
-
 from text_generation_server.models import Model
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.utils.speculate import get_speculate
@@ -32,13 +31,14 @@
 from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser
 from text_generation_server.utils.dist import MEMORY_FRACTION
 
-tracer = trace.get_tracer(__name__)
 from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
+    empty_cache,
+    synchronize,
+    get_free_memory,
 )
 
+tracer = trace.get_tracer(__name__)
+
 
 @dataclass
 class FlashCausalLMBatch(Batch):
@@ -757,10 +757,8 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
 
     def warmup(self, batch: FlashCausalLMBatch):
         # The warmup batch is the biggest batch we could ever receive
-        if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
-            torch.cuda.empty_cache()
-        elif IS_XPU_SYSTEM:
-            torch.xpu.empty_cache()
+        empty_cache()
+
         try:
             cache_manager = set_cache_manager(
                 batch.blocks,
@@ -780,10 +778,7 @@ def warmup(self, batch: FlashCausalLMBatch):
                 f"You need to decrease `--max-batch-prefill-tokens`"
             ) from e
 
-        if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
-            torch.cuda.synchronize(self.device)
-        elif IS_XPU_SYSTEM:
-            torch.xpu.synchronize(self.device)
+        synchronize(self.device)
 
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the free memory
@@ -791,20 +786,7 @@ def warmup(self, batch: FlashCausalLMBatch):
         cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
         total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size
 
-        if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
-            total_free_memory, _ = torch.cuda.mem_get_info(self.device)
-            total_gpu_memory = torch.cuda.get_device_properties(
-                self.device
-            ).total_memory
-
-            free_memory = max(
-                0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory
-            )
-        elif IS_XPU_SYSTEM:
-            total_gpu_memory = torch.xpu.get_device_properties(self.device).total_memory
-            free_memory = int(total_gpu_memory * 0.5)
-        else:
-            raise NotImplementedError("FlashModel is only available on GPU")
+        free_memory = get_free_memory(self.device, MEMORY_FRACTION)
 
         num_blocks = (
             # Leave 5% for some wiggle room
diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
index 609a188de1..8ea7071377 100644
--- a/server/text_generation_server/models/flash_llama.py
+++ b/server/text_generation_server/models/flash_llama.py
@@ -18,7 +18,7 @@
 
 tracer = trace.get_tracer(__name__)
 
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
 
 class FlashLlama(FlashCausalLM):
@@ -35,7 +35,7 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif IS_XPU_SYSTEM:
+        elif SYSTEM == "xpu":
             device = torch.device(f"xpu:{rank}")
             dtype = torch.float16 if dtype is None else dtype
         else:
diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py
index 85e9354363..48304ad89a 100644
--- a/server/text_generation_server/models/flash_mistral.py
+++ b/server/text_generation_server/models/flash_mistral.py
@@ -33,7 +33,7 @@
 # Will be set in init
 SLIDING_WINDOW: Optional[int] = None
 SLIDING_WINDOW_BLOCKS: Optional[int] = None
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 
@@ -322,7 +322,7 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif IS_XPU_SYSTEM:
+        elif SYSTEM == "xpu":
             device = torch.device(f"xpu:{rank}")
             dtype = torch.float16 if dtype is None else dtype
         else:
diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py
index f82e27dbe4..1119bdae01 100644
--- a/server/text_generation_server/models/flash_neox.py
+++ b/server/text_generation_server/models/flash_neox.py
@@ -14,7 +14,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -33,7 +33,7 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif IS_XPU_SYSTEM:
+        elif SYSTEM == "xpu":
             device = torch.device(f"xpu:{rank}")
             dtype = torch.float16 if dtype is None else dtype
         else:
diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py
index ccf38a0caa..33298e1a77 100644
--- a/server/text_generation_server/models/flash_rw.py
+++ b/server/text_generation_server/models/flash_rw.py
@@ -15,7 +15,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -34,7 +34,7 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif IS_XPU_SYSTEM:
+        elif SYSTEM == "xpu":
             device = torch.device(f"xpu:{rank}")
             dtype = torch.float16 if dtype is None else dtype
         else:
diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py
index e66f1bf87c..66698a3a3c 100644
--- a/server/text_generation_server/models/flash_santacoder.py
+++ b/server/text_generation_server/models/flash_santacoder.py
@@ -18,7 +18,7 @@
     Weights,
 )
 
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -37,7 +37,7 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif IS_XPU_SYSTEM:
+        elif SYSTEM == "xpu":
             device = torch.device(f"xpu:{rank}")
             dtype = torch.float16 if dtype is None else dtype
         else:
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index 158966e3ce..9d0571a6cc 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -85,7 +85,7 @@ async def Warmup(self, request, context):
                 # When using GPTQ, Exllama kernels need some global kernels
                 # For which we have the finale shapes only after the model has loaded
                 # This will allocate those buffers.
-                from text_generation_server.utils.layers import (
+                from text_generation_server.layers.gptq import (
                     create_exllama_buffers,
                     set_device,
                 )
diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py
index 583a8f91e4..0830656d0f 100644
--- a/server/text_generation_server/utils/flash_attn.py
+++ b/server/text_generation_server/utils/flash_attn.py
@@ -2,13 +2,8 @@
 import torch
 
 from loguru import logger
-import math
 
-from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
-)
+from text_generation_server.utils.import_utils import SYSTEM
 
 if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
     raise ImportError("`USE_FLASH_ATTENTION` is false.")
@@ -16,10 +11,45 @@
 HAS_FLASH_ATTN_V2_CUDA = False
 HAS_FLASH_ATTN_V2_ROCM = False
 
-if IS_XPU_SYSTEM:
+if SYSTEM == "xpu":
     import intel_extension_for_pytorch as ipex
 
-if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+    ):
+        if window_size_left <= 0 and window_size_left != -1:
+            raise ValueError("`window_size_left` must be > 0 or -1")
+
+        if window_size_left != -1:
+            raise ValueError(
+                f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
+            )
+        return ipex.llm.functional.varlen_attention(
+            q,
+            k,
+            v,
+            out,
+            cu_seqlens,
+            cu_seqlens,
+            max_s,
+            max_s,
+            0.0,
+            softmax_scale,
+            False,
+            True,
+            False,
+            None,
+        )
+
+
+if SYSTEM in {"cuda", "rocm"}:
     if not torch.cuda.is_available():
         raise ImportError("CUDA is not available")
 
@@ -35,11 +65,7 @@
         try:
             import flash_attn_2_cuda
         except ImportError:
-            architecture_suffix = ""
-            if IS_CUDA_SYSTEM:
-                architecture_suffix = "-cuda"
-            elif IS_ROCM_SYSTEM:
-                architecture_suffix = "-rocm"
+            architecture_suffix = f"-{SYSTEM}"
             raise ImportError(
                 "Flash Attention V2 is not installed.\n"
                 "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
@@ -50,8 +76,8 @@
                 f"GPU with CUDA capability {major} {minor} is not supported for "
                 "Flash Attention V2"
             )
-        HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM
-        HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM
+        HAS_FLASH_ATTN_V2_CUDA = SYSTEM == "cuda"
+        HAS_FLASH_ATTN_V2_ROCM = SYSTEM == "rocm"
     except ImportError as e:
         try:
             import flash_attn_cuda
@@ -62,11 +88,11 @@
                 "or install flash attention with `cd server && make install install-flash-attention`"
             ) from e
 
-        if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90):
+        if SYSTEM == "cuda" and not (is_sm75 or is_sm8x or is_sm90):
             raise ImportError(
                 f"GPU with CUDA capability {major} {minor} is not supported"
             ) from e
-        elif IS_ROCM_SYSTEM:
+        elif SYSTEM == "rocm":
             for idx in range(torch.cuda.device_count()):
                 if "MI210" not in torch.cuda.get_device_name(
                     idx
@@ -79,42 +105,20 @@
         HAS_FLASH_ATTN = True
 
 
-def attention(
-    q,
-    k,
-    v,
-    out,
-    cu_seqlens,
-    max_s,
-    softmax_scale,
-    window_size_left=-1,
-):
-    if window_size_left <= 0 and window_size_left != -1:
-        raise ValueError("`window_size_left` must be > 0 or -1")
-
-    if IS_XPU_SYSTEM:
-        if window_size_left != -1:
-            raise ValueError(
-                f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
-            )
-        return ipex.llm.functional.varlen_attention(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            False,
-            None,
-        )
-
-    if HAS_FLASH_ATTN_V2_CUDA:
+if HAS_FLASH_ATTN_V2_CUDA:
+
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+    ):
+        if window_size_left <= 0 and window_size_left != -1:
+            raise ValueError("`window_size_left` must be > 0 or -1")
         return flash_attn_2_cuda.varlen_fwd(
             q,
             k,
@@ -136,7 +140,21 @@ def attention(
             False,
             None,
         )
-    elif HAS_FLASH_ATTN_V2_ROCM:
+
+elif HAS_FLASH_ATTN_V2_ROCM:
+
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+    ):
+        if window_size_left <= 0 and window_size_left != -1:
+            raise ValueError("`window_size_left` must be > 0 or -1")
         if window_size_left != -1:
             raise ValueError(
                 f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
@@ -159,7 +177,19 @@ def attention(
             False,
             None,
         )
-    elif HAS_FLASH_ATTN:
+
+elif HAS_FLASH_ATTN:
+
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+    ):
         if window_size_left != -1:
             raise NotImplementedError(
                 "window_size_left is only available with flash attn v2"
@@ -209,4 +239,5 @@ def attention(
             None,
         )
 
+else:
     raise NotImplementedError("flash attention is not installed")
diff --git a/server/text_generation_server/utils/gptq/quant_linear.py b/server/text_generation_server/utils/gptq/quant_linear.py
deleted file mode 100644
index a832f75558..0000000000
--- a/server/text_generation_server/utils/gptq/quant_linear.py
+++ /dev/null
@@ -1,359 +0,0 @@
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.cuda.amp import custom_bwd, custom_fwd
-
-try:
-    import triton
-    import triton.language as tl
-    from . import custom_autotune
-
-    # code based https://github.com/fpgaminer/GPTQ-triton
-    @custom_autotune.autotune(
-        configs=[
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 256,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=2,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 64,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=3,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 32,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 128,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=2,
-                num_warps=4,
-            ),
-        ],
-        key=["M", "N", "K"],
-        nearest_power_of_two=True,
-        prune_configs_by={
-            "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
-            "perf_model": None,
-            "top_k": None,
-        },
-    )
-    @triton.jit
-    def matmul_248_kernel(
-        a_ptr,
-        b_ptr,
-        c_ptr,
-        scales_ptr,
-        zeros_ptr,
-        g_ptr,
-        M,
-        N,
-        K,
-        bits,
-        maxq,
-        stride_am,
-        stride_ak,
-        stride_bk,
-        stride_bn,
-        stride_cm,
-        stride_cn,
-        stride_scales,
-        stride_zeros,
-        BLOCK_SIZE_M: tl.constexpr,
-        BLOCK_SIZE_N: tl.constexpr,
-        BLOCK_SIZE_K: tl.constexpr,
-        GROUP_SIZE_M: tl.constexpr,
-    ):
-        """
-        Compute the matrix multiplication C = A x B.
-        A is of shape (M, K) float16
-        B is of shape (K//8, N) int32
-        C is of shape (M, N) float16
-        scales is of shape (G, N) float16
-        zeros is of shape (G, N) float16
-        g_ptr is of shape (K) int32
-        """
-        infearure_per_bits = 32 // bits
-
-        pid = tl.program_id(axis=0)
-        num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-        num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-        num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-        num_pid_in_group = GROUP_SIZE_M * num_pid_n
-        group_id = pid // num_pid_in_group
-        first_pid_m = group_id * GROUP_SIZE_M
-        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-        pid_m = first_pid_m + (pid % group_size_m)
-        pid_n = (pid % num_pid_in_group) // group_size_m
-
-        offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-        offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-        offs_k = tl.arange(0, BLOCK_SIZE_K)
-        a_ptrs = a_ptr + (
-            offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
-        )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-        a_mask = offs_am[:, None] < M
-        # b_ptrs is set up such that it repeats elements along the K axis 8 times
-        b_ptrs = b_ptr + (
-            (offs_k[:, None] // infearure_per_bits) * stride_bk
-            + offs_bn[None, :] * stride_bn
-        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-        g_ptrs = g_ptr + offs_k
-        # shifter is used to extract the N bits of each element in the 32-bit word from B
-        scales_ptrs = scales_ptr + offs_bn[None, :]
-        zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
-
-        shifter = (offs_k % infearure_per_bits) * bits
-        zeros_shifter = (offs_bn % infearure_per_bits) * bits
-        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-
-        for k in range(0, num_pid_k):
-            g_idx = tl.load(g_ptrs)
-
-            # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-            scales = tl.load(
-                scales_ptrs + g_idx[:, None] * stride_scales
-            )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-            zeros = tl.load(
-                zeros_ptrs + g_idx[:, None] * stride_zeros
-            )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-
-            zeros = (zeros >> zeros_shifter[None, :]) & maxq
-            zeros = (zeros + 1) & maxq  # eventually avoid overflow
-
-            a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-            b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-
-            # Now we need to unpack b (which is N-bit values) into 32-bit values
-            b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
-            b = (b - zeros) * scales  # Scale and shift
-
-            accumulator += tl.dot(a, b)
-            a_ptrs += BLOCK_SIZE_K
-            b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
-            g_ptrs += BLOCK_SIZE_K
-
-        c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
-        c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
-        tl.store(c_ptrs, accumulator, mask=c_mask)
-
-except:
-    print("triton not installed.")
-
-
-def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
-    with torch.cuda.device(input.device):
-        output = torch.empty(
-            (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
-        )
-        grid = lambda META: (
-            triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
-            * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]),
-        )
-        matmul_248_kernel[grid](
-            input,
-            qweight,
-            output,
-            scales,
-            qzeros,
-            g_idx,
-            input.shape[0],
-            qweight.shape[1],
-            input.shape[1],
-            bits,
-            maxq,
-            input.stride(0),
-            input.stride(1),
-            qweight.stride(0),
-            qweight.stride(1),
-            output.stride(0),
-            output.stride(1),
-            scales.stride(0),
-            qzeros.stride(0),
-        )
-        return output
-
-
-class QuantLinearFunction(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
-        output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
-        return output
-
-
-class QuantLinear(nn.Module):
-    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
-        super().__init__()
-        self.register_buffer("qweight", qweight)
-        self.register_buffer("qzeros", qzeros)
-        self.register_buffer("scales", scales)
-        self.register_buffer("g_idx", g_idx)
-        if bias is not None:
-            self.register_buffer("bias", bias)
-        else:
-            self.bias = None
-        if bits not in [2, 4, 8]:
-            raise NotImplementedError("Only 2,4,8 bits are supported.")
-        self.bits = bits
-        self.maxq = 2**self.bits - 1
-        self.groupsize = groupsize
-
-        self.outfeatures = qweight.shape[1]
-        self.infeatures = qweight.shape[0] * 32 // bits
-
-    @classmethod
-    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
-        if bits not in [2, 4, 8]:
-            raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
-        qzeros = torch.zeros(
-            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
-            dtype=torch.int32,
-        )
-        scales = torch.zeros(
-            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
-        )
-        g_idx = torch.tensor(
-            [i // groupsize for i in range(infeatures)], dtype=torch.int32
-        )
-        if bias:
-            bias = torch.zeros((outfeatures), dtype=torch.float16)
-        else:
-            bias = None
-        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
-
-    def pack(self, linear, scales, zeros, g_idx=None):
-        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
-
-        scales = scales.t().contiguous()
-        zeros = zeros.t().contiguous()
-        scale_zeros = zeros * scales
-        self.scales = scales.clone().half()
-        if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-
-        intweight = []
-        for idx in range(self.infeatures):
-            intweight.append(
-                torch.round(
-                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
-                    / self.scales[self.g_idx[idx]]
-                ).to(torch.int)[:, None]
-            )
-        intweight = torch.cat(intweight, dim=1)
-        intweight = intweight.t().contiguous()
-        intweight = intweight.numpy().astype(np.uint32)
-        qweight = np.zeros(
-            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
-        )
-        i = 0
-        row = 0
-        while row < qweight.shape[0]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qweight[row] |= intweight[j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                row += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight)
-
-        zeros -= 1
-        zeros = zeros.numpy().astype(np.uint32)
-        qzeros = np.zeros(
-            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
-        )
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                col += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qzeros = qzeros.astype(np.int32)
-        self.qzeros = torch.from_numpy(qzeros)
-
-    def forward(self, x):
-        out_shape = x.shape[:-1] + (self.outfeatures,)
-        out = QuantLinearFunction.apply(
-            x.reshape(-1, x.shape[-1]),
-            self.qweight,
-            self.scales,
-            self.qzeros,
-            self.g_idx,
-            self.bits,
-            self.maxq,
-        )
-        out = out + self.bias if self.bias is not None else out
-        return out.reshape(out_shape)
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index db205f4d67..f54987eb1f 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -10,6 +10,41 @@ def is_xpu_available():
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
-IS_ROCM_SYSTEM = torch.version.hip is not None
-IS_CUDA_SYSTEM = torch.version.cuda is not None
-IS_XPU_SYSTEM = is_xpu_available()
+def get_cuda_free_memory(device, memory_fraction):
+    total_free_memory, _ = torch.cuda.mem_get_info(device)
+    total_gpu_memory = torch.cuda.get_device_properties(device).total_memory
+    free_memory = max(0, total_free_memory - (1 - memory_fraction) * total_gpu_memory)
+    return free_memory
+
+
+def get_xpu_free_memory(device):
+    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
+    free_memory = int(total_gpu_memory * 0.5)
+    return free_memory
+
+
+SYSTEM = None
+if torch.version.hip is not None:
+    SYSTEM = "rocm"
+    empty_cache = torch.cuda.empty_cache
+    synchronize = torch.cuda.synchronize
+    get_free_memory = get_cuda_free_memory
+elif torch.version.cuda is not None and torch.cuda.is_available():
+    SYSTEM = "cuda"
+    empty_cache = torch.cuda.empty_cache
+    synchronize = torch.cuda.synchronize
+    get_free_memory = get_cuda_free_memory
+elif is_xpu_available():
+    SYSTEM = "xpu"
+    empty_cache = torch.xpu.empty_cache
+    synchronize = torch.xpu.synchronize
+    get_free_memory = get_xpu_free_memory
+else:
+    SYSTEM = "cpu"
+
+    def noop(*args, **kwargs):
+        pass
+
+    empty_cache = noop
+    synchronize = noop
+    get_free_memory = noop
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
deleted file mode 100644
index 7d339fe5a3..0000000000
--- a/server/text_generation_server/utils/layers.py
+++ /dev/null
@@ -1,1374 +0,0 @@
-import os
-import torch
-import torch.distributed
-
-from torch import nn
-from torch.nn import functional as F
-from typing import List, Tuple, Optional
-from loguru import logger
-from functools import lru_cache
-
-from text_generation_server.utils.speculate import get_speculate
-
-HAS_BITS_AND_BYTES = True
-try:
-    import bitsandbytes as bnb
-    from bitsandbytes.nn import Int8Params, Params4bit
-except ImportError:
-    HAS_BITS_AND_BYTES = False
-
-from accelerate import init_empty_weights
-
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
-from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
-)
-
-if IS_XPU_SYSTEM:
-    import intel_extension_for_pytorch as ipex
-
-HAS_AWQ = True
-try:
-    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
-except ImportError:
-    HAS_AWQ = False
-
-try:
-    major, _minor = torch.cuda.get_device_capability()
-except Exception:
-    major = 1
-
-HAS_EXLLAMA = False
-CAN_EXLLAMA = major >= 8 or IS_ROCM_SYSTEM
-V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
-
-if os.getenv("DISABLE_EXLLAMA") == "True":
-    HAS_EXLLAMA = False
-elif CAN_EXLLAMA:
-    try:
-        if V2:
-            from text_generation_server.utils.gptq.exllamav2 import (
-                QuantLinear as ExllamaQuantLinear,
-                create_exllama_buffers,
-                set_device,
-            )
-
-            HAS_EXLLAMA = "2"
-        else:
-            from text_generation_server.utils.gptq.exllama import (
-                Ex4bitLinear as ExllamaQuantLinear,
-                create_exllama_buffers,
-                set_device,
-            )
-
-            HAS_EXLLAMA = "1"
-
-    except ImportError:
-        pass
-
-HAS_EETQ = False
-try:
-    from EETQ import quant_weights, w8_a16_gemm
-
-    HAS_EETQ = True
-except ImportError:
-    pass
-
-
-# Monkey patching
-@classmethod
-def load_layer_norm(cls, prefix, weights, eps):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    bias = weights.get_tensor(f"{prefix}.bias")
-    with init_empty_weights():
-        ln = cls(weight.shape, eps=eps)
-
-    ln.weight = nn.Parameter(weight)
-    ln.bias = nn.Parameter(bias)
-    return ln
-
-
-@classmethod
-def load_layer_norm_no_bias(cls, prefix, weights, eps):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    with init_empty_weights():
-        ln = cls(weight.shape, eps=eps)
-
-    ln.weight = nn.Parameter(weight)
-    ln.bias = None
-    return ln
-
-
-@classmethod
-def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    bias = weights.get_tensor(f"{prefix}.bias")
-    with init_empty_weights():
-        conv2d = cls(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-        )
-
-    conv2d.weight = nn.Parameter(weight)
-    conv2d.bias = nn.Parameter(bias)
-    return conv2d
-
-
-@classmethod
-def load_conv2d_no_bias(
-    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
-):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    with init_empty_weights():
-        conv2d = cls(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-        )
-
-    conv2d.weight = nn.Parameter(weight)
-    conv2d.bias = None
-    return conv2d
-
-
-torch.nn.Conv2d.load = load_conv2d
-torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
-torch.nn.LayerNorm.load = load_layer_norm
-torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
-
-
-class FastLinear(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-    ) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(weight)
-        if bias is not None:
-            self.bias = nn.Parameter(bias)
-        else:
-            self.bias = None
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        weight = weights.get_tensor(f"{prefix}.weight")
-        if bias:
-            bias = weights.get_tensor(f"{prefix}.bias")
-        else:
-            bias = None
-        return cls(weight, bias)
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return F.linear(input, self.weight, self.bias)
-
-
-class EETQLinear(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-    ) -> None:
-        super().__init__()
-        device = weight.device
-        if weight.dtype != torch.float16:
-            weight = weight.to(dtype=torch.float16)
-        weight = torch.t(weight).contiguous().cpu()
-        weight, scale = quant_weights(weight, torch.int8, False)
-
-        self.weight = weight.cuda(device)
-        self.scale = scale.cuda(device)
-        self.bias = bias.cuda(device) if bias is not None else None
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        output = w8_a16_gemm(input, self.weight, self.scale)
-        output = output + self.bias if self.bias is not None else output
-        return output
-
-
-def fp8_quantize(weight, qdtype=torch.float8_e4m3fn):
-    device = weight.device
-    # weight, scale = quant_weights(weight, torch.int8, False)
-    finfo = torch.finfo(qdtype)
-    # Calculate the scale as dtype max divided by absmax
-    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
-    # scale and clamp the tensor to bring it to
-    # the representative range of float8 data type
-    # (as default cast is unsaturated)
-    qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max)
-    # Return both float8 data and the inverse scale (as float),
-    # as both required as inputs to torch._scaled_mm
-    qweight = qweight.to(qdtype)
-    scale = scale.float().reciprocal()
-    return qweight, scale
-
-
-class Fp8Linear(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-    ) -> None:
-        super().__init__()
-        self.dtype = weight.dtype
-        self.qweight, self.scale = fp8_quantize(weight)
-
-        self.bias = bias if bias is not None else None
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        qinput, scale = fp8_quantize(input)
-        output, _ = torch._scaled_mm(
-            qinput,
-            self.qweight.t(),
-            out_dtype=self.dtype,
-            scale_a=scale,
-            scale_b=self.scale,
-            bias=self.bias,
-        )
-        return output
-
-
-class Linear8bitLt(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-        has_fp16_weights=True,
-        memory_efficient_backward=False,
-        threshold=0.0,
-        index=None,
-    ):
-        super().__init__()
-        assert (
-            not memory_efficient_backward
-        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
-        self.state = bnb.MatmulLtState()
-        self.index = index
-
-        # Necessary for stacked layers
-        self.state.threshold = threshold
-        self.state.has_fp16_weights = has_fp16_weights
-        self.state.memory_efficient_backward = memory_efficient_backward
-        if threshold > 0.0 and not has_fp16_weights:
-            self.state.use_pool = True
-
-        self.weight = Int8Params(
-            weight.data,
-            has_fp16_weights=has_fp16_weights,
-            requires_grad=has_fp16_weights,
-        )
-        self.weight.cuda(weight.device)
-        self.bias = bias
-
-    def init_8bit_state(self):
-        self.state.CB = self.weight.CB
-        self.state.SCB = self.weight.SCB
-        self.weight.CB = None
-        self.weight.SCB = None
-
-    def forward(self, x: torch.Tensor):
-        self.state.is_training = self.training
-        if self.weight.CB is not None:
-            self.init_8bit_state()
-
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
-
-        if not self.state.has_fp16_weights:
-            if self.state.CB is not None and self.state.CxB is not None:
-                # we converted 8-bit row major to turing/ampere format in the first inference pass
-                # we no longer need the row-major weight
-                del self.state.CB
-                self.weight.data = self.state.CxB
-        return out
-
-
-class Linear4bit(nn.Module):
-    def __init__(self, weight, bias, quant_type):
-        super().__init__()
-        self.weight = Params4bit(
-            weight.data,
-            requires_grad=False,
-            compress_statistics=True,
-            quant_type=quant_type,
-        )
-        self.compute_dtype = None
-        self.weight.cuda(weight.device)
-        self.bias = bias
-
-    def forward(self, x: torch.Tensor):
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        if getattr(self.weight, "quant_state", None) is None:
-            print(
-                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
-            )
-        inp_dtype = x.dtype
-        if self.compute_dtype is not None:
-            x = x.to(self.compute_dtype)
-
-        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
-        out = bnb.matmul_4bit(
-            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
-        )
-
-        out = out.to(inp_dtype)
-
-        return out
-
-
-@lru_cache(1)
-def warn_deprecate_bnb():
-    logger.warning(
-        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce"
-    )
-
-
-def get_linear(weight, bias, quantize):
-    if quantize is None:
-        linear = FastLinear(weight, bias)
-    elif quantize == "eetq":
-        if HAS_EETQ:
-            linear = EETQLinear(weight, bias)
-        else:
-            raise ImportError(
-                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
-            )
-    elif quantize == "fp8":
-        linear = Fp8Linear(weight, bias)
-    elif quantize == "bitsandbytes":
-        warn_deprecate_bnb()
-        linear = Linear8bitLt(
-            weight,
-            bias,
-            has_fp16_weights=False,
-            threshold=6.0,
-        )
-        if bias is not None:
-            linear.bias = nn.Parameter(bias)
-    elif quantize == "bitsandbytes-fp4":
-        linear = Linear4bit(
-            weight,
-            bias,
-            quant_type="fp4",
-        )
-    elif quantize == "bitsandbytes-nf4":
-        linear = Linear4bit(
-            weight,
-            bias,
-            quant_type="nf4",
-        )
-    elif quantize == "gptq":
-        try:
-            qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight
-        except Exception:
-            raise NotImplementedError(
-                f"The passed weight is not `gptq` compatible, loader needs to be updated."
-            )
-
-        if use_exllama:
-            linear = ExllamaQuantLinear(
-                qweight, qzeros, scales, g_idx, bias, bits, groupsize
-            )
-        else:
-            linear = QuantLinear(
-                qweight,
-                qzeros,
-                scales,
-                g_idx,
-                bias,
-                bits,
-                groupsize,
-            )
-    elif quantize == "awq":
-        try:
-            qweight, qzeros, scales, _, bits, groupsize, _ = weight
-        except Exception:
-            raise NotImplementedError(
-                f"The passed weight is not `awq` compatible, loader needs to be updated."
-            )
-        if IS_ROCM_SYSTEM:
-            raise NotImplementedError(
-                "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
-                "to use Exllama/GPTQ kernels for AWQ inference."
-            )
-        if not HAS_AWQ:
-            raise NotImplementedError(
-                "You do not seem to have awq installed, either install it (cd server &&  make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
-            )
-        linear = WQLinear(
-            w_bit=bits,
-            group_size=groupsize,
-            qweight=qweight,
-            qzeros=qzeros,
-            scales=scales,
-            bias=bias is not None,
-        )
-    else:
-        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
-    return linear
-
-
-class SuperLayer(nn.Module):
-    def __init__(self, linear):
-        super().__init__()
-        self.linear = linear
-
-    def forward(self, x):
-        return self.linear.forward(x)
-
-
-class ResBlock(torch.nn.Module):
-    def __init__(self, config, prefix, weights):
-        super().__init__()
-        self.linear = FastLinear.load(
-            config, prefix=f"{prefix}.linear", weights=weights, bias=True
-        )
-        self.act = torch.nn.SiLU()
-
-    def forward(self, x):
-        return x + self.act(self.linear(x))
-
-
-class MedusaModel(torch.nn.Module):
-    def __init__(self, config, medusa_config, weights):
-        super().__init__()
-        self.heads = torch.nn.ModuleList(
-            [
-                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
-                for i in range(get_speculate())
-            ]
-        )
-
-    def forward(self, x):
-        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
-        return speculative_logits
-
-
-class MedusaHead(torch.nn.Module):
-    def __init__(self, config, medusa_config, prefix, weights):
-        super().__init__()
-        self.blocks = torch.nn.ModuleList(
-            [
-                ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
-                for i in range(medusa_config["medusa_num_layers"])
-            ]
-        )
-        n = len(self.blocks)
-        self.out = FastLinear.load(
-            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
-        )
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-        x = self.out(x)
-        return x
-
-
-class MedusaHeadV1(nn.Module):
-    def __init__(self, lm_head, medusa):
-        super().__init__()
-        self.lm_head = lm_head
-        self.medusa = medusa
-
-    @staticmethod
-    def load(config, prefix: str, weights):
-        from pathlib import Path
-        from safetensors import safe_open
-        import json
-
-        use_medusa = config.use_medusa
-
-        medusa_config = str(Path(use_medusa) / "config.json")
-        filename = str(Path(use_medusa) / "medusa_lm_head.safetensors")
-
-        with open(medusa_config, "r") as f:
-            medusa_config = json.load(f)
-        routing = weights.routing
-        with safe_open(filename, framework="pytorch") as f:
-            for k in f.keys():
-                if k in routing and routing[k] != filename:
-                    raise RuntimeError(
-                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
-                    )
-                routing[k] = filename
-
-        medusa = MedusaModel(config, medusa_config, weights)
-        lm_head = TensorParallelHead.load(config, prefix, weights)
-        return MedusaHeadV1(lm_head, medusa)
-
-    def forward(
-        self, input: torch.Tensor
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        logits = self.lm_head(input)
-        # If we have too many tokens, we skip speculative logits
-        if input.shape[0] > 128:
-            return logits, None
-
-        speculative_logits = self.medusa(input)
-        return logits, speculative_logits
-
-
-class MedusaHeadV2(nn.Module):
-    def __init__(self, config, prefix, weights):
-        super().__init__()
-        from pathlib import Path
-        from safetensors import safe_open
-        import json
-
-        use_medusa = config.use_medusa
-
-        medusa_config = str(Path(use_medusa) / "config.json")
-        filename = str(Path(use_medusa) / "medusa_lm_head.safetensors")
-
-        with open(medusa_config, "r") as f:
-            medusa_config = json.load(f)
-        routing = weights.routing
-        with safe_open(filename, framework="pytorch") as f:
-            for k in f.keys():
-                if k in routing and routing[k] != filename:
-                    raise RuntimeError(
-                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
-                    )
-                routing[k] = filename
-
-        self.n_medusa_heads = get_speculate()
-
-        assert medusa_config["medusa_num_layers"] == 1
-        self.linear = TensorParallelColumnLinear.load_multi(
-            config,
-            prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)],
-            dim=0,
-            weights=weights,
-            bias=True,
-        )
-        self.process_group = weights.process_group
-        self.world_size = self.process_group.size()
-        self.rank = self.process_group.rank()
-
-        self.act = torch.nn.SiLU()
-
-        self.lm_head = TensorParallelHead.load(config, prefix, weights)
-
-    def forward(self, x):
-        # If we have too many tokens, we skip speculative logits
-        if x.shape[0] > 128:
-            logits = self.lm_head(x)
-            return logits, None
-
-        size = x.shape[-1]
-        block_size = (size + self.world_size - 1) // self.world_size
-        start = self.rank * block_size
-        stop = (self.rank + 1) * block_size
-
-        x_block = x[:, start:stop]
-
-        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
-        medusa_res = self.act(self.linear(x)).reshape(
-            *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1]
-        )
-
-        # Apply all residual medusa heads
-        output = x[:, start:stop].unsqueeze(-2) + medusa_res
-
-        # Gather medusa heads
-        world_output = [
-            torch.empty_like(output) for _ in range(self.process_group.size())
-        ]
-        torch.distributed.all_gather(world_output, output, group=self.process_group)
-        world_output = torch.cat(world_output, dim=-1)
-
-        # Stack x and medusa residual x
-        stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2)
-
-        # Compute lm head on x + medusa residual x
-        logits = self.lm_head(stacked_x)
-
-        # Finally, split logits from speculative logits
-        logits, speculative_logits = torch.split(
-            logits, [1, self.n_medusa_heads], dim=-2
-        )
-        # Squeeze added dimension
-        logits = logits.squeeze(-2)
-
-        return logits, speculative_logits
-
-
-class SpeculativeHead(nn.Module):
-    def __init__(self, lm_head, medusa):
-        super().__init__()
-        self.head = lm_head
-        self.medusa = medusa
-
-    @staticmethod
-    def load(config, prefix: str, weights):
-        use_medusa = config.use_medusa
-        if use_medusa:
-            lm_head = None
-            try:
-                medusa = MedusaHeadV1.load(config, prefix, weights)
-            except:
-                medusa = MedusaHeadV2(config, prefix, weights)
-        else:
-            lm_head = TensorParallelHead.load(config, prefix, weights)
-            medusa = None
-        return SpeculativeHead(lm_head, medusa)
-
-    def forward(
-        self, input: torch.Tensor
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        if self.medusa is not None:
-            return self.medusa(input)
-
-        assert self.head is not None
-        logits = self.head(input)
-        return logits, None
-
-
-class TensorParallelHead(SuperLayer):
-    def __init__(self, linear, process_group, should_gather: bool):
-        super().__init__(linear)
-        self.process_group = process_group
-        self.should_gather = should_gather
-
-    @staticmethod
-    def load(config, prefix: str, weights):
-        if weights.process_group.size() > 1:
-            try:
-                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
-                should_gather = True
-            except AssertionError:
-                # If the vocab size is not divisible by number of shards
-                # just load the entire thing.
-                weight = weights.get_tensor(f"{prefix}.weight")
-                should_gather = False
-        else:
-            weight = weights.get_tensor(f"{prefix}.weight")
-            should_gather = False
-
-        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
-        if config.quantize in ["gptq", "awq", "eetq"]:
-            quantize = None
-        else:
-            quantize = config.quantize
-        return TensorParallelHead(
-            get_linear(weight, bias=None, quantize=quantize),
-            process_group=weights.process_group,
-            should_gather=should_gather,
-        )
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        if not self.should_gather:
-            return super().forward(input)
-
-        world_size = self.process_group.size()
-        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
-            out_dim = self.linear.weight.shape[0]
-
-            if input.shape[0] == 1:
-                world_out = input.new_empty(1, out_dim * world_size)
-                local_out = input.new_empty(1, out_dim)
-                gather_input = local_out
-            else:
-                world_out = input.new_empty(out_dim * world_size, input.shape[0])
-                gather_input = input.new_empty(out_dim, input.shape[0])
-                local_out = gather_input.T
-
-            torch.mm(input, self.linear.weight.T, out=local_out)
-
-            torch.distributed.all_gather_into_tensor(
-                world_out, gather_input, group=self.process_group
-            )
-
-            if input.shape[0] == 1:
-                return world_out
-            return world_out.T
-
-        output = super().forward(input)
-        world_output = [
-            torch.empty_like(output) for _ in range(self.process_group.size())
-        ]
-        torch.distributed.all_gather(world_output, output, group=self.process_group)
-        world_output = torch.cat(world_output, dim=-1)
-        return world_output
-
-
-class TensorParallelColumnLinear(SuperLayer):
-    @classmethod
-    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
-        """Specific method when the QKV was joined after the fact"""
-        weight = weights.get_weights_col_packed_gate_up(
-            prefix, quantize=config.quantize
-        )
-        if bias:
-            raise NotImplementedError("packed_gate_up only implemented without bias")
-        else:
-            bias = None
-        linear = get_linear(weight, bias, config.quantize)
-        return cls(linear)
-
-    @classmethod
-    def load_qkv(cls, config, prefix: str, weights, bias: bool):
-        """Specific method when the QKV was joined after the fact"""
-        weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize)
-        if bias:
-            raise NotImplementedError("packed_qkv only implemented for baichuan")
-        else:
-            bias = None
-        linear = get_linear(weight, bias, config.quantize)
-        return cls(linear)
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        return cls.load_multi(config, [prefix], weights, bias, dim=0)
-
-    @classmethod
-    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
-        weight = weights.get_multi_weights_col(
-            prefixes, quantize=config.quantize, dim=dim
-        )
-
-        if bias:
-            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
-            bias = torch.cat(b, dim=dim)
-        else:
-            bias = None
-        linear = get_linear(weight, bias, config.quantize)
-        return cls(linear)
-
-
-class TensorParallelRowLinear(SuperLayer):
-    def __init__(self, linear, process_group):
-        super().__init__(linear)
-        self.process_group = process_group
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
-
-        if bias and weights.process_group.rank() == 0:
-            # Rank is only on the first rank process
-            bias = weights.get_tensor(f"{prefix}.bias")
-        else:
-            bias = None
-        return cls(
-            get_linear(weight, bias, config.quantize),
-            process_group=weights.process_group,
-        )
-
-    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
-        out = super().forward(input)
-        if self.process_group.size() > 1 and reduce:
-            torch.distributed.all_reduce(out, group=self.process_group)
-        return out
-
-
-class TensorParallelEmbedding(nn.Module):
-    def __init__(self, prefix: str, weights, reduce=True):
-        super().__init__()
-        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
-        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
-
-        process_group = weights.process_group
-
-        world_size = process_group.size()
-        rank = process_group.rank()
-
-        block_size = (num_embeddings + world_size - 1) // world_size
-        self.min_id = rank * block_size
-        self.max_id = min(num_embeddings, (rank + 1) * block_size)
-        self.null_idx = weight.shape[
-            0
-        ]  # Usually block_size, might be less in non even vocab_size.
-        self.process_group = weights.process_group
-        self.reduce = reduce
-
-        """Additional 0 entry used for masking"""
-        self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
-        # translate for [0, self.max_id - self.min_id[
-        input = torch.where(
-            (self.min_id > input) | (input >= self.max_id),
-            self.null_idx,
-            input - self.min_id,
-        )
-        out = torch.nn.functional.embedding(input, self.weight)
-        if self.reduce and self.process_group.size() > 1:
-            torch.distributed.all_reduce(out, group=self.process_group)
-        return out
-
-
-try:
-    if IS_CUDA_SYSTEM:
-        import dropout_layer_norm
-    elif IS_ROCM_SYSTEM:
-        from vllm import layernorm_ops
-    else:
-        dropout_layer_norm = None
-
-    class FastLayerNorm(nn.LayerNorm):
-        def forward(self, hidden_states, residual=None):
-            if IS_XPU_SYSTEM:
-                res_out = hidden_states
-                out = ipex.llm.functional.add_layer_norm(
-                    residual, hidden_states, self.weight, self.bias, self.eps, True
-                )
-                if residual is not None:
-                    res_out = residual
-                return out, res_out
-            elif hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
-                if residual is not None:
-                    hidden_states += residual
-                residual = hidden_states
-
-                return super(FastLayerNorm, self).forward(hidden_states), residual
-            else:
-                (
-                    normed_hidden_states,
-                    residual,
-                    *rest,
-                ) = dropout_layer_norm.dropout_add_ln_fwd(
-                    hidden_states,
-                    residual,
-                    self.weight,
-                    self.bias,
-                    None,
-                    None,
-                    None,
-                    None,
-                    0.0,
-                    self.eps,
-                    1.0,
-                    0,
-                    None,
-                    False,
-                    False,
-                )
-                if residual is None:
-                    residual = hidden_states
-
-                return normed_hidden_states, residual
-
-    class FastRMSNorm(nn.Module):
-        def __init__(self, weight: torch.Tensor, eps: float):
-            super().__init__()
-
-            self.weight = nn.Parameter(weight)
-            self.variance_epsilon = eps
-
-        @classmethod
-        def load(cls, prefix, weights, eps=1e-6):
-            weight = weights.get_tensor(f"{prefix}.weight")
-            return cls(weight, eps)
-
-        def forward(self, hidden_states, residual=None):
-            if IS_XPU_SYSTEM:
-                residual_out = hidden_states
-                out = ipex.llm.functional.add_rms_norm(
-                    residual,
-                    hidden_states,
-                    self.weight,
-                    None,
-                    self.variance_epsilon,
-                    True,
-                )
-                if residual is not None:
-                    residual_out = residual
-                return out, residual_out
-            elif hidden_states.shape[-1] > 8192:
-                if residual is not None:
-                    hidden_states += residual
-                residual = hidden_states
-
-                hidden_states = hidden_states.to(torch.float32)
-                variance = hidden_states.pow(2).mean(-1, keepdim=True)
-                hidden_states = hidden_states * torch.rsqrt(
-                    variance + self.variance_epsilon
-                )
-
-                # convert into half-precision if necessary
-                if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                    hidden_states = hidden_states.to(self.weight.dtype)
-
-                return self.weight * hidden_states, residual
-            elif IS_CUDA_SYSTEM:
-                # faster post attention rms norm
-                (
-                    normed_hidden_states,
-                    res,
-                    *rest,
-                ) = dropout_layer_norm.dropout_add_ln_fwd(
-                    hidden_states,
-                    residual,
-                    self.weight,
-                    None,
-                    None,
-                    None,
-                    None,
-                    None,
-                    0.0,
-                    self.variance_epsilon,
-                    1.0,
-                    0,
-                    None,
-                    False,
-                    True,  # Activate RMSNorm
-                )
-                if res is None:
-                    res = hidden_states
-
-                return normed_hidden_states, res
-            elif IS_ROCM_SYSTEM:
-                # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
-                if residual is not None:
-                    hidden_states += residual
-                residual = hidden_states
-
-                out = torch.empty_like(hidden_states)
-                layernorm_ops.rms_norm(
-                    out,
-                    hidden_states,
-                    self.weight.data,
-                    self.variance_epsilon,
-                )
-                return out, residual
-            else:
-                raise ValueError(
-                    "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
-                )
-
-except ImportError:
-    pass
-
-try:
-    if IS_CUDA_SYSTEM:
-        from flash_attn.layers.rotary import RotaryEmbedding
-        import rotary_emb
-    elif IS_ROCM_SYSTEM:
-        from vllm import pos_encoding_ops
-
-    def _create_inv_freq(dim, base, device):
-        inv_freq = 1.0 / (
-            base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
-        )
-        return inv_freq
-
-    def _get_rope_config(config):
-        if os.getenv("ROPE_SCALING", None) is not None:
-            rope_scaling = {
-                "type": os.environ["ROPE_SCALING"],
-                "factor": float(os.environ["ROPE_FACTOR"]),
-            }
-            return rope_scaling
-        return getattr(config, "rope_scaling", None)
-
-    class PositionRotaryEmbedding(nn.Module):
-        def __init__(self, inv_freq, scaling_factor):
-            super().__init__()
-            self.inv_freq = inv_freq
-            self._seq_len_cached = 0
-            self._cos_cached = None
-            self._sin_cached = None
-            self._cos_k_cached = None
-            self._sin_k_cached = None
-            self.scaling_factor = scaling_factor
-            self.dynamic_args = None
-
-        def forward(
-            self,
-            query: torch.Tensor,
-            key: torch.Tensor,
-            cos: torch.Tensor,
-            sin: torch.Tensor,
-        ):
-            # Such controlflows may add some overhead.
-            if IS_CUDA_SYSTEM:
-                rotary_dim = cos.shape[-1]
-                q1 = query[..., :rotary_dim]
-                q2 = query[..., rotary_dim : 2 * rotary_dim]
-
-                rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
-
-                k1 = key[..., :rotary_dim]
-                k2 = key[..., rotary_dim : 2 * rotary_dim]
-
-                rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
-            elif IS_ROCM_SYSTEM:
-                # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
-                # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
-
-                head_size = query.shape[-1]
-
-                # Inplace operation, updating query and key.
-                pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True)
-            elif IS_XPU_SYSTEM:
-                ipex.llm.functional.rotary_embedding(
-                    query, key, sin, cos, query.size(-1), True
-                )
-            else:
-                raise ValueError(
-                    "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
-                )
-
-        @classmethod
-        def static(cls, config, dim, base, device):
-            inv_freq = _create_inv_freq(dim, base, device)
-            scaling_factor = None
-            rope_scaling = _get_rope_config(config)
-            if rope_scaling is not None:
-                if rope_scaling["type"] == "linear":
-                    pass
-                elif rope_scaling["type"] == "dynamic":
-                    scaling_factor = rope_scaling["factor"]
-                    return DynamicPositionRotaryEmbedding(
-                        dim=dim,
-                        max_position_embeddings=config.max_position_embeddings,
-                        base=base,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                    )
-                elif rope_scaling["type"] == "yarn":
-                    scaling_factor = rope_scaling["factor"]
-                    return YarnPositionRotaryEmbedding(
-                        dim=2 * inv_freq.shape[0],
-                        max_position_embeddings=rope_scaling[
-                            "original_max_position_embeddings"
-                        ],
-                        base=10000.0,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                        extrapolation_factor=1,
-                        attn_factor=1,
-                        beta_fast=32,
-                        beta_slow=1,
-                    )
-                elif rope_scaling["type"] == "su":
-                    short_factor = torch.tensor(
-                        rope_scaling["short_factor"], dtype=torch.float32, device=device
-                    )
-                    short_inv_freq = 1.0 / (
-                        short_factor
-                        * base
-                        ** (
-                            torch.arange(0, dim, 2, device=device, dtype=torch.float32)
-                            / dim
-                        )
-                    )
-                    long_factor = torch.tensor(
-                        rope_scaling["long_factor"], dtype=torch.float32, device=device
-                    )
-                    long_inv_freq = 1.0 / (
-                        long_factor
-                        * base
-                        ** (
-                            torch.arange(0, dim, 2, device=device, dtype=torch.float32)
-                            / dim
-                        )
-                    )
-
-                    original_max_position_embeddings = (
-                        config.original_max_position_embeddings
-                    )
-                    max_position_embeddings = config.max_position_embeddings
-                    if max_position_embeddings <= original_max_position_embeddings:
-                        scaling_factor = 1.0
-                    else:
-                        scale = (
-                            max_position_embeddings / original_max_position_embeddings
-                        )
-                        scaling_factor = math.sqrt(
-                            1
-                            + math.log(scale)
-                            / math.log(original_max_position_embeddings)
-                        )
-
-                    return SuRotaryEmbedding(
-                        short_inv_freq=short_inv_freq,
-                        long_inv_freq=long_inv_freq,
-                        scaling_factor=scaling_factor,
-                        original_max_position_embeddings=original_max_position_embeddings,
-                    )
-                else:
-                    raise NotImplementedError(
-                        f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
-                    )
-            return cls(inv_freq, scaling_factor)
-
-        @classmethod
-        def load(cls, config, prefix, weights):
-            # XXX: Always load this in float32 !
-            dtype = weights.dtype
-            weights.dtype = torch.float32
-            inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
-            weights.dtype = dtype
-
-            scaling_factor = None
-            rope_scaling = _get_rope_config(config)
-            if rope_scaling is not None:
-                scaling_factor = rope_scaling["factor"]
-                if rope_scaling["type"] == "linear":
-                    pass
-                elif rope_scaling["type"] == "dynamic":
-                    return DynamicPositionRotaryEmbedding(
-                        dim=2 * inv_freq.shape[0],
-                        max_position_embeddings=config.max_position_embeddings,
-                        base=10000.0,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                    )
-                elif rope_scaling["type"] == "yarn":
-                    return YarnPositionRotaryEmbedding(
-                        dim=2 * inv_freq.shape[0],
-                        max_position_embeddings=rope_scaling[
-                            "original_max_position_embeddings"
-                        ],
-                        base=10000.0,
-                        device=inv_freq.device,
-                        scaling_factor=scaling_factor,
-                        extrapolation_factor=1,
-                        attn_factor=1,
-                        beta_fast=32,
-                        beta_slow=1,
-                    )
-                else:
-                    raise NotImplementedError(
-                        f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
-                    )
-            return cls(inv_freq, scaling_factor)
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                if self.scaling_factor is not None:
-                    t /= self.scaling_factor
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-
-        def get_cos_sin(
-            self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
-        ):
-            """
-            Return cos and sin for the asked position ids
-            """
-            if IS_ROCM_SYSTEM:
-                # For RoCm, we always use float cos/sin to avoid a cast.
-                # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
-                # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
-                dtype = torch.float32
-
-            self._update_cos_sin_cache(dtype, position_ids.device, max_s)
-
-            cos = torch.index_select(self._cos_cached, 0, position_ids)
-            sin = torch.index_select(self._sin_cached, 0, position_ids)
-
-            # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
-            return cos.unsqueeze(1), sin.unsqueeze(1)
-
-    class SuRotaryEmbedding(PositionRotaryEmbedding):
-        def __init__(
-            self,
-            short_inv_freq,
-            long_inv_freq,
-            scaling_factor,
-            original_max_position_embeddings,
-        ):
-            super(PositionRotaryEmbedding, self).__init__()
-            self.short_inv_freq = short_inv_freq
-            self.long_inv_freq = long_inv_freq
-            self.scaling_factor = scaling_factor
-            self.original_max_position_embeddings = original_max_position_embeddings
-            self._seq_len_cached = 0
-            self._cos_cached = None
-            self._sin_cached = None
-            self._cos_k_cached = None
-            self._sin_k_cached = None
-            self.dynamic_args = None
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                self._seq_len_cached = seqlen
-                if seqlen > self.original_max_position_embeddings:
-                    inv_freq = self.long_inv_freq
-                else:
-                    inv_freq = self.short_inv_freq
-                t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype)
-                if self.scaling_factor is not None:
-                    t /= self.scaling_factor
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, inv_freq.to(device=t.device))
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-
-    class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
-        def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
-            inv_freq = _create_inv_freq(dim, base, device)
-            super().__init__(inv_freq, scaling_factor)
-            self.dim = dim
-            self.max_position_embeddings = max_position_embeddings
-            self.base = base
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                if seqlen > self.max_position_embeddings:
-                    newbase = self.base * (
-                        (self.scaling_factor * seqlen / self.max_position_embeddings)
-                        - (self.scaling_factor - 1)
-                    ) ** (self.dim / (self.dim - 2))
-                    self.inv_freq = _create_inv_freq(
-                        self.dim, newbase, self.inv_freq.device
-                    )
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-
-    # Inverse dim formula to find dim based on number of rotations
-    import math
-
-    def find_correction_dim(
-        num_rotations, dim, base=10000, max_position_embeddings=2048
-    ):
-        return (
-            dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
-        ) / (2 * math.log(base))
-
-    # Find dim range bounds based on rotations
-    def find_correction_range(
-        low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
-    ):
-        low = math.floor(
-            find_correction_dim(low_rot, dim, base, max_position_embeddings)
-        )
-        high = math.ceil(
-            find_correction_dim(high_rot, dim, base, max_position_embeddings)
-        )
-        return max(low, 0), min(high, dim - 1)  # Clamp values just in case
-
-    def linear_ramp_mask(min, max, dim):
-        if min == max:
-            max += 0.001  # Prevent singularity
-
-        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
-        ramp_func = torch.clamp(linear_func, 0, 1)
-        return ramp_func
-
-    def get_mscale(scale=1):
-        if scale <= 1:
-            return 1.0
-        return 0.1 * math.log(scale) + 1.0
-
-    class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
-        def __init__(
-            self,
-            dim,
-            max_position_embeddings,
-            base,
-            device,
-            scaling_factor,
-            *,
-            extrapolation_factor,
-            attn_factor,
-            beta_fast,
-            beta_slow,
-        ):
-            inv_freq = _create_inv_freq(dim, base, device)
-            super().__init__(inv_freq, scaling_factor)
-            self.dim = dim
-            self.max_position_embeddings = max_position_embeddings
-            self.base = base
-            self.extrapolation_factor = extrapolation_factor
-            self.attn_factor = attn_factor
-            self.beta_fast = beta_fast
-            self.beta_slow = beta_slow
-            self.mscale = float(
-                get_mscale(self.scaling_factor) * self.attn_factor
-            )  # Get n-d magnitude scaling corrected for interpolation
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                if seqlen > self.max_position_embeddings:
-                    inv_freq_extrapolation = _create_inv_freq(
-                        self.dim, self.base, self.inv_freq.device
-                    )
-                    freqs = 1.0 / inv_freq_extrapolation
-                    inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
-                    low, high = find_correction_range(
-                        self.beta_fast,
-                        self.beta_slow,
-                        self.dim,
-                        self.base,
-                        self.max_position_embeddings,
-                    )
-                    inv_freq_mask = (
-                        1
-                        - linear_ramp_mask(low, high, self.dim // 2).float().to(device)
-                    ) * self.extrapolation_factor  # Get n-d rotational scaling corrected for extrapolation
-                    inv_freq = (
-                        inv_freq_interpolation * (1 - inv_freq_mask)
-                        + inv_freq_extrapolation * inv_freq_mask
-                    )
-
-                    self.inv_freq = inv_freq
-                    self.mscale = float(
-                        get_mscale(self.scaling_factor) * self.attn_factor
-                    )  # Get n-d magnitude scaling corrected for interpolation
-
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
-                self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)
-
-except ImportError:
-    pass
diff --git a/server/text_generation_server/utils/paged_attention.py b/server/text_generation_server/utils/paged_attention.py
index 62c0c893f6..1b31f7e71d 100644
--- a/server/text_generation_server/utils/paged_attention.py
+++ b/server/text_generation_server/utils/paged_attention.py
@@ -1,13 +1,9 @@
 import torch
-from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
-)
+from text_generation_server.utils.import_utils import SYSTEM
 
 _PARTITION_SIZE = 512
 
-if IS_XPU_SYSTEM:
+if SYSTEM == "xpu":
     import intel_extension_for_pytorch as ipex
 
 
@@ -18,17 +14,17 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slots: torch.Tensor,
 ):
-    if IS_CUDA_SYSTEM:
+    if SYSTEM == "cuda":
         from vllm._C import cache_ops
 
         cache_ops.reshape_and_cache(
             key, value, key_cache, value_cache, slots, "auto", 1.0
         )
-    elif IS_ROCM_SYSTEM:
+    elif SYSTEM == "rocm":
         from vllm import cache_ops
 
         cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
-    elif IS_XPU_SYSTEM:
+    elif SYSTEM == "xpu":
         ipex.llm.modules.PagedAttention.reshape_and_cache(
             key, value, key_cache, value_cache, slots
         )
@@ -68,7 +64,7 @@ def attention(
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
-    if IS_XPU_SYSTEM:
+    if SYSTEM == "xpu":
         query = query.contiguous()
         return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
             out,
@@ -91,7 +87,7 @@ def attention(
     # to parallelize.
     use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
     if use_v1:
-        if IS_CUDA_SYSTEM:
+        if SYSTEM == "cuda":
             from vllm._C import ops
 
             ops.paged_attention_v1(
@@ -109,7 +105,7 @@ def attention(
                 "auto",
                 1.0,
             )
-        elif IS_ROCM_SYSTEM:
+        elif SYSTEM == "rocm":
             from vllm import attention_ops
 
             attention_ops.paged_attention_v1(
@@ -143,7 +139,7 @@ def attention(
         )
         max_logits = torch.empty_like(exp_sums)
 
-        if IS_CUDA_SYSTEM:
+        if SYSTEM == "cuda":
             from vllm._C import ops
 
             ops.paged_attention_v2(
@@ -164,7 +160,7 @@ def attention(
                 "auto",
                 1.0,
             )
-        elif IS_ROCM_SYSTEM:
+        elif SYSTEM == "rocm":
             from vllm import attention_ops
 
             attention_ops.paged_attention_v2(
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
index da7aed1a27..6af7d3fb85 100644
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -171,7 +171,7 @@ def get_weights_col_packed(self, prefix: str, quantize: str, blocks: int):
                 log_once(
                     logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
                 )
-                from text_generation_server.utils.awq.conversion_utils import (
+                from text_generation_server.layers.awq.conversion_utils import (
                     fast_awq_to_gptq,
                 )
 
@@ -227,7 +227,7 @@ def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
 
             bits, groupsize, desc_act, quant_method = self._get_gptq_params()
 
-            from text_generation_server.utils.layers import HAS_EXLLAMA
+            from text_generation_server.layers.gptq import HAS_EXLLAMA
 
             use_exllama = (
                 bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
@@ -242,7 +242,7 @@ def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
                 log_once(
                     logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
                 )
-                from text_generation_server.utils.awq.conversion_utils import (
+                from text_generation_server.layers.awq.conversion_utils import (
                     fast_awq_to_gptq,
                 )
 
@@ -321,7 +321,7 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
                         # it would require to reorder input activations that are split unto several GPUs
                         use_exllama = False
 
-            from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA
+            from text_generation_server.layers.gptq import HAS_EXLLAMA, CAN_EXLLAMA
 
             if use_exllama:
                 if not HAS_EXLLAMA:
@@ -348,7 +348,7 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
                 log_once(
                     logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
                 )
-                from text_generation_server.utils.awq.conversion_utils import (
+                from text_generation_server.layers.awq.conversion_utils import (
                     fast_awq_to_gptq,
                 )