In [1]:
import os
import sys
sys.path.append("WavTokenizer")

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
#os.environ['CUDA_LAUNCH_BLOCKING']='1'
import librosa

import torchaudio
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

from speechtokenizer import SpeechTokenizer
from WavTokenizer.decoder.pretrained import WavTokenizer
from audiotools import AudioSignal


def resample(audio_data: torch.Tensor, sample_rate: int):
    """Return 16 kHz and 24 kHz mono copies of a waveform.

    Args:
        audio_data: Waveform tensor, either 1-D ``(samples,)`` or 2-D with the
            channel axis first (the layout ``torchaudio.load`` produces in the
            callers of this function).
        sample_rate: Sampling rate of ``audio_data`` in Hz.

    Returns:
        A tuple ``(audio_16k, audio_24k)``; each is a float tensor reshaped to
        ``(1, n_samples)`` and moved to the module-level ``device``.
    """
    print("Input sample rate:", sample_rate)

    # Fix: a multi-channel recording used to be flattened by ``view(1, -1)``,
    # which glues the channels one after another into a single, longer signal.
    # Average the channels into one mono track before resampling instead.
    if audio_data.dim() > 1 and audio_data.shape[0] > 1:
        audio_data = audio_data.float().mean(dim=0, keepdim=True)

    def _at_rate(target_sr: int) -> torch.Tensor:
        # Skip the resampler entirely when the input is already at the target rate.
        if sample_rate == target_sr:
            return audio_data
        return torch.tensor(
            librosa.resample(
                audio_data.cpu().detach().numpy(), orig_sr=sample_rate, target_sr=target_sr
            )
        )

    if sample_rate not in (16000, 24000):
        print("Resampling everything")

    audio_data16k = _at_rate(16000)
    audio_data24k = _at_rate(24000)

    return (audio_data16k.view(1, -1).float().to(device),
            audio_data24k.view(1, -1).float().to(device))


def decode_tts(tokens, quantizer, n_codebooks, n_original_tokens, start_audio_token_id, end_audio_token_id):
    """Turn a generated token sequence into an ``AudioSignal``.

    Args:
        tokens: 1-D tensor of token ids (prompt plus generated tokens).
        quantizer: Object exposing ``decode(codes)`` and ``sample_rate``.
        n_codebooks: Number of codebook entries that make up one audio frame.
        n_original_tokens: Modulus applied to the ids to recover codebook
            indices (callers pass ``len(tokenizer)``).
        start_audio_token_id: Id marking the start of the audio span. Callers
            pass a ``(1, 1)`` tensor; the comparison below broadcasts it.
        end_audio_token_id: Id marking the end of the audio span (same form).

    Returns:
        ``AudioSignal`` built from the decoded waveform and
        ``quantizer.sample_rate``.
    """
    # find start and end indices of audio tokens
    start = torch.nonzero(tokens == start_audio_token_id)
    end = torch.nonzero(tokens == end_audio_token_id)

    # Each row of ``nonzero`` is an index tuple; its last entry is the position
    # inside ``tokens``. Use the first match, or fall back to the whole sequence.
    start = start[0, -1] + 1 if len(start) else 0
    end = end[0, -1] if len(end) else tokens.shape[-1]

    # Reduce vocabulary ids modulo the original vocabulary size to get codebook indices.
    audio_tokens = tokens[start:end] % n_original_tokens
    remainder = audio_tokens.shape[-1] % n_codebooks

    if remainder:
        # Pad if the last frame is incomplete.
        # Fix: the padding used to be a float tensor on a hard-coded "cuda"
        # device, so ``torch.cat`` promoted the integer codes to float (and failed
        # outright when ``tokens`` lived elsewhere). Match dtype and device.
        pad_tokens = torch.zeros(
            n_codebooks - remainder, dtype=audio_tokens.dtype, device=audio_tokens.device
        )
        audio_tokens = torch.cat([audio_tokens, pad_tokens], dim=0)

    # (frames, n_codebooks) -> (n_codebooks, frames) -> (n_codebooks, 1, frames)
    transposed = audio_tokens.view(-1, n_codebooks).t()
    codes = transposed.view(n_codebooks, 1, -1).to(device)

    audio = quantizer.decode(codes).squeeze(0)

    torch.cuda.empty_cache()

    return AudioSignal(audio.detach().cpu().numpy(), quantizer.sample_rate)


def infer_text_to_audio(text, model, tokenizer, quantizer, max_seq_length=1024, top_k=20):
    """Generate speech for ``text`` and return the decoded audio signal.

    The prompt is the tokenized text followed by the start-of-audio marker. The
    model samples up to ``max_seq_length`` new tokens and the full sequence is
    handed to ``decode_tts`` together with ``quantizer``.

    Reads the module-level ``device``, ``start_audio_token`` and
    ``end_audio_token``.

    NOTE: a later ``def infer_text_to_audio`` in this same cell rebinds the name.

    Returns:
        Whatever ``decode_tts`` returns for the generated sequence.
    """
    def _marker_id(marker):
        # Only the final id of the tokenized marker is kept
        # (presumably to drop a prepended special token - TODO confirm).
        return tokenizer(marker, return_tensors="pt")["input_ids"][:, -1:].to(device)

    prompt_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(device)
    soa = _marker_id(start_audio_token)
    eoa = _marker_id(end_audio_token)

    # Prompt = text tokens + start-of-audio marker; every position is attended to.
    model_input = torch.cat([prompt_ids, soa], dim=1)
    generation_kwargs = dict(
        attention_mask=torch.ones(model_input.size(), device=device),
        max_new_tokens=max_seq_length,
        top_k=top_k,
        do_sample=True,
        temperature=0.1,
        repetition_penalty=1.1,
        length_penalty=1.2,
        no_repeat_ngram_size=3,
    )
    generated = model.generate(model_input, **generation_kwargs)

    return decode_tts(generated[0], quantizer, 3, len(tokenizer), soa, eoa)


def infer_audio_to_text(audio_path, model, tokenizer, quantizer_speech, quantizer_wav, max_seq_length=1024, top_k=20):
    """Transcribe the audio file at ``audio_path`` with the language model.

    The waveform is resampled to 16 kHz for ``quantizer_speech`` and to 24 kHz
    for ``quantizer_wav``. Both code streams are shifted past the text
    vocabulary, wrapped in start/end-of-audio markers and used as the prompt.

    Reads the module-level ``device``, ``start_audio_token`` and
    ``end_audio_token``.

    NOTE: a later ``def infer_audio_to_text`` in this same cell rebinds the name.

    Returns:
        The decoded text (special tokens skipped).
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    audio_16k, audio_24k = resample(waveform, sample_rate)

    vocab_size = len(tokenizer)

    # Semantic codes: shift past the text vocabulary, keep the first entry
    # along dim 0 and flatten to a single row.
    semantic_ids = quantizer_speech.encode(audio_16k.reshape(1, 1, -1)) + vocab_size
    semantic_ids = semantic_ids[:1].view(1, -1)

    # Acoustic codes: shifted by a further 1024 so they follow the semantic range.
    _, wav_codes = quantizer_wav.encode_infer(audio_24k, bandwidth_id=torch.tensor([0]))
    acoustic_ids = (wav_codes + vocab_size + 1024).view(1, -1)

    def _marker_id(marker):
        return tokenizer(marker, return_tensors="pt")["input_ids"][:, -1:].to(device)

    soa = _marker_id(start_audio_token)
    eoa = _marker_id(end_audio_token)

    # Prompt layout: <start_of_audio> semantic... acoustic... <end_of_audio>
    prompt = torch.cat([soa, semantic_ids, acoustic_ids, eoa], dim=1)

    generated = model.generate(
        prompt,
        attention_mask=torch.ones(prompt.size(), device=device),
        max_new_tokens=max_seq_length,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        top_k=top_k,
    )

    # Keep only ids below the start-of-audio marker id before decoding.
    sequence = generated.cpu()[0]
    first_audio_id = tokenizer(start_audio_token)["input_ids"][-1]
    text_ids = sequence[sequence < first_audio_id]

    return tokenizer.decode(text_ids, skip_special_tokens=True)


# Device that inputs, quantizers and intermediate tensors are moved to.
device = "cuda"

# Codebook counts for the TTS and ASR directions.
n_codebooks_tts = 3
n_codebooks_asr = 1

# Marker tokens looked up in the tokenizer by the inference functions.
start_audio_token = "<|start_of_audio|>"
end_audio_token = "<|end_of_audio|>"
end_sequence_token = "<|end_of_text|>"

base_model = "Vikhrmodels/salt-asr_speech_1_wav_1_tts_speech_3_instruct-8k"


# SpeechTokenizer: loaded from a local config + checkpoint, switched to eval mode.
quantizer_speech = (
    SpeechTokenizer.load_from_checkpoint(
        "audiotokenizer/speechtokenizer_hubert_avg_config.json",
        "audiotokenizer/SpeechTokenizer.pt",
    )
    .eval()
    .to(device)
)
codebook_size = quantizer_speech.quantizer.bins

# WavTokenizer: loaded from a local config + checkpoint.
quantizer_wav = WavTokenizer.from_pretrained0802(
    "WavTokenizer/configs/wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
    "audiotokenizer/WavTokenizer_small_600_24k_4096.ckpt",
).to(device)


# Tokenizer and language model share the same checkpoint and local cache directory.
tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir=".")

model_load_kwargs = dict(
    cache_dir=".",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
    device_map={"": 0},  # place the whole model on GPU index 0
)
model = AutoModelForCausalLM.from_pretrained(base_model, **model_load_kwargs)


import gradio as gr
import torch

# Inference functions used by the Gradio UI
def infer_text_to_audio(text, prompt, top_k=20, top_p=0.8, temperature=1):
    """Synthesize speech for ``text`` using the voice description in ``prompt``.

    Builds the instruction ``Say '<TEXT>' <prompt>``, appends the start-of-audio
    marker, samples audio tokens from the module-level ``model`` and decodes
    them with ``quantizer_speech``.

    Args:
        text: Text to be spoken; it is upper-cased before being inserted.
        prompt: Free-form voice instructions appended after the quoted text.
        top_k: Top-k sampling cut-off (cast to ``int``).
        top_p: Nucleus sampling threshold.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.

    Returns:
        Path of a newly created ``.wav`` file containing the generated audio.
    """
    import tempfile

    max_seq_length = 1024
    formatted_text = f"Say '{text.upper()}' {prompt}"

    # Sampling in ``generate`` rejects a temperature of 0, which the UI slider
    # can deliver; clamp to a small positive value instead of failing.
    temperature = max(float(temperature), 0.01)
    top_k = int(top_k)

    # Tokenize the instruction and the audio marker tokens.
    text_tokenized = tokenizer(formatted_text, return_tensors="pt")
    text_input_tokens = text_tokenized["input_ids"].to(device)
    soa = tokenizer(start_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)
    eoa = tokenizer(end_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)
    text_tokens = torch.cat([text_input_tokens, soa], dim=1)
    attention_mask = torch.ones(text_tokens.size(), device=device)

    # ``decode_tts`` cuts the sequence at the first end-of-audio token, so stop
    # generating there instead of always producing ``max_new_tokens`` tokens.
    eoa_id = int(eoa.item())

    output_audio_tokens = model.generate(
        text_tokens,
        attention_mask=attention_mask,
        max_new_tokens=max_seq_length,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        temperature=temperature,
        no_repeat_ngram_size=3,
        repetition_penalty=1.5,
        eos_token_id=eoa_id,
        pad_token_id=eoa_id,
    )

    audio_signal = decode_tts(
        output_audio_tokens[0], quantizer_speech, n_codebooks_tts, len(tokenizer), soa, eoa
    )

    # One file per request: a single shared "output_audio.wav" is overwritten
    # when several users of the shared demo generate audio at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    audio_signal.write(output_file)
    return output_file

def infer_audio_to_text(audio_path, max_seq_length=1024, top_k=200):
    """Transcribe the audio file at ``audio_path`` with the module-level model.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.
        max_seq_length: Maximum number of new tokens to generate.
        top_k: Top-k sampling cut-off.

    Returns:
        The decoded text (special tokens skipped).
    """
    audio_data, sample_rate = torchaudio.load(audio_path)

    # Fix: the raw waveform used to be fed to both quantizers at whatever rate
    # the file happened to have (``sample_rate`` was ignored). Resample as the
    # earlier version of this function does: 16 kHz for the speech quantizer,
    # 24 kHz for the wav quantizer.
    audio_16k, audio_24k = resample(audio_data, sample_rate)
    bandwidth_id = torch.tensor([0])

    # Semantic codes, shifted past the text vocabulary; keep the first entry
    # along dim 0 and flatten to one row.
    codes_semantics = quantizer_speech.encode(audio_16k.reshape(1, 1, -1))
    raw_semantic_tokens = codes_semantics + len(tokenizer)
    raw_semantic_tokens = raw_semantic_tokens[:1].view(1, -1)

    # Acoustic codes, shifted a further 1024 ids so they follow the semantic range.
    _, codes = quantizer_wav.encode_infer(audio_24k, bandwidth_id=bandwidth_id)
    raw_acoustic_tokens = codes + len(tokenizer) + 1024
    raw_acoustic_tokens = raw_acoustic_tokens.view(1, -1)

    audio_tokens = torch.cat([raw_semantic_tokens, raw_acoustic_tokens], dim=1)
    soa = tokenizer(start_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)
    eoa = tokenizer(end_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)

    # Prompt layout: <start_of_audio> semantic... acoustic... <end_of_audio>
    tokens = torch.cat([soa, audio_tokens, eoa], dim=1)
    attention_mask = torch.ones(tokens.size(), device=device)

    output_text_tokens = model.generate(
        tokens,
        attention_mask=attention_mask,
        max_new_tokens=max_seq_length,
        do_sample=True,
        temperature=0.5,
        top_k=top_k,
    )

    # Keep only ids below the start-of-audio marker id before decoding.
    output_text_tokens = output_text_tokens.cpu()[0]
    output_text_tokens = output_text_tokens[output_text_tokens < tokenizer(start_audio_token)["input_ids"][-1]]
    decoded_text = tokenizer.decode(output_text_tokens, skip_special_tokens=True)
    return decoded_text

# Gradio interface: text-to-audio controls on top, audio-to-text below.
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Audio and Audio-to-Text Conversion")

    with gr.Row():
        text_input = gr.Textbox(label="Text to Say", placeholder="Enter the text to be spoken, e.g., 'Hello everyone'")
        prompt_input = gr.Textbox(
            label="Voice Instructions",
            placeholder=(
                "with a female voice: lively, expressive, with a playful and energetic tone. The voice should be dynamic and slightly high-pitched, conveying excitement and charm. Ensure the recording is clear and crisp, with minimal background noise."
            ),
        )
        audio_output = gr.Audio(label="Generated Audio", type="filepath")

    with gr.Row():
        # Sliders controlling the sampling parameters.
        top_k_slider = gr.Slider(1, 200, value=20, step=1, label="Top-k")
        top_p_slider = gr.Slider(0.0, 1.0, value=0.8, step=0.01, label="Top-p")
        # The minimum is 0.01 rather than 0.0: sampling requires a strictly
        # positive temperature, so 0 would make the generation call fail.
        temperature_slider = gr.Slider(0.01, 2.0, value=1.0, step=0.01, label="Temperature")

    with gr.Row():
        gr.Markdown("### Generate Audio from Text and Instructions")
        text_to_audio_button = gr.Button("Generate Audio")
        text_to_audio_button.click(
            fn=infer_text_to_audio,
            inputs=[text_input, prompt_input, top_k_slider, top_p_slider, temperature_slider],
            outputs=audio_output
        )

    with gr.Row():
        audio_input = gr.Audio(label="Input Audio for Text Generation", type="filepath")
        text_output = gr.Textbox(label="Generated Text from Audio")

    with gr.Row():
        gr.Markdown("### Generate Text from Audio")
        audio_to_text_button = gr.Button("Generate Text")
        audio_to_text_button.click(
            fn=infer_audio_to_text,
            inputs=[audio_input],
            outputs=text_output
        )

# share=True also publishes the app on a temporary public gradio.live URL.
demo.launch(share=True)

  from .autonotebook import tqdm as notebook_tqdm
  WeightNorm.apply(module, name, dim)
  params = torch.load(ckpt_path, map_location='cpu')


making attention of type 'vanilla' with 768 in_channels


  state_dict_raw = torch.load(model_path, map_location="cpu")['state_dict']
Downloading shards: 100%|██████████| 2/2 [02:34<00:00, 77.33s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://b23b00ddf17cbda4da.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [2]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         234G  185G   38G  84% /
tmpfs            64M     0   64M   0% /dev
tmpfs           126G     0  126G   0% /sys/fs/cgroup
shm             8.0G  4.0K  8.0G   1% /dev/shm
/dev/fioa1      2.9T  2.8T     0 100% /app
/dev/sda1        15T   14T  1.3T  92% /mnt/storage
/dev/sdb2       234G  185G   38G  84% /etc/hosts
tmpfs           126G   12K  126G   1% /proc/driver/nvidia
udev            126G     0  126G   0% /dev/nvidia0
tmpfs           126G     0  126G   0% /proc/acpi
tmpfs           126G     0  126G   0% /proc/scsi
tmpfs           126G     0  126G   0% /sys/firmware
tmpfs           126G     0  126G   0% /sys/devices/virtual/powercap


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://29bface89ef8bdf109.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [167]:
import gradio as gr
import torch

# Inference functions used by the Gradio UI
def infer_text_to_audio(formatted_text, max_seq_length=1024, top_k=20, top_p=0.8, temperature=1):
    """Synthesize speech from an already formatted instruction.

    Appends the start-of-audio marker to the tokenized instruction, samples
    audio tokens from the module-level ``model`` and decodes them with
    ``quantizer_speech``.

    Args:
        formatted_text: Complete instruction, e.g. ``Say 'HELLO' with ...``.
        max_seq_length: Maximum number of new tokens to generate.
        top_k: Top-k sampling cut-off (cast to ``int``).
        top_p: Nucleus sampling threshold.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.

    Returns:
        Path of a newly created ``.wav`` file containing the generated audio.
    """
    import tempfile

    # Sampling in ``generate`` rejects a temperature of 0; clamp to a small
    # positive value instead of failing.
    temperature = max(float(temperature), 0.01)
    top_k = int(top_k)

    # Tokenize the instruction and the audio marker tokens.
    text_tokenized = tokenizer(formatted_text, return_tensors="pt")
    text_input_tokens = text_tokenized["input_ids"].to(device)
    soa = tokenizer(start_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)
    eoa = tokenizer(end_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)
    text_tokens = torch.cat([text_input_tokens, soa], dim=1)
    attention_mask = torch.ones(text_tokens.size(), device=device)

    # ``decode_tts`` cuts the sequence at the first end-of-audio token, so stop
    # generating there instead of always producing ``max_new_tokens`` tokens.
    eoa_id = int(eoa.item())

    output_audio_tokens = model.generate(
        text_tokens,
        attention_mask=attention_mask,
        max_new_tokens=max_seq_length,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        temperature=temperature,
        no_repeat_ngram_size=3,
        eos_token_id=eoa_id,
        pad_token_id=eoa_id,
    )

    audio_signal = decode_tts(
        output_audio_tokens[0], quantizer_speech, n_codebooks_tts, len(tokenizer), soa, eoa
    )

    # One file per request: a single shared "output_audio.wav" is overwritten
    # when several users of the shared demo generate audio at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    audio_signal.write(output_file)
    return output_file

def infer_audio_to_text(audio_path, max_seq_length=1024, top_k=200):
    """Transcribe the audio file at ``audio_path`` with the module-level model.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.
        max_seq_length: Maximum number of new tokens to generate.
        top_k: Top-k sampling cut-off.

    Returns:
        The decoded text (special tokens skipped).
    """
    audio_data, sample_rate = torchaudio.load(audio_path)

    # Fix: the raw waveform used to be fed to both quantizers at whatever rate
    # the file happened to have (``sample_rate`` was ignored). Use ``resample``
    # to obtain a 16 kHz copy for the speech quantizer and a 24 kHz copy for
    # the wav quantizer.
    audio_16k, audio_24k = resample(audio_data, sample_rate)
    bandwidth_id = torch.tensor([0])

    # Semantic codes, shifted past the text vocabulary; keep the first entry
    # along dim 0 and flatten to one row.
    codes_semantics = quantizer_speech.encode(audio_16k.reshape(1, 1, -1))
    raw_semantic_tokens = codes_semantics + len(tokenizer)
    raw_semantic_tokens = raw_semantic_tokens[:1].view(1, -1)

    # Acoustic codes, shifted a further 1024 ids so they follow the semantic range.
    _, codes = quantizer_wav.encode_infer(audio_24k, bandwidth_id=bandwidth_id)
    raw_acoustic_tokens = codes + len(tokenizer) + 1024
    raw_acoustic_tokens = raw_acoustic_tokens.view(1, -1)

    audio_tokens = torch.cat([raw_semantic_tokens, raw_acoustic_tokens], dim=1)
    soa = tokenizer(start_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)
    eoa = tokenizer(end_audio_token, return_tensors="pt")["input_ids"][:, -1:].to(device)

    # Prompt layout: <start_of_audio> semantic... acoustic... <end_of_audio>
    tokens = torch.cat([soa, audio_tokens, eoa], dim=1)
    attention_mask = torch.ones(tokens.size(), device=device)

    output_text_tokens = model.generate(
        tokens,
        attention_mask=attention_mask,
        max_new_tokens=max_seq_length,
        do_sample=True,
        temperature=0.01,
        top_k=top_k,
    )

    # Keep only ids below the start-of-audio marker id before decoding.
    output_text_tokens = output_text_tokens.cpu()[0]
    output_text_tokens = output_text_tokens[output_text_tokens < tokenizer(start_audio_token)["input_ids"][-1]]
    decoded_text = tokenizer.decode(output_text_tokens, skip_special_tokens=True)
    return decoded_text

# Gradio interface: one formatted-text box for TTS, one audio input for ASR.
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Audio and Audio-to-Text Conversion")

    # --- Text -> audio -------------------------------------------------
    with gr.Row():
        formatted_text_input = gr.Textbox(
            placeholder="\n".join(
                [
                    "Examples:",
                    "1. Say 'HELLO EVERYONE' with a cheerful and energetic voice. The tone should be friendly and welcoming.",
                    "2. Say 'WELCOME TO THE PARTY' with an enthusiastic and lively tone, like an invitation to a fun event.",
                    "3. Say 'GOOD MORNING' with a calm and soothing female voice, warm and reassuring.",
                    "4. Say 'LET'S LEARN TOGETHER' with a confident and clear voice for an educational setting.",
                    "5. Say 'THIS IS A TEST OF THE EMERGENCY SYSTEM' with a serious and authoritative tone.",
                ]
            ),
            label="Input Formatted Text for Audio Generation",
        )
        audio_output = gr.Audio(type="filepath", label="Generated Audio")

    with gr.Row():
        gr.Markdown("### Generate Audio from Formatted Text")
        text_to_audio_button = gr.Button("Generate Audio")
        # Only the text box is wired in, so the function's other parameters
        # keep their default values.
        text_to_audio_button.click(
            infer_text_to_audio,
            inputs=[formatted_text_input],
            outputs=audio_output,
        )

    # --- Audio -> text -------------------------------------------------
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Input Audio for Text Generation")
        text_output = gr.Textbox(label="Generated Text from Audio")

    with gr.Row():
        gr.Markdown("### Generate Text from Audio")
        audio_to_text_button = gr.Button("Generate Text")
        audio_to_text_button.click(
            infer_audio_to_text,
            inputs=[audio_input],
            outputs=text_output,
        )

# share=True also publishes the app on a temporary public gradio.live URL.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://a53a64c8e6b2da3fc8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [168]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(133379, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [170]:
!git clone https://github.com/ggerganov/llama.cpp.git

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning into 'llama.cpp'...
remote: Enumerating objects: 37429, done.[K
remote: Counting objects: 100% (9303/9303), done.[K
remote: Compressing objects: 100% (876/876), done.[K
remote: Total 37429 (delta 8860), reused 8687 (delta 8420), pack-reused 28126 (from 1)[K
Receiving objects: 100% (37429/37429), 60.56 MiB | 24.89 MiB/s, done.
Resolving deltas: 100% (27289/27289), done.


In [171]:
!pip install -r llama.cpp/requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu
Collecting torch~=2.2.1 (from -r llama.cpp/./requirements/requirements-convert_hf_to_gguf.txt (line 3))
  Downloading https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp312-cp312-linux_x86_64.whl (186.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m186.7/186.7 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.4.0
    Uninstalling torch-2.4.0:
      Successfully uninstalled torch-2.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xformers 0.0.27.post2 requires torch==2.4.0, but you have torch 2.2.2+cpu which is inco

In [178]:
!python3 llama.cpp/convert_hf_to_gguf.py -h

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


usage: convert_hf_to_gguf.py [-h] [--vocab-only] [--outfile OUTFILE]
                             [--outtype {f32,f16,bf16,q8_0,tq1_0,tq2_0,auto}]
                             [--bigendian] [--use-temp-file] [--no-lazy]
                             [--model-name MODEL_NAME] [--verbose]
                             [--split-max-tensors SPLIT_MAX_TENSORS]
                             [--split-max-size SPLIT_MAX_SIZE] [--dry-run]
                             [--no-tensor-first-split] [--metadata METADATA]
                             model

Convert a huggingface model to a GGML compatible file

positional arguments:
  model                 directory containing model file

options:
  -h, --help            show this help message and exit
  --vocab-only          extract only the vocab
  --outfile OUTFILE     path to write to; default: based on input. {ftype}
                        will be replaced by the outtype.
  --outtype {f32,f16,bf16,q8_0,tq1_0,tq2_0,auto}
                        outpu

In [182]:
!python llama.cpp/convert_hf_to_gguf.py salt_a1 \
  --outfile salt-asr_speech_1_wav_1_tts_speech_3.gguf \
  --outtype q8_0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:hf-to-gguf:Loading model: salt_a1
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> Q8_0, shape = {3072, 133379}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> Q8_0, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> Q8_0, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> Q8_0, shape = {3072, 1024}

In [186]:
import os

from huggingface_hub import HfApi

# SECURITY: an access token was previously pasted into this cell as a string
# literal. Anything committed or shared with the notebook must be treated as
# leaked - revoke that token and supply a new one through the environment.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face token with write access."
    )

# Pass the token to the client so that *both* calls below authenticate with it;
# before, only upload_file received it and create_repo ran with whatever
# credentials happened to be cached locally.
api = HfApi(token=hf_token)

model_id = "Vikhrmodels/salt_asr_speech_1_wav_1_tts_speech_3_gguf"
api.create_repo(model_id, exist_ok=True, repo_type="model")
api.upload_file(
    path_or_fileobj="salt-asr_speech_1_wav_1_tts_speech_3.gguf",
    path_in_repo="salt-asr_speech_1_wav_1_tts_speech_3.gguf",
    repo_id=model_id,
)

HfHubHTTPError: (Request ID: Root=1-67350a88-41549e911d83527e4af124e7;9edd5758-b40a-4194-a771-695cae7cc589)

403 Forbidden: You don't have the rights to create a model under the namespace "Vikhrmodels".
Cannot access content at: https://huggingface.co/api/repos/create.
Make sure your token has the correct permissions.