In [1]:
# !pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q datasets loralib sentencepiece
# !pip install -q gradio
# !pip install -q ttsmms

In [2]:
# For TTS

!curl https://dl.fbaipublicfiles.com/mms/tts/ory.tar.gz --output ory.tar.gz #update lang
!mkdir -p data && tar -xzf ory.tar.gz -C data/ #update langcode

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  383M  100  383M    0     0   222M      0  0:00:01  0:00:01 --:--:--  222M


In [3]:
from ttsmms import TTS

tts=TTS("data/ory") #update lang code

In [4]:
wav=tts.synthesis("ମୁଁ ଅଲିଭ୍ ଏକ ଚାଟ୍ବଟ୍ ଆସିଷ୍ଟାଣ୍ଟ")

In [5]:
from IPython.display import Audio
Audio(wav["x"], rate=wav["sampling_rate"])

In [6]:
import tempfile
from IPython.display import Audio


def tts_system(text: str):
    
    print(text)
    wavs = tts.synthesis(text)
    # return output
    af = Audio(wavs["x"], rate=wavs["sampling_rate"])
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        fp.write(af.data)
        return fp.name

In [7]:
# import torch
from peft import PeftModel
# from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig


import torch
# from peft import PeftModel
import transformers
import gradio as gr

assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, BitsAndBytesConfig


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [None]:
model_id = "OdiaGenAI/odiagenAI_llama7b_base_v1"
device = "cuda:0"


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")   # 4 bit model

def generate_prompt(instruction, input=None):
    if input:
        return f"""### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"""
    else:
        return f"""### Instruction:\n{instruction}\n\n### Response:\n"""


model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)


def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt")
    print(inputs)
    input_ids = inputs["input_ids"].to(device)
    print(input_ids)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
        # print(generation_output)
    s = generation_output.sequences[0]
    print(s)
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(output)

    text = output.split("### Response:")[1].strip()
    audio_wav = tts_system(text)
    return text, audio_wav
    # return output.split("### Response:")[1].strip()

input_text = gr.components.Textbox(
            lines=2, label="Instruction", placeholder="Tell me about alpacas."
        )
# tts_button = gr.Button("Change to voice", elem_id="send-btn", visible=True)

output_audio = gr.outputs.Audio(label="Output", type="filepath")

g = gr.Interface(
    fn=evaluate,
    inputs=[
        input_text,
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=512, step=1, value=128, label="Max tokens"
        ),
    ],
    outputs=[
        gr.inputs.Textbox(
            lines=5,
            label="Output",
        ),
        output_audio
    ],
    title="🌊🐢 OdiaGenAI-4-bit",
    description="OdiaGenAI-4-bit is a 7B-parameter LLaMA model finetuned to follow Odia instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) Odia translated dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://www.odiagenai.org/).",
)



g.queue(concurrency_count=2)
g.launch(debug=True)

Loading checkpoint shards:   0%|          | 0/39 [00:00<?, ?it/s]

  super().__init__(
  super().__init__(


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://7900abb759ecf3c534.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          227,   175,   152, 29871,   227,   175,   187,   227,   175,   133,
          227,   175,   154,   227,   176,   144,   227,   175,   179,   227,
          175,   188, 29871,   227,   175,   152,   227,   175,   179,   227,
          176,   132,   227,   175,   168,   227,   175,   194,   227,   175,
          175,   227,   175], device='cuda:0')
 ⁇  ### Instruction:
What is the name of the main library at Notre Dame?

### Input:
The library system of the university is divided between the main library and each of the colleges and schools. The main building is the 14-story Theodore M. Hesburgh Library, completed in 1963, which is the third building to house the main collection of books. The front of the library is adorned with the Word of Life mural designed by artist Millard Sheets. This mural is popularly known as "Touchdown Jesus" because of its proximity to Notre Dame Stadium and Jesus' arms appearing to make