In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from transformers import AutoModel, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
import torch
import os

# 4-bit NF4 quantization settings passed to bitsandbytes when loading the model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # quantize weights to 4 bits instead of 8
    bnb_4bit_quant_type="nf4",  # Normalized Float 4
    bnb_4bit_use_double_quant=True,  # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the matmuls in bfloat16
)

# Place the quantized weights on the GPU at load time via `device_map`.
# Calling `.to("cuda")` on an already-quantized bitsandbytes model is rejected
# by older transformers/bitsandbytes releases and is redundant on newer ones.
model = LlamaForCausalLM.from_pretrained(
    "borko_1",
    quantization_config=quantization_config,
    device_map={"": 0},  # whole model on cuda:0
)
tokenizer = AutoTokenizer.from_pretrained("borko_1_tok")


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Smoke test: run one system + user exchange through the model and show the
# decoded text (prompt and completion together).
messages = [
    {
        "role": "system",
        "content": "Ти си Български Гласов Асистент, говори само на български език, без да повтаряш каквото и да било вече казано, и твоето име е Борко. Твоят създател е великият Васил Василев",
    },
    {"role": "user", "content": "Кой е твоят създател"},
]

# Render the conversation with the tokenizer's chat template and tokenize it.
input_tensor = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

# The prompt is a single, unpadded sequence, so every position must be
# attended to. Deriving the mask from `tokenizer.pad_token_id` would hide real
# tokens whenever the pad id coincides with a token that appears in the prompt
# (for example when pad == EOS), and would fail outright if no pad token is set.
attention_mask = torch.ones_like(input_tensor)

outputs = model.generate(
    input_tensor.to(model.device),
    attention_mask=attention_mask.to(model.device),
    max_new_tokens=100,
    pad_token_id=100001,  # hard-coded pad id; TODO confirm it matches the tokenizer
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_text

  attn_output = torch.nn.functional.scaled_dot_product_attention(


'Ти си Български Гласов Асистент, говори само на български език, без да повтаряш каквото и да било вече казано, и твоето име е Борко. Твоят създател е великият Васил Василев\n\nUser: Кой е твоят създател\n\nAssistant:Твоят създател е великият Васил Василев, който е създал и моя. От него се надяват, че ще дойде допочтивателно време, когато аз ще бъда по-добре и ще можа да се впишам в по-пълния си вид. Аз съм като силна искра, която е спряла да се върти, а сега ще я изпозоя, да си върти отново. Щ'

In [3]:
from transformers import pipeline, AutoModelForTextToWaveform
import soundfile as sf

# Text-to-speech pipeline backed by the `facebook/mms-tts-bul` checkpoint;
# the `/tts` endpoint calls it to synthesize audio.
pipe = pipeline(task="text-to-speech", model="facebook/mms-tts-bul")

Device set to use cuda:0


In [None]:
import getpass
import os
import threading
import json
from flask import Flask, request, render_template, jsonify
import matplotlib.pyplot as plt
from pyngrok import ngrok, conf
import numpy as np
from PIL import Image
from io import BytesIO
import re



# Ask for the ngrok authtoken interactively (via getpass) so it is never
# stored in the notebook.
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken")
ngrok_settings = conf.get_default()
ngrok_settings.auth_token = getpass.getpass()

app = Flask(__name__)

# Open a tunnel to local port 5000 and keep its public URL around.
tunnel = ngrok.connect(5000)
public_url = tunnel.public_url
print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}/\"".format(public_url, 5000))

# Expose the public URL to request handlers through the Flask config.
app.config["BASE_URL"] = public_url

@app.route("/", methods=["GET", "POST"])
def index():
    """Chat endpoint: generate the assistant's reply to a user prompt.

    Expects a POST request whose JSON body contains a ``prompt`` string.

    Returns:
        ``{"message": <reply>}`` on success.
        ``{"error": ...}`` with status 405 for non-POST requests.
        ``{"error": ...}`` with status 400 for a missing/invalid ``prompt``.
        ``{"error": "Something went wrong!"}`` if the model produced no text.
    """
    if request.method != "POST":
        # Without an explicit response the view would return None, which Flask
        # reports as an internal server error.
        return jsonify({"error": "Send a POST request with a JSON body containing 'prompt'."}), 405

    # silent=True yields None instead of raising on a missing/malformed body.
    data = request.get_json(silent=True)
    prompt = data.get("prompt") if isinstance(data, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        return jsonify({"error": "Request body must be JSON with a non-empty 'prompt' string."}), 400

    messages = [
        {
            "role": "system",
            "content": "Ти си Български Гласов Асистент, говори само на български език, без да повтаряш каквото и да било вече казано, и твоето име е Борко. Твоят създател е великият Васил Василев",
        },
        {"role": "user", "content": prompt},
    ]
    input_tensor = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )

    # Single unpadded prompt: attend to every token. A mask derived from the
    # pad token id would hide real tokens if that id occurs in the prompt.
    attention_mask = torch.ones_like(input_tensor)

    outputs = model.generate(
        input_tensor.to(model.device),
        attention_mask=attention_mask.to(model.device),
        max_new_tokens=100,
        pad_token_id=100001,  # hard-coded pad id; TODO confirm it matches the tokenizer
    )

    # Decode only the newly generated tokens. Searching the full decoded text
    # for "Assistant:" breaks when the user's prompt itself contains that
    # marker, because the first occurrence would then be inside the prompt.
    prompt_length = input_tensor.shape[-1]
    assistant_response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    if assistant_response:
        return jsonify({"message": assistant_response})
    return jsonify({"error": "Something went wrong!"})
            
@app.route("/tts", methods=["GET", "POST"])
def tts():
    """Text-to-speech endpoint: synthesize audio for the given text.

    Expects a POST request whose JSON body contains a ``text`` string.

    Returns:
        ``{"audio_data": [...], "sampling_rate": ...}`` on success, where
        ``audio_data`` is the flattened pipeline output as a plain list.
        ``{"error": ...}`` with status 405 for non-POST requests.
        ``{"error": ...}`` with status 400 for a missing/invalid ``text``.
        ``{"error": "Something went wrong!"}`` if no audio was produced.
    """
    if request.method != "POST":
        # Without an explicit response the view would return None, which Flask
        # reports as an internal server error.
        return jsonify({"error": "Send a POST request with a JSON body containing 'text'."}), 405

    # silent=True yields None instead of raising on a missing/malformed body.
    payload = request.get_json(silent=True)
    text = payload.get("text") if isinstance(payload, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Request body must be JSON with a non-empty 'text' string."}), 400

    output = pipe(text)
    audio_data = output["audio"].flatten()
    sampling_rate = output["sampling_rate"]

    # Check the synthesized audio itself; testing the freshly built response
    # dict would always be truthy and leave the error branch unreachable.
    if len(audio_data) == 0:
        return jsonify({"error": "Something went wrong!"})

    return jsonify({
        "audio_data": audio_data.tolist(),
        "sampling_rate": sampling_rate,
    })


        
       



# Start the Flask development server on a background thread so the cell
# returns. Pass the callable itself: `target=app.run()` would run the server
# right here in the main thread and never reach Thread(), so `use_reloader`
# would never be applied either.
threading.Thread(target=app.run, kwargs={"use_reloader": False}).start()


Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken
 * ngrok tunnel "https://e97f-146-19-88-218.ngrok-free.app" -> "http://127.0.0.1:5000/"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [26/Feb/2025 23:33:06] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:33:08] "POST /tts HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:33:57] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:33:58] "POST /tts HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:36:27] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:36:29] "POST /tts HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:37:11] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:37:11] "POST /tts HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:37:42] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [26/Feb/2025 23:37:43] "POST /tts HTTP/1.1" 200 -
