In [None]:
!pip install snac unsloth ipython
!pip install soundfile

In [2]:
import os

# Prefer the HF_TOKEN environment variable so a real token never has to be
# saved inside the notebook. The literal placeholder is kept as the fallback
# and must still be replaced (or the variable set) before the token is used.
HF_TOKEN = os.environ.get("HF_TOKEN", "HF_TOKEN")  # Replace with your Hugging Face token

# Text Pipeline (image to spatial description)

In [None]:
from unsloth import FastLanguageModel
import torch
from transformers import DetrImageProcessor, DetrForObjectDetection, DPTFeatureExtractor, DPTForDepthEstimation
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import requests
import numpy as np

# Pick the accelerator first; every locally loaded vision model below is moved onto it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Gradio client used by spatial_pipeline for the free-text image caption.
client = Client("vikhyatk/moondream2")

# Object detector: processor and model share one checkpoint id.
_DETR_CHECKPOINT = "facebook/detr-resnet-50"
detr_processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT)
detr_model = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT)
detr_model = detr_model.to(device)

# Depth estimator: processor and model share one checkpoint id.
_DEPTH_CHECKPOINT = "Intel/dpt-large"
depth_processor = DPTFeatureExtractor.from_pretrained(_DEPTH_CHECKPOINT)
depth_model = DPTForDepthEstimation.from_pretrained(_DEPTH_CHECKPOINT)
depth_model = depth_model.to(device)

# Language model that turns the caption + object matrix into a summary.
llama_model, llama_tokenizer = FastLanguageModel.from_pretrained(
    model_name="AquaLabs/Spatial-Llama-1B",
    max_seq_length=1024,
)
# Reuse the end-of-sequence token as the padding token.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
FastLanguageModel.for_inference(llama_model)

# System message sent with every spatial_pipeline request. Written as adjacent
# literals (one sentence or clause per line); the resulting string has no newlines.
SYSTEM_PROMPT = (
    "You are a visual understanding and interpretation assistant. "
    "You will receive an input consisting of a natural language description of an image "
    "along with a grid-based object detection matrix, which contains object names, counts, "
    "and their spatial positions with (x, y, z) format. "
    "Your task is to give information and answer questions about places. "
    "Summarize the image in a few sentences."
)

def spatial_pipeline(image):
    """Summarise an image from detections, depth and a generated caption.

    Steps:
      1. Run the DETR detector and keep detections scoring above 0.9.
      2. Run the DPT depth model and resize its prediction to the image size.
      3. Bucket each detection into a 2 x 3 grid by box centre and attach the
         depth value sampled at that centre as ``z``.
      4. Ask the remote captioning client to describe the image.
      5. Feed caption + grid text to the Llama model and return its answer.

    Args:
        image: PIL image (``.size`` is read as ``(width, height)``).

    Returns:
        str: the decoded model response, without the prompt tokens.

    Relies on the notebook-level ``detr_processor``, ``detr_model``,
    ``depth_processor``, ``depth_model``, ``client``, ``llama_model``,
    ``llama_tokenizer``, ``device`` and ``SYSTEM_PROMPT``.
    """
    import os
    import tempfile

    width, height = image.size

    # --- Object detection -------------------------------------------------
    detr_inputs = detr_processor(images=image, return_tensors="pt")
    detr_inputs = {k: v.to(device) for k, v in detr_inputs.items()}
    # Inference only: skip autograd bookkeeping, as the depth pass already does.
    with torch.no_grad():
        detr_outputs = detr_model(**detr_inputs)
    target_sizes = torch.tensor([image.size[::-1]]).to(device)
    results = detr_processor.post_process_object_detection(detr_outputs, target_sizes=target_sizes, threshold=0.9)[0]

    # --- Depth estimation -------------------------------------------------
    depth_inputs = depth_processor(images=image, return_tensors="pt")
    depth_inputs = {k: v.to(device) for k, v in depth_inputs.items()}
    with torch.no_grad():
        depth_output = depth_model(**depth_inputs)
        depth = depth_output.predicted_depth
        # Resize to (height, width) so the map can be indexed with pixel coordinates.
        depth = torch.nn.functional.interpolate(
            depth.unsqueeze(1),
            size=image.size[::-1],
            mode="bicubic",
            align_corners=False,
        ).squeeze().cpu().numpy()

    # --- Grid assignment --------------------------------------------------
    grid_rows, grid_cols = 2, 3
    # max(1, ...) guards against a zero cell size on images smaller than the grid.
    cell_width, cell_height = max(1, width // grid_cols), max(1, height // grid_rows)
    object_matrix = [[[] for _ in range(grid_cols)] for _ in range(grid_rows)]

    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(i, 2) for i in box.tolist()]
        center_x, center_y = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
        # Clamp the centre to valid pixel coordinates: a box may extend to or
        # past the image border, which would index outside the depth map.
        x_center = min(max(int(center_x), 0), width - 1)
        y_center = min(max(int(center_y), 0), height - 1)
        # Floor division leaves a remainder strip when the image size is not a
        # multiple of the grid; fold that strip into the last column/row.
        grid_x = min(x_center // cell_width, grid_cols - 1)
        grid_y = min(y_center // cell_height, grid_rows - 1)
        class_name = detr_model.config.id2label[label.item()]
        z = round(float(depth[y_center, x_center]), 2)

        object_matrix[grid_y][grid_x].append({
            "class_name": class_name,
            "score": round(score.item(), 2),
            "box": box,
            "position": {"x": x_center, "y": y_center, "z": z}
        })

    matrix_output = "\nObject Vector Matrix by Region:\n"
    for i in range(grid_rows):
        row_str = "| "
        for j in range(grid_cols):
            objects = object_matrix[i][j]
            obj_vector = [f"{obj['class_name']}:{obj['score']}, Pos({obj['position']['x']}, {obj['position']['y']}, {obj['position']['z']})" for obj in objects]
            row_str += f"{obj_vector} | "
        matrix_output += row_str + "\n"

    # --- Caption ----------------------------------------------------------
    # Caption the image that was actually passed in, not whatever file happens
    # to be on disk: write it to a throwaway JPEG and upload that.
    with tempfile.TemporaryDirectory() as tmp_dir:
        caption_path = os.path.join(tmp_dir, "image.jpeg")
        # JPEG cannot store alpha/palette modes, so normalise to RGB first.
        image.convert("RGB").save(caption_path, format="JPEG")
        result = client.predict(img=handle_file(caption_path), prompt="Describe this image.", api_name="/answer_question")

    final_prompt = f"Image Description: {result}\n\nThis grid-based object detection matrix represents detected objects in different regions of the image. {matrix_output}"
    print("\nFinal Model Prompt:\n", final_prompt)

    # --- Language model ---------------------------------------------------
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": final_prompt}
    ]

    input_ids = llama_tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    ).to(device)

    generated = llama_model.generate(
        input_ids=input_ids,
        # Single unpadded sequence: every position is a real token. Passing the
        # mask explicitly matters because pad and eos share one token id.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=1024,
        pad_token_id=llama_tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )

    # The generated sequence starts with the prompt; decode only what follows it.
    response = llama_tokenizer.decode(generated[0][input_ids.shape[1]:], skip_special_tokens=True)
    return response

In [None]:
from io import BytesIO

url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRzpyNtHiuQ7oC9CbRuIXcqFJp8kcjz_CT8pA&s" # An example image, change it with your own image

# Fetch the whole body with a timeout and fail loudly on HTTP errors, so an
# error page is never handed to PIL. `.content` is used instead of `.raw`
# because `.raw` skips requests' content decoding.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Normalise to RGB so the JPEG save below also works for palette/alpha images.
image = Image.open(BytesIO(response.content)).convert("RGB")
# Keep an on-disk JPEG copy of the input next to the notebook.
image.save("image.jpeg")
spatial_response = spatial_pipeline(image)

# Text-to-Speech (TTS)

In [None]:
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display
import numpy as np
import requests
from transformers import BitsAndBytesConfig
import soundfile as sf
import warnings
# Blanket filter: no category or module is given, so every Python warning
# raised after this point is suppressed for the rest of the session.
warnings.filterwarnings("ignore")

# Token-id layout. Ids at or above TOKENISER_LENGTH are written as offsets
# from it; the resulting integer is noted next to each definition.
TOKENISER_LENGTH = 128256

# Text delimiters (ids below TOKENISER_LENGTH).
START_OF_TEXT = 128000
END_OF_TEXT = 128009

# Conversation-role markers: offsets 3..6 -> 128259, 128260, 128261, 128262.
START_OF_HUMAN, END_OF_HUMAN, START_OF_AI, END_OF_AI = (
    TOKENISER_LENGTH + offset for offset in range(3, 7)
)

# Ids used when building the TTS prompt and post-processing its output.
GEN_START_TOKEN = TOKENISER_LENGTH + 3       # 128259 (same value as START_OF_HUMAN)
GEN_EOS_TOKEN = TOKENISER_LENGTH + 2         # 128258
GEN_END_EXTRA_TOKEN = TOKENISER_LENGTH + 4   # 128260 (same value as END_OF_HUMAN)
GEN_REMOVE_TOKEN = TOKENISER_LENGTH + 2      # 128258 (same value as GEN_EOS_TOKEN)
CODE_OFFSET = TOKENISER_LENGTH + 10          # 128266

def load_models():
    """Load the speech language model, its tokenizer and the SNAC decoder.

    Returns:
        tuple: ``(model, tokenizer, snac_model, device)`` where ``device`` is
        the string ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise,
        and ``snac_model`` has already been moved onto that device.

    Reads the notebook-level ``HF_TOKEN`` for both downloads.
    """
    # 4-bit NF4 quantisation with double quantisation, computing in float16.
    four_bit_settings = dict(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
    )
    bnb_config = BitsAndBytesConfig(**four_bit_settings)

    speech_model, speech_tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        quantization_config=bnb_config,
        max_seq_length=4096,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(speech_model)

    codec = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    codec = codec.to(target_device)

    return speech_model, speech_tokenizer, codec, target_device

def redistribute_codes(code_list, snac_model, device):
    """Split a flat stream of 7-code frames into three layers and decode it.

    Each consecutive group of 7 codes is one frame. Slot ``k`` of a frame
    carries an offset of ``k * 4096``, which is removed before the code is
    routed to its layer:

        slot 0          -> layer 1 (1 code per frame)
        slots 1, 4      -> layer 2 (2 codes per frame)
        slots 2, 3, 5, 6 -> layer 3 (4 codes per frame)

    A trailing partial frame is ignored. Frames containing a code outside
    ``[0, 4096)`` after offset removal are skipped rather than passed to the
    decoder as invalid indices.
    (Assumes each decoder codebook has 4096 entries, matching the per-slot
    offset used here -- TODO confirm against the SNAC checkpoint config.)

    Args:
        code_list: flat list of integer codes, already shifted so slot 0
            starts at 0.
        snac_model: object exposing ``decode(codes)``.
        device: device the code tensors are moved to before decoding.

    Returns:
        Whatever ``snac_model.decode`` returns for the three code tensors.

    Raises:
        ValueError: if no complete, valid frame is present.
    """
    codebook_size = 4096
    # Layer index (0-based) that each of the 7 slots in a frame feeds.
    slot_to_layer = (0, 1, 2, 2, 1, 2, 2)
    layers = ([], [], [])

    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        frame = [code - slot * codebook_size for slot, code in enumerate(group)]
        if any(code < 0 or code >= codebook_size for code in frame):
            continue
        for layer_index, code in zip(slot_to_layer, frame):
            layers[layer_index].append(code)

    if not layers[0]:
        raise ValueError(
            "redistribute_codes: no complete, valid 7-code frame to decode "
            f"(received {len(code_list)} codes)"
        )

    # Explicit integer dtype: codes are indices, never floats.
    codes = [
        torch.tensor(layer, dtype=torch.long).unsqueeze(0).to(device)
        for layer in layers
    ]

    # Decoding is inference only; avoid building an autograd graph.
    with torch.no_grad():
        audio_waveform = snac_model.decode(codes)
    return audio_waveform

def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    """Generate an audio waveform for ``prompt``.

    The prompt is tokenised and wrapped as
    ``[GEN_START_TOKEN] + prompt ids + [END_OF_TEXT, GEN_END_EXTRA_TOKEN]``,
    the model samples a continuation, and the continuation is converted to
    audio codes and decoded by ``redistribute_codes``.

    Args:
        prompt: text to speak.
        model: language model exposing ``generate``.
        tokenizer: tokenizer matching ``model``.
        snac_model: decoder passed through to ``redistribute_codes``.
        device: device for the input tensors and the code tensors.

    Returns:
        The waveform returned by ``redistribute_codes``.

    Raises:
        ValueError: if generation yields fewer than 7 audio tokens, i.e. not
            even one complete frame.
    """
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
    prompt_length = modified_input_ids.shape[1]

    attention_mask = torch.ones_like(modified_input_ids, device=device)

    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=2400,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )

    # Keep only what follows the last occurrence of the marker token.
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        # No marker was produced. The output still begins with the prompt, so
        # drop it; otherwise text ids would be treated as audio codes.
        cropped_tensor = generated_ids[:, prompt_length:]

    # Boolean-mask indexing flattens to 1-D while dropping the removal token.
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]

    # Frames are 7 tokens long; discard an incomplete trailing frame.
    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    if new_length == 0:
        raise ValueError(
            f"tts_pipeline: generation produced {row_length} audio tokens; "
            "at least 7 are required for one frame"
        )
    trimmed_tokens = processed_tokens[:new_length]

    code_list = (trimmed_tokens - CODE_OFFSET).tolist()

    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform

In [None]:
# Load the TTS model, tokenizer and SNAC decoder once for the cells below.
# NOTE: this rebinds the notebook-level `device` (a torch.device in the text
# pipeline cell) to the plain string returned by load_models().
model_sound, tokenizer, snac_model, device = load_models()

In [6]:
# The text to speak is the summary produced by spatial_pipeline, so the cell
# that assigns `spatial_response` must have been run before this one.
prompt = spatial_response

In [9]:
def play_and_save_audio(audio_waveform, sample_rate=24000, filename="output.wav"):
    """Play a waveform inline and write it to an audio file.

    Args:
        audio_waveform: tensor whose first axis is dropped via ``[0]``
            (presumably shaped ``(batch, channels, samples)`` as returned by
            the decoder -- TODO confirm).
        sample_rate: sampling rate passed to both the player and the writer.
        filename: destination path for the written file.
    """
    # Take the first entry as before, then squeeze singleton axes so mono
    # audio becomes a plain 1-D array of samples.
    audio_np = np.atleast_1d(np.squeeze(audio_waveform.detach().cpu().numpy()[0]))

    # IPython's Audio accepts 1-D mono or channel-first (channels, samples).
    display(Audio(audio_np, rate=sample_rate))

    # soundfile expects 1-D mono or (frames, channels). Writing channel-first
    # data unchanged would be read as one frame with `samples` channels.
    frames = audio_np.T if audio_np.ndim == 2 else audio_np
    sf.write(filename, frames, samplerate=sample_rate)

In [None]:
# Synthesise speech for the current prompt, then play it inline and save it.
audio_waveform = tts_pipeline(
    prompt=prompt,
    model=model_sound,
    tokenizer=tokenizer,
    snac_model=snac_model,
    device=device,
)
play_and_save_audio(audio_waveform=audio_waveform, filename="my_voice.wav")