# Voice Chat Bot

In [1]:
import numpy as np
import pandas as pd
import os 

from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

import warnings

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load the neural-bridge RAG dataset with `datasets.load_dataset`.
ds = load_dataset("neural-bridge/rag-dataset-12000")

In [24]:
# Single-slot cache: remembers the most recently indexed dataset so repeated
# questions reuse its context embeddings instead of re-encoding every context.
_CONTEXT_INDEX_CACHE = {}


def _collect_contexts(dataset):
    """Return every ``context`` string from one split or from a dict of splits."""
    # A dict of splits (what the notebook indexes as dataset['test']) must be
    # flattened first: iterating it directly yields split names, not rows.
    splits = dataset.values() if isinstance(dataset, dict) else [dataset]
    return [row["context"] for split in splits for row in split]


def _unit_rows(vectors):
    """L2-normalise each row of a 2-D array; all-zero rows are left as zeros."""
    matrix = np.asarray(vectors, dtype=np.float32)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Swap zero norms for 1 so the division below never produces NaN.
    return matrix / np.where(norms == 0.0, 1.0, norms)


def get_context(question: str, dataset, threshold=0.7, embedding=None):
    """
    Find the most relevant context for a given question in the dataset.

    Relevance is the cosine similarity between the question embedding and each
    context embedding, so ``threshold`` is on a -1..1 scale where higher means
    more similar.

    Args:
        question (str): The user's question
        dataset: Either a single split (an iterable of rows that each have a
            "context" field) or a dict of such splits
        threshold (float): Minimum similarity score to consider context relevant
        embedding: Optional object exposing ``embed_documents(texts)`` and
            ``embed_query(text)``. Defaults to one shared
            ``HuggingFaceEmbeddings()`` instance.

    Returns:
        str: The most relevant context if found, otherwise None

    Note:
        Context embeddings are cached for the most recently used
        ``(dataset, embedding)`` pair, matched by object identity. Pass a new
        object if the underlying rows change.
    """
    cache = _CONTEXT_INDEX_CACHE
    if embedding is None:
        # Constructing the model is slow, so build the default once and share it.
        if "default_embedding" not in cache:
            cache["default_embedding"] = HuggingFaceEmbeddings()
        embedding = cache["default_embedding"]

    is_cached = (
        "dataset" in cache
        and cache["dataset"] is dataset
        and cache["embedding"] is embedding
    )
    if not is_cached:
        contexts = _collect_contexts(dataset)
        vectors = _unit_rows(embedding.embed_documents(contexts)) if contexts else None
        cache.update(
            dataset=dataset, embedding=embedding, contexts=contexts, vectors=vectors
        )

    contexts, vectors = cache["contexts"], cache["vectors"]
    if not contexts:
        # Nothing to search in an empty dataset.
        return None

    # Both sides are unit length, so one matrix-vector product yields the cosine
    # similarity of the question against every context.
    query = _unit_rows([embedding.embed_query(question)])[0]
    scores = vectors @ query
    best = int(np.argmax(scores))

    # Check if we found a relevant context
    if scores[best] >= threshold:
        return contexts[best]
    return None



In [2]:
# Load the same dataset under the name the exploration cells below use;
# its splits are accessed by name, e.g. dataset['test'].
dataset = load_dataset("neural-bridge/rag-dataset-12000")

In [36]:
# Peek at the question stored in the first row of the test split.
dataset['test'][0]['question']

'Who is the music director of the Quebec Symphony Orchestra?'

In [41]:
# Show the context passage stored in that same first row of the test split.
dataset['test'][0]['context']

'HOUSTON (Jan. 23, 2018) – Fabien Gabel, music director of the Quebec Symphony Orchestra, returns to Houston to lead the Houston Symphony in Ravel’s Daphnis and Chloé on Feb. 2 and 3 at 8 p.m. and Feb. 4 at 2:30 p.m. in Jones Hall.\nRecognized internationally as one of the stars of the new generation, Fabien Gabel is a regular guest of the Houston Symphony and an audience favorite. Known for conducting music with French influences, Gabel leads the Symphony in a program of French and American classics, including the breathtaking musical sunrise from Ravel’s Daphnis and Chloé and Bernstein’s comic operetta Overture to Candide as the Symphony joins other orchestras around the world for Leonard Bernstein at 100, a worldwide celebration of the composer’s 100th birthday. Also on the program is Habanera, a piece by French composer Louis Aubert.\nThe evening’s featured soloist, Colin Currie, is hailed as “the world’s finest and most daring percussionist” (Spectator). He performs regularly with

In [51]:
from langchain.docstore.document import Document

# 1. Prepare the data: collect every context from the test split, then wrap
#    each passage in a LangChain Document.
texts = [row["context"] for row in dataset['test']]
documents = [Document(page_content=passage) for passage in texts]

In [44]:
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sklearn.preprocessing import normalize

# Collect the contexts and embed them.
texts = [sample["context"] for sample in dataset['test']]
embedding_model = HuggingFaceEmbeddings()
# embed_documents is the corpus-side API and encodes the whole list in one call;
# calling embed_query once per text runs the model one passage at a time.
embeddings = embedding_model.embed_documents(texts)

# Normalise the vectors by hand so that inner product equals cosine similarity.
# The cast to float32 happens once here because that is the dtype FAISS stores.
normalized_embeddings = normalize(np.asarray(embeddings, dtype=np.float32))

# The FAISS index itself is built by hand (bypassing from_texts) in the next cell.

  embedding_model = HuggingFaceEmbeddings()


In [46]:
from langchain.docstore.in_memory import InMemoryDocstore

# 4. Build a FAISS IndexFlatIP (inner product). The vectors were L2-normalised
#    beforehand, so the inner product is their cosine similarity.
dimension = len(normalized_embeddings[0])
index = faiss.IndexFlatIP(dimension)
# FAISS stores float32 vectors, so make the dtype explicit before adding.
index.add(np.asarray(normalized_embeddings, dtype=np.float32))

# Row i of the index and documents[i] both come from the i-th test-split context,
# so the two must have the same length for the id mapping below to be valid.
assert index.ntotal == len(documents), "index rows and documents are out of sync"

# 5. Wrap the index in a LangChain vectorstore without passing texts. The docstore
#    and the row -> id map are supplied up front: a store created with
#    docstore=None cannot resolve a search hit back to its Document.
vectorstore = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)}),
    index_to_docstore_id={i: str(i) for i in range(len(documents))},
)


langchain_community.vectorstores.faiss.FAISS

In [52]:
from langchain.docstore.in_memory import InMemoryDocstore

# Attach the documents to the vectorstore: index row i maps to docstore id i,
# and docstore id i holds documents[i].
vectorstore.index_to_docstore_id = {position: position for position in range(len(documents))}
docstore = InMemoryDocstore({position: doc for position, doc in enumerate(documents)})
vectorstore.docstore = docstore

In [63]:
# Perform similarity search with scores: request only the best hit (k=1) and
# unpack it into the matched document and its score.
document, score = vectorstore.similarity_search_with_score("Who is are you?", k=1)[0]

In [64]:
# Show the score of the top hit.
# NOTE(review): the index is an IndexFlatIP, so this is presumably an inner
# product where higher means more similar — confirm.
score 

0.29662287

In [39]:
# `results` is not defined anywhere in the visible notebook, so this cell raised
# NameError on a fresh kernel. Run the search here to make the cell self-contained;
# it reuses the query from the search cell above so `score` keeps the same value.
results = vectorstore.similarity_search_with_score("Who is are you?", k=1)
most_similar_doc, score = results[0]

In [65]:
# Inspect the text of the document returned as the top hit.
document.page_content

'I feel compelled to do this. Why? Because I don\'t want to start working on a new sign right now.\nI got this at A Yank Gone South. Go read it, won\'t you?\nThe Rules:\n1. Link to the person that tagged you, and post the rules on your blog.\n2. Share 7 random/weird facts about yourself .\n3. Tag 7 random people at the end of your post and include links to their blogs. --Yeah, I\'m not going to do this, but feel free to paste and post it if you want. I\'m a rebel.\nRANDOM KELLY STUFF\n1) I get very irritated when unimportant things begin to get complicated. I like things to be decided on and then carried out. I Can. Not. Stand. to sit around and listen to people waffle about what needs to be done when easy answers are the best answers. This is not to say I am particular about what the answers are, I just want the answer to be decided on without undue talkage about them. If I have an opinion about something, you\'re gonna dang well hear it, but I\'m easy going enough to go along with an

In [16]:
# Display the test split.
# NOTE(review): the recorded output for this cell is a TypeError raised from
# BaseModel.validate(); the cause is not visible in this notebook.
dataset['test']

TypeError: BaseModel.validate() missing 1 required positional argument: 'value'

In [None]:
def main():
    """
    Run the interactive voice-assistant loop until interrupted with Ctrl-C.

    Each turn records audio between two Enter presses, transcribes it, looks up
    a supporting context with ``get_context``, asks the LLM for a reply, then
    synthesizes and plays the answer.

    Relies on names that are not defined in this cell: ``console``,
    ``check_ollama_server``, ``record_audio``, ``transcribe``,
    ``get_llm_response``, ``tts`` and ``play_audio`` (plus ``sys``,
    ``threading`` and ``Queue``).

    Exits the process with status 1 when ``check_ollama_server()`` is falsy.
    """
    # Load the dataset
    ds = load_dataset("neural-bridge/rag-dataset-12000")

    console.print("[cyan]Assistant starting...")

    if not check_ollama_server():
        console.print("[red]Please start the Ollama server by running 'ollama serve'")
        sys.exit(1)

    console.print("[cyan]Assistant ready! Press Enter to start recording, then press Enter again to stop.")

    try:
        while True:
            console.input(
                "Press Enter to start recording, then press Enter again to stop."
            )

            data_queue = Queue()  # type: ignore[var-annotated]
            stop_event = threading.Event()
            recording_thread = threading.Thread(
                target=record_audio,
                args=(stop_event, data_queue),
            )
            recording_thread.start()

            try:
                # Second Enter press ends the recording.
                input()
            finally:
                # Always signal and join the recorder, even if Ctrl-C arrives while
                # waiting here; otherwise the non-daemon thread is never told to
                # stop and would keep the interpreter alive after main() returns.
                stop_event.set()
                recording_thread.join()

            # The recorder has been joined, so the queue's backing deque can be
            # read directly without racing a producer.
            audio_data = b"".join(data_queue.queue)
            # Reinterpret the bytes as 16-bit samples and scale them to [-1.0, 1.0).
            audio_np = (
                np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            )

            if audio_np.size > 0:
                with console.status("Transcribing...", spinner="earth"):
                    text = transcribe(audio_np)
                console.print(f"[yellow]You: {text}")

                # Find relevant context
                with console.status("Searching for relevant context...", spinner="earth"):
                    context = get_context(text, ds)
                    if context:
                        console.print(f"[green]Found helpful context: {context}")
                        # Add context to the prompt template
                        text = f"{text}\n\nContext: {context}"

                with console.status("Generating response...", spinner="earth"):
                    response = get_llm_response(text)
                    sample_rate, audio_array = tts.long_form_synthesize(response)

                console.print(f"[cyan]Assistant: {response}")
                play_audio(sample_rate, audio_array)
            else:
                console.print(
                    "[red]No audio recorded. Please ensure your microphone is working."
                )

    except KeyboardInterrupt:
        console.print("\n[red]Exiting...")

    console.print("[blue]Session ended.")