In [2]:
import spacy
import pandas as pd
import re
import time
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document
from langchain.vectorstores import Chroma
from openai import OpenAI
import elevenlabs
import subprocess
import os
from typing import Iterator

class Chat:
    def __init__(self, file_path, i):
        self.model_id = "paraphrase-MiniLM-L3-v2"
        self.device = "cpu"
        self.dim = 384
        self.file_path = file_path
        chroma_client = chromadb.PersistentClient(path=f"./dp/demo{i}")
        self.collection = chroma_client.create_collection(
            name="book",
            metadata={"hnsw:space": "cosine"}
        )
        full_text = self.read_docx(self.file_path)
        splitted_txt = self.splitter(full_text)
        self.model = self._encode()
        encoded_text = self.model.encode(splitted_txt, show_progress_bar=True).tolist()
        ids = [str(i) for i in range(len(encoded_text))]
        self.collection.add(
            documents=splitted_txt,
            embeddings=encoded_text,
            ids=ids
        )
        self.system = """
                I'll provide you with a JSON object that contains a question and the context related to it:
                {"question": the question, "context": the context}
                Please generate the answer of the provided question based on the context above.
                """
        
        api_key = "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"
        elevenlabs.set_api_key("19971a4ea37210273ae9e3f5a76174db")
        
        self.client = OpenAI(api_key=api_key)

        self.messages = [
            {"role": "system", "content": self.system},
            
        ]

    def run(self, question):
        question_embed = self.model.encode(question)
        results = self.collection.query(
            query_embeddings=question_embed.tolist(),
            n_results=3,  
        )
        top_paragraph = ' '.join([i for i in results['documents']][0])
        prompt = '{"question": ' + question + ', "context": ' + top_paragraph + '}'

        self.messages.append(
            {"role": "user", "content": prompt}
        )

        return self.generate_audio(prompt, self.messages)

    def read_docx(self, file_path):
        doc = Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        full_text = '\n'.join(full_text)

        return full_text

    def splitter(self, txt):
        
        chunk_size = 1000
        chunk_overlap = 200

        def length_function(text: str) -> int:
            return len(text)

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function
        )

        return splitter.split_text(txt)
    
    def _encode(self):
        return SentenceTransformerEmbeddings(model_name=self.model_id, device=self.device)
    
    def _get_apen_ai_answer(self, prompt, messages):
        response = self.client.chat.completions.create(
            model = "gpt-3.5-turbo-1106",
            temperature= 0,
            messages=messages,
            stream=True
        )
        
        for chunk in response:
            txt = chunk.choices[0].delta.content
            print(txt, end="")
            
            yield txt if txt != None else ""
            
    def stream(self, audio_stream: Iterator[bytes]) -> bytes:

        mpv_command = ["C:\\Program Files\\mpv\\mpv.exe", "--no-cache", "--no-terminal", "--", "fd://0"]
        mpv_process = subprocess.Popen(
            mpv_command,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        audio = b""

        for chunk in audio_stream:
            if chunk is not None:
                mpv_process.stdin.write(chunk)  # type: ignore
                mpv_process.stdin.flush()  # type: ignore
                audio += chunk

        if mpv_process.stdin:
            mpv_process.stdin.close()
        mpv_process.wait()

        return audio

    def generate_audio(self, prompt, messages):
        generated_audio = elevenlabs.generate(text=self._get_apen_ai_answer(prompt, messages), voice="tsample", model="eleven_monolingual_v1", stream=True)
        self.stream(generated_audio)


In [4]:
answer_me = Chat("Master Machine Learning Algorithms - Discover how they work by Jason Brownlee (z-lib.org).docx", 44)

Batches: 100%|██████████| 12/12 [00:02<00:00,  4.49it/s]


## Demo

In [5]:
while True:
    txt = input("")
    
    if txt == "End Session": break
    answer_me.run(txt)

Machine learning refers to the process of teaching a computer system to make predictions or take actions based on data, without being explicitly programmed. It involves the learning of a target function from training data through inductive learning, which refers to learning general concepts from specific examples. This is different from deduction, which seeks to learn specific concepts from general rules. In the context of machine learning, data plays a crucial role, and it is important to understand and use the right terminology when discussing it.None

In [6]:
answer_me.messages

[{'role': 'system',
  'content': '\n                I\'ll provide you with a JSON object that contains a question and the context related to it:\n                {"question": the question, "context": the context}\n                Please generate the answer of the provided question based on the context above.\n                '},
 {'role': 'user',
  'content': '{"question": what is machine learning?, "context": Generalization in Machine Learning\nIn machine learning we describe the learning of the target function from training data as inductive learning. Induction refers to learning general concepts from specific examples which is exactly the problem that supervised machine learning problems aim to solve. This is different from deduction that is the other way around and seeks to learn specific concepts from general rules. Machine Learning Books\nThis book contains everything that you need to get started with machine learning algorithms, but if you are like me, then you love books. There

### One Example

In [23]:
answer_me.run("Give me an example for a linear regression prediction using (x) and (y) values")