In [4]:
!pip install -q python-dotenv llama-index==0.10.59 llama-index-embeddings-huggingface==0.2.2 torch==2.2.2 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 google-generativeai==0.5.4 llama-index-llms-gemini==0.1.11


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import os
import chromadb
from dotenv import load_dotenv

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.llms import ChatMessage
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.core.tools import QueryEngineTool
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.gemini import Gemini

In [7]:
load_dotenv()
#openai_api = os.getenv("OPENAI_API_KEY")
google_api = os.getenv("GOOGLE_API_KEY")

In [8]:
documents = SimpleDirectoryReader("./data").load_data()
#print(documents)
print("Documents loading done...")

Documents loading done...


In [9]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model
Settings.chunk_size = 512

In [10]:
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("trial")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
print(index)
print("Documents indexing and saving done...")

<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x0000021EA0D18250>
Documents indexing and saving done...


In [11]:
llm = Gemini(model="models/gemini-1.5-pro")
Settings.llm = llm

In [12]:
query_engine = index.as_query_engine()

In [8]:
print("----------------------")
messages = []
start=input('to start/stop chat, enter y/n')
while start=='y':
    user_question = input('enter question: ')
    messages.append(ChatMessage(role="user", content=user_question))

    resp = llm.chat(messages)
    messages.append(ChatMessage(role="assistant", content=resp.message.content))
    print(messages)

    start=input('to continue/stop chat, enter y/n')

print("----------------------")

----------------------
[ChatMessage(role=<MessageRole.USER: 'user'>, content='hi', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Hi there! How can I help you today? \n', additional_kwargs={})]
[ChatMessage(role=<MessageRole.USER: 'user'>, content='hi', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Hi there! How can I help you today? \n', additional_kwargs={}), ChatMessage(role=<MessageRole.USER: 'user'>, content='tell me your name', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content="I don't have a name. I'm a large language model, and I'm not a person. You can call me Bard, though! \n\nWhat would you like to talk about today? \n", additional_kwargs={})]
----------------------


In [13]:
def multiply(a: float, b: float) -> float:
    return a * b

def add(a: float, b: float) -> float:
    return a + b

multiply_tool = FunctionTool.from_defaults(fn=multiply)
add_tool = FunctionTool.from_defaults(fn=add)

qna_tool = QueryEngineTool.from_defaults(
    query_engine,
    name="rag_question_answer_tool",
    description="A RAG engine with information of company policy.",
)

In [14]:
agent = ReActAgent.from_tools([multiply_tool, add_tool, qna_tool], llm=llm, verbose=True)

In [22]:
while True:
    text_input = input("User: ")
    if text_input == "e":
        break
    response = agent.chat(text_input)
    print(f"Agent: {response}")
    

> Running step 486a8051-beff-4be3-a3e5-938b06c09301. Step input: can you spell bee
[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: B-E-E 🐝 

[0mAgent: B-E-E 🐝 

> Running step ca020078-5a3d-4c36-a8e0-bfdbed56d8d2. Step input: how many vowels are there
[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: There are two vowels in the word "bee": **E** and **E**. 

[0mAgent: There are two vowels in the word "bee": **E** and **E**. 

> Running step c89afde8-ceac-4789-80c4-ff6f28cade99. Step input: is it pallindrome
[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: You got it!  "Bee" is indeed a palindrome because it reads the same forwards and backward. 🐝  

[0mAgent: You got it!  "Bee" is indeed a palindrome because it reads the same forwards and backward. 🐝  

> Running step 4578e158-24f9-4a87-accd-267f5e7c3b27. Step input: then spell it backward and varify once again
[1;3;38;5;200mThought: (Impli

In [23]:
!pip install -q transformers scipy


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
!pip install -q sentencepiece datasets[audio]


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
# text-to-speech
from transformers import pipeline
import scipy

synthesiser = pipeline("text-to-speech", "suno/bark")

speech = synthesiser("Hello, my dog is cooler than you!", forward_params={"do_sample": True})

scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


error: argument out of range

In [26]:
from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained("suno/bark")
model = AutoModel.from_pretrained("suno/bark")

inputs = processor(
    text=["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."],
    return_tensors="pt",
)

speech_values = model.generate(**inputs, do_sample=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [27]:
from IPython.display import Audio

sampling_rate = model.generation_config.sample_rate
Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate)


In [2]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import sentencepiece
import soundfile as sf
from datasets import load_dataset

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe.", return_tensors="pt")

# load xvector containing speaker's voice characteristics from a dataset
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

sf.write("speech.wav", speech.numpy(), samplerate=16000)


In [5]:
from IPython.display import Audio

sampling_rate = 16000

Audio(speech.cpu().numpy().squeeze(), rate=sampling_rate)

In [17]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
from datasets import load_dataset
from IPython.display import Audio

def text_to_speech(text_input):
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=text_input, return_tensors="pt")

    # load xvector containing speaker's voice characteristics from a dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    return speech

In [25]:
while True:
    text_input = input("User: ")
    if text_input == "e":
        break
    response = agent.chat(text_input)
    print(f"Agent: {response}")

    speech = text_to_speech(response.response)
    #Audio(speech.cpu().numpy().squeeze(), rate=16000)
    sf.write("my_output.wav", speech.numpy(), samplerate=16000)
    
    

> Running step f2cd7a6d-aa6f-438a-a384-d2fa38cddb0d. Step input: hi
[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: Hi! 😊  I'm here whenever you're ready to chat or if you need help with something. Just let me know!  😄 

[0mAgent: Hi! 😊  I'm here whenever you're ready to chat or if you need help with something. Just let me know!  😄 

Hi! 😊  I'm here whenever you're ready to chat or if you need help with something. Just let me know!  😄 

> Running step 67ab01c4-aa78-4f6f-b165-7fca92eafd6a. Step input: what girlfriend means
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: rag_question_answer_tool
Action Input: {'input': 'what is a girlfriend?'}
[0m[1;3;34mObservation: This document is an employee agreement and does not contain the answer to this question. 
[0m> Running step 2ae344d1-3eeb-40b6-9d9a-0aa4db2cc8ce. Step input: None
[1;3;38;5;200mThought: I cannot answer the 