In [3]:
import os
import vertexai

from llama_index.core import Document, VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from langchain_core.prompts import PromptTemplate
from langchain_google_vertexai import VertexAI

from google.oauth2 import service_account

from dotenv import load_dotenv

In [2]:
# Embedding model used both to index the documents and to embed queries.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# Load whatever SimpleDirectoryReader finds in the local "stories" directory.
reader = SimpleDirectoryReader(input_dir="stories")
documents = reader.load_data()

# Build the vector index straight from the loaded documents.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
)




In [3]:
# Retriever over the index built directly from the documents; keep the top 3 matches.
simplest_retriever = index.as_retriever(
    similarity_top_k=3,
)

# Bare last expression: the notebook displays the raw repr of the returned nodes.
simplest_retriever.retrieve(
    "What can you tell me about Sherlock Holmes?"
)

[NodeWithScore(node=TextNode(id_='a897a7bc-13bd-48a1-abdf-106054bc6ba4', embedding=None, metadata={'file_path': 'C:\\Users\\flore\\Desktop\\projects\\simple_chatbot\\stories\\The Mystery of the Vanishing Violin.md', 'file_name': 'The Mystery of the Vanishing Violin.md', 'file_size': 3668, 'creation_date': '2024-05-07', 'last_modified_date': '2024-05-07'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9cf6e0af-2ff4-4750-87a9-47e809790344', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'C:\\Users\\flore\\Desktop\\projects\\simple_chatbot\\stories\\The Mystery of the Vanishing Violin.md', 'file_name': 'The Mystery of the Vanishing Violin.md', 'file_size': 3668, 'creation_date': '2024-05-07', '

In [4]:
# Same query again, but show only the text of each hit, followed by a ruler line.
for node in simplest_retriever.retrieve("What can you tell me about Sherlock Holmes?"):
    print(
        node.text,
        "\n------------------------------------------------------------------------------------------------\n",
        sep="\n",
    )

The Mystery of the Vanishing Violin

In the quiet of 221B Baker Street, the serene morning was abruptly disturbed by the hurried tapping of a walking stick on the wooden stairs leading to Sherlock Holmes's abode. Holmes, lounging in his armchair with a cold pipe dangling from his lips, glanced over the top of his newspaper at the door just as Mrs. Hudson ushered in a distressed gentleman.

“Mr. Holmes, thank heavens! I'm at my wit's end,” the man exclaimed, wringing his hat in his hands. His well-tailored suit and polished shoes spoke of a man accustomed to finer things, perhaps a musician or a conductor.

“Please, take a seat, Mr.?”

“Carrington, Reginald Carrington. I'm the conductor of the London Symphony Orchestra. It’s my Stradivarius, Mr. Holmes—it's gone! Stolen!”

Holmes perked up at the mention of the Stradivarius, his interest clearly piqued. “When was the violin last seen, and under what circumstances did it vanish?”

“It was just last night,” Mr. Carrington explained, his v

### Each retrieved hit is a lot of text, maybe too much. Let us try a different approach

In [5]:
# Split the loaded documents into smaller nodes (chunk_size=200, chunk_overlap=50).
node_parser = SentenceSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
nodes = node_parser.get_nodes_from_documents(documents)

# Displayed as a tuple: (number of documents, number of nodes after splitting).
(len(documents), len(nodes))

(3, 22)

In [6]:
# Index the pre-split nodes under their own name. Rebinding `index` here would silently
# change what the earlier `index.as_retriever(...)` cell produces if it is re-run after
# this one, which makes the notebook depend on execution order.
chunked_index = VectorStoreIndex(nodes, embed_model=embed_model)
moderate_retriever = chunked_index.as_retriever(similarity_top_k=3)

# Print the text of each of the top 3 chunk-level hits, with a ruler line after each.
for node in moderate_retriever.retrieve("What can you tell me about Sherlock Holmes?"):
    print(node.text)
    print("\n------------------------------------------------------------------------------------------------\n")

“Watson, we shall take a look at the scene of the crime. Mr. Carrington, lead the way.”

The crime scene was as Mr. Carrington described. Holmes inspected the lock with meticulous attention and then paced the room slowly, his keen eyes scanning every inch. Suddenly, he halted and knelt, picking up a tiny piece of fine string.

“Remarkable,” he muttered, holding it up to the light.

“What is it, Holmes?” Watson asked, peering over his shoulder.

“A clue, Watson, potentially from the thief’s clothing. Mr. Carrington, do any of your employees play string instruments themselves?”

“Well, yes, several.”

Holmes nodded. “I suspected as much.

------------------------------------------------------------------------------------------------

Holmes and Watson exchanged a look. "Might we see your son's personal quarters?" Holmes requested.

In the viscount's room, Holmes immediately went to the desk, sifting through papers and personal effects. Hidden in a drawer, he found a journal. Flipping th

In [7]:
# Show how the first node renders under each metadata mode, with a ruler between renderings.
first_node = nodes[0]
for position, mode in enumerate(("all", "llm", "embed")):
    if position > 0:
        print("\n------------------------------------------------------------------------------------------------\n")
    print(first_node.get_content(metadata_mode=mode))

file_path: C:\Users\flore\Desktop\projects\simple_chatbot\stories\The Case of the Midnight Caller.md
file_name: The Case of the Midnight Caller.md
file_size: 3901
creation_date: 2024-05-07
last_modified_date: 2024-05-07

The Case of the Midnight Caller

On a stormy night in London, the rain battered the windows of 221B Baker Street with relentless fury. Sherlock Holmes sat hunched over a chemical experiment, the soft glow of the burner casting eerie shadows on his intense features. Dr. John Watson, accustomed to his friend's nocturnal habits, read a book by the light of the fireplace. Their peace was suddenly interrupted by a sharp, urgent knock at the door.

Mrs. Hudson, looking rather disheveled from the wind, showed in a young woman, soaked to the bone, her face marked by distress. “Mr. Holmes, thank goodness you're here,” she gasped, clutching a damp handkerchief.

------------------------------------------------------------------------------------------------

file_path: C:\Users\

In [4]:
# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# The service-account key path can be supplied through VERTEX_CREDENTIALS_FILE so the
# key location does not have to be edited into the notebook. The original file name is
# kept as the fallback (also used when the variable is empty) so existing setups still work.
credentials_file = (
    os.getenv("VERTEX_CREDENTIALS_FILE") or "daredata-chatbot-a4cc28540e5c.json"
)
credentials: service_account.Credentials = (
    service_account.Credentials.from_service_account_file(credentials_file)
)

# Initialise the Vertex AI SDK with the project id carried by the loaded credentials.
vertexai.init(project=credentials.project_id, credentials=credentials)

llm = VertexAI(model_name="gemini-1.0-pro-001")

In [5]:
# Ask the LLM the same question directly; only the question string is passed in.
llm.invoke(
    "What can you tell me about Sherlock Holmes?"
)

'**Sherlock Holmes**\n\n**Fictional Character:**\n* Created by Sir Arthur Conan Doyle\n* Introduced in the novel "A Study in Scarlet" (1887)\n\n**Description:**\n* Consulting detective\n* Brilliant analytical mind and exceptional deductive abilities\n* Eccentric and antisocial personality\n* Lives at 221B Baker Street, London with Dr. Watson\n\n**Method:**\n* Holmes relies on his powers of observation and deduction to solve mysteries.\n* He uses inductive reasoning, paying attention to the smallest details and drawing logical conclusions.\n\n**Key Traits:**\n* Intellectual superiority\n* Arrogance and self-confidence\n* Cold and unemotional\n* Eccentric habits (e.g., playing the violin, injecting cocaine)\n* Addicted to crime-solving\n\n**Sidekick:**\n* Dr. John H. Watson\n* A retired army doctor who becomes Holmes\'s friend and chronicler\n\n**Famous Cases:**\n* "The Sign of the Four"\n* "The Hound of the Baskervilles"\n* "The Valley of Fear"\n* "The Case of the Norwood Builder"\n* "T

In [10]:
import pandas as pd
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset; its entries are read below as attributes.
diabetes_dict = load_diabetes()

# Feature matrix as a DataFrame, one column per feature name.
data = pd.DataFrame(
    data=diabetes_dict.data,
    columns=diabetes_dict.feature_names,
)

# Print the dataset description, then display the frame as the cell's result.
print(diabetes_dict.DESCR)
data

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [13]:
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

# Agent that answers questions about `data` by writing and running Python against it.
# WARNING: allow_dangerous_code=True lets the agent execute LLM-generated Python in this
# kernel. Newer langchain_experimental releases refuse to build the agent without this
# explicit opt-in; only run it in an environment where that is acceptable.
agent = create_pandas_dataframe_agent(
    llm,
    data,
    verbose=True,  # print the agent's intermediate thoughts and actions
    # Feed malformed LLM output back to the agent instead of raising.
    agent_executor_kwargs={"handle_parsing_errors": True},
    allow_dangerous_code=True,
)

In [15]:
# Ask the DataFrame agent a simple question about `data`; the result is displayed as the cell output.
agent.invoke(
    "How many rows are there?"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I should get the shape of the dataframe
Action: df.shape
Action Input: [0mdf.shape is not a valid tool, try one of [python_repl_ast].[32;1m[1;3mAction: python_repl_ast
Action Input: print(df.shape[0])[0m[36;1m[1;3m442
[0m[32;1m[1;3mFinal Answer: 442[0m

[1m> Finished chain.[0m


{'input': 'How many rows are there?', 'output': '442'}