In [60]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
!pip install pypdf
!pip install -qU langchain-mistralai
!pip install -qU langchain-core
!pip install -U langchain-groq

Collecting langchain-groq
  Downloading langchain_groq-0.2.4-py3-none-any.whl.metadata (3.0 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Downloading groq-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading langchain_groq-0.2.4-py3-none-any.whl (14 kB)
Downloading groq-0.18.0-py3-none-any.whl (121 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.9/121.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, langchain-groq
Successfully installed groq-0.18.0 langchain-groq-0.2.4


In [65]:
from langchain_community.document_loaders import PyPDFLoader

# Source filing to index; the path is resolved relative to the working directory.
PDF_PATH = "nke-10k-2023.pdf"

# Load the PDF. The loader yields one Document per page, so the printed length
# is the page count of the filing.
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()
print(len(docs))

107


In [66]:
# Sanity-check the load: show the first 200 characters of page one (followed by
# a blank line), then the metadata attached to that page.
first_page = docs[0]
print(first_page.page_content[:200] + "\n")
print(first_page.metadata)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': 'nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [67]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, both measured in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# add_start_index=True makes the splitter track each chunk's index in the
# original document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)

# Split the page-level documents into chunks and report how many were produced.
all_splits = text_splitter.split_documents(docs)
print(len(all_splits))

516


In [68]:
from google.colab import userdata
# Names under which the two API keys are looked up through google.colab.userdata.
MISTRAL_SECRET_NAME = "mistral_key"
GROQ_SECRET_NAME = "groq_api_key"

# Fetch the keys at runtime so no credential is ever written into the notebook.
mistral_api_key = userdata.get(MISTRAL_SECRET_NAME)
groq_api_key = userdata.get(GROQ_SECRET_NAME)

In [69]:
import getpass
import os

# Export the Mistral key as an environment variable for this process.
# Presumably the Mistral clients built in later cells read MISTRAL_API_KEY
# from the environment, since no key is passed to them explicitly — confirm.
os.environ["MISTRAL_API_KEY"] = mistral_api_key

In [70]:
# Export the Groq key as an environment variable for this process.
# Presumably the Groq chat model built in a later cell reads GROQ_API_KEY
# from the environment, since no key is passed to it explicitly — confirm.
os.environ["GROQ_API_KEY"] = groq_api_key

In [71]:
from langchain_mistralai import MistralAIEmbeddings

# Embedding model used to vectorise both the document chunks and the queries.
EMBEDDING_MODEL = "mistral-embed"

embeddings = MistralAIEmbeddings(model=EMBEDDING_MODEL)



In [72]:
from langchain_core.vectorstores import InMemoryVectorStore

# Create the vector store, handing it the embedding model defined above.
# As the class name indicates, the store is in-memory, so it has to be rebuilt
# whenever the kernel restarts.
vector_store = InMemoryVectorStore(embeddings)

In [73]:
# Add every chunk produced by the splitter to the vector store; the value
# returned by add_documents is kept in `ids`.
# NOTE(review): this presumably sends all chunks to the embedding API, which
# would make it the slow / billable step of the notebook — confirm.
ids = vector_store.add_documents(documents=all_splits)

In [74]:
from langchain.chat_models import init_chat_model

# Model identifiers for the two chat models, one per provider.
MISTRAL_CHAT_MODEL = "mistral-large-latest"
GROQ_CHAT_MODEL = "llama3-8b-8192"  # NOTE(review): confirm Groq still serves this model id

# Both models are built through the same factory; only the provider differs.
llm_mistral = init_chat_model(MISTRAL_CHAT_MODEL, model_provider="mistralai")
llm_llama = init_chat_model(GROQ_CHAT_MODEL, model_provider="groq")

In [75]:
# Instructions that open the prompt; the retrieved context and the question are
# appended after this text when the prompt is assembled.
# This is a plain string literal: the text has no placeholders, so an f-prefix
# would be redundant. Python concatenates the adjacent literals into one string.
base_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
)

In [76]:
# Question to ask about the filing.
question = "How much was the lease expense for 2023"

# Retrieve the chunks most similar to the question (store defaults) and merge
# their text into one context string, chunks separated by a blank line.
retrieved_docs = vector_store.similarity_search(question)
docs_content = "\n\n".join([doc.page_content for doc in retrieved_docs])

# Prompt layout: instructions, then the context, then the question, with a
# blank line between sections.
prompt = "\n\n".join(
    [
        base_prompt,
        f"Contexte : {docs_content}",
        f"Question : {question}",
    ]
)

# Send the assembled prompt to the Groq-hosted Llama model.
answer = llm_llama.invoke(prompt)

In [77]:
# Print only the `content` attribute of the model's reply rather than the
# whole response object.
print(answer.content)

According to the context, lease expense for the fiscal year ended May 31, 2023, primarily consisted of operating lease costs of $585 million and variable lease costs of $403 million.
