In [1]:
# Install the libraries imported by the cells below (LangChain, the OpenAI client, and pypdf for the PDF loader).
# NOTE(review): versions are unpinned, and this notebook imports from `langchain.*` module paths that
# newer LangChain releases may have relocated — TODO confirm and pin the versions it was authored against.
%pip install langchain openai pypdf


[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


# Document Loading

In [2]:
import os
import sys

from langchain.document_loaders import PyPDFLoader
# Load the Laws of the Game PDF from the notebook's working directory.
# NOTE(review): judging by the saved outputs (230 entries, each with a 'page' metadata field),
# load() appears to yield one Document per PDF page — confirm.
loader = PyPDFLoader("./LawsofTheGame2022_23.pdf")
pages = loader.load()

In [3]:
# Number of entries the loader produced.
len(pages)

230

In [4]:
# Keep entries 10..199 only (the slice end is exclusive); the first 10 entries and
# everything from index 200 onward are dropped.
trimmed_pages = pages[10:200] # Just the relevant pages
# First kept page, used for the preview that follows.
page = trimmed_pages[0]

In [5]:
# Preview the first 500 characters of the first kept page.
print(page.page_content[0:500])

11
 Football is the greatest sport on earth. It is played on every continent, in every 
country and at many different levels. The fact that the Laws of the Game are the 
same for all football throughout the world, from the FIFA World Cup™ through 
to a game between young children in a remote village, is a considerable 
strength which must continue to be harnessed for the good of football 
everywhere.
 Football must have Laws which keep the game fair – this is a crucial foundation 
of the ‘beauti


# Document Splitting

In [6]:
# Configure the splitter: chunks are capped at 1500 (size measured by the splitter's
# default length function) and neighbouring chunks share an overlap of 150.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [7]:
# Break the kept pages into chunks with the configured splitter.
splits = text_splitter.split_documents(trimmed_pages)

In [8]:
# Compare the page count with the chunk count to see how much the splitter subdivided the pages.
print("Length of trimmed_pages: ", len(trimmed_pages))
print("Length of splits: ", len(splits))

Length of trimmed_pages:  190
Length of splits:  245


In [9]:
# Inspect the first chunk (prints the Document's repr, including its page_content).
print(splits[0])

page_content='11\n Football is the greatest sport on earth. It is played on every continent, in every \ncountry and at many different levels. The fact that the Laws of the Game are the \nsame for all football throughout the world, from the FIFA World Cup™ through \nto a game between young children in a remote village, is a considerable \nstrength which must continue to be harnessed for the good of football \neverywhere.\n Football must have Laws which keep the game fair – this is a crucial foundation \nof the ‘beautiful game’ and a vital feature of the ‘spirit’ of the game. The best \nmatches are those where the referee is rarely needed because the players play \nwith respect for each other, the match officials and the Laws.\n Football’s Laws are relatively simple compared to most other team sports, but \nas many situations are subjective and match officials are human, some \ndecisions will inevitably be wrong or cause debate and discussion. For some \npeople, this discussion is part o

# Embeddings
Let's take our splits and embed them.

In [10]:
# Load variables from a local .env file into the process environment so that the
# OpenAI key does not have to be written into the notebook.
from dotenv import load_dotenv

load_dotenv()
# NOTE(review): OPENAI_API_KEY is not referenced by any other cell visible here; presumably
# the OpenAI wrappers below read the key from the environment themselves — confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [11]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding model handed to the vector store; constructed with library defaults (no arguments).
embedding = OpenAIEmbeddings()

In [12]:
# Install the backend for the Chroma vector store used below.
# NOTE(review): unpinned version — consider pinning for reproducibility.
%pip install chromadb


[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [13]:
# Start from a clean slate: remove any vector store left on disk by a previous run.
# NOTE: this is a Unix shell command and is destructive; the path is the same location
# that persist_directory refers to.
!rm -rf ./docs/chroma

In [14]:
# Directory handed to Chroma as the on-disk location of the index.
persist_directory = 'docs/chroma/'

In [15]:
# NOTE(review): tiktoken is not imported directly anywhere in the visible cells; presumably it is
# a runtime requirement of the OpenAI embeddings wrapper — confirm. Version is unpinned.
%pip install tiktoken


[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [16]:
from langchain.vectorstores import Chroma

# Build the vector store from the chunks, using the embedding model and the
# on-disk location configured earlier.
vectordb = Chroma.from_documents(persist_directory=persist_directory, embedding=embedding, documents=splits)

In [17]:
# Sanity check: number of entries in the store (the saved output matches the number of splits).
# Note: this reaches into the wrapper's private `_collection` attribute.
print(vectordb._collection.count())

245


In [18]:
# Sample query used to exercise retrieval.
question = "how long does a match last for"

In [19]:
# Ask the vector store for the k=3 entries most similar to the question.
docs = vectordb.similarity_search(question,k=3)

In [20]:
# Number of results returned by the similarity search.
len(docs)

3

In [21]:
# First result of the similarity search.
docs[0]

Document(page_content='77 Laws of the Game 2022/23   |  Law 7  |  The Duration of the Match1. Periods of play\n A match lasts for two equal halves of 45 minutes, which may only be reduced  \nif agreed between the referee and the two teams before the start of the match \nand if in accordance with competition rules.\n2. Half-time interval\n Players are entitled to an interval at half-time, not exceeding 15 minutes;  \na short drinks break (which should not exceed one minute)  is permitted at the \ninterval of half-time in extra time. Competition rules must state the duration of \nthe half-time interval and it may be altered only with the referee’s permission.\n3. Allowance for time lost\n Allowance is made by the referee in each half for all playing time lost in that \nhalf through:\n•\u2002substitutions\n•\u2002assessment and/or removal of injured players\n•\u2002wasting time\n•\u2002disciplinary sanctions\n•\u2002 medical stoppages permitted by competition rules, e.g. ‘drinks’ breaks \

In [22]:
# Second result. In the saved output this is a bare section-title page
# ('Practical guidelines for match officials'), which carries no answer to the question.
docs[1]

Document(page_content='Practical \nguidelines \nfor match \nofficials', metadata={'page': 169, 'source': './LawsofTheGame2022_23.pdf'})

In [23]:
# Write the index out so that a later cell can reopen it from persist_directory.
vectordb.persist()

# Retrieval

In [24]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import OpenAI

In [None]:
def pretty_print_docs(docs):
    """Print the text of each document under a numbered heading.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.

    Each entry is rendered as ``Document <n>:`` (numbered from 1), a blank line,
    and then the document text. Consecutive entries are separated by a line of
    100 dashes. Nothing is returned; the result goes to standard output.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))


In [None]:
# Completion model (temperature 0) that powers the document compressor.
llm = OpenAI(temperature=0)

# Compressor built on top of that LLM; it is plugged into the retriever wrapper later.
compressor = LLMChainExtractor.from_llm(llm=llm)

In [None]:
# Wrap the vector store's retriever so that retrieved documents are passed
# through the compressor before being returned.
compression_retriever = ContextualCompressionRetriever(base_retriever=vectordb.as_retriever(), base_compressor=compressor)

In [None]:
# Run the same sample question through the compression retriever and print what comes back.
question = "how long does a match last for"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

# Question Answering

In [None]:
import datetime

# Choose the chat model by calendar date: strictly before 2 September 2023 the dated
# "0301" snapshot is used, from that day on the rolling "gpt-3.5-turbo" name.
# NOTE(review): presumably tied to a model retirement schedule — confirm whether the
# date switch is still needed.
current_date = datetime.date.today()
llm_name = "gpt-3.5-turbo-0301" if current_date < datetime.date(2023, 9, 2) else "gpt-3.5-turbo"
print(llm_name)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Reopen the index from its on-disk location instead of rebuilding it from the chunks.
# The embedding function is supplied again so the store can embed incoming queries.
persist_directory = 'docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [None]:
# Check how many entries the reopened store contains (reads the private `_collection` attribute).
print(vectordb._collection.count())

In [None]:
from langchain.chat_models import ChatOpenAI
# Chat model for the question-answering chains, using the model name selected above.
# NOTE: this rebinds `llm`, which earlier held the OpenAI completion model given to the compressor.
llm = ChatOpenAI(model_name=llm_name, temperature=0)

In [None]:
import os

# LangSmith tracing setup.
# SECURITY: an API key must never be written into the notebook itself. The key is
# expected in the LANGCHAIN_API_KEY environment variable (for example through the
# .env file that load_dotenv() reads). Any key that was ever committed in this
# cell must be treated as leaked and revoked.
if os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
else:
    # No key available: leave tracing unconfigured rather than enabling it without credentials.
    print("LANGCHAIN_API_KEY is not set; skipping LangSmith tracing setup.")

In [None]:
# Show which question the following chain will be asked.
print(question)

In [None]:
from langchain.chains import RetrievalQA

# Retrieval QA chain using the "map_reduce" chain type over the vector store's retriever.
qa_chain_mr = RetrievalQA.from_chain_type(llm=llm, chain_type="map_reduce", retriever=vectordb.as_retriever())

# Ask the current question and display the answer text.
result = qa_chain_mr({"query": question})
result["result"]

# Chat

In [None]:
#%pip install panel

In [None]:
import panel as pn  # GUI
# NOTE(review): `pn` is not used by any other cell visible here, and the install of
# panel is commented out, so on a fresh environment this import may fail — confirm
# whether this cell is still needed.
pn.extension()

In [None]:
# Build prompt
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordb.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})


result = qa_chain({"query": question})
result["result"]

In [None]:
# Memory
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
# ConversationalRetrievalChain
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [None]:
# First conversational turn: ask the sample question through the chain.
question = "how long does a match last for"
result = qa({"question": question})

In [None]:
# Show the answer text from the most recent chain call.
result['answer']

In [None]:
# Follow-up turn sent through the same chain object.
# NOTE(review): the chain was built with a memory object, so the earlier exchange is
# presumably available to it as chat history — confirm.
question_2 = "Is the game allowed to go to extra time"
result = qa({"question": question_2})

In [None]:
# Show the answer text for the follow-up question.
result['answer']