## Setup

In [1]:
%pip install -Uqqq rich openai tiktoken wandb langchain unstructured tabulate pdf2image chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m35.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Downloading langchain_community-0.3.21-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading pydantic_settings-2.9.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: httpx-sse, pydantic-settings, langchain-community
Successfully installed httpx-sse-0.4.0 langchain-community-0.3.21 pydantic-settings-2.9.1


In [25]:
import os
from pathlib import Path
import tiktoken
from getpass import getpass
from rich.markdown import Markdown
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

In [4]:
# get an open ai key
if os.getenv("OPENAI_API_KEY") is None:
  if any(['VSCODE' in x for x in os.environ.keys()]):
    print('Please enter password in the VS Code prompt at the top of your VS Code window!')
  os.environ["OPENAI_API_KEY"] = getpass("")

assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
print("OpenAI API key configured")

··········
OpenAI API key configured


In [5]:
# Langchain
# we need a single line of code to start tracing langchain with W&B
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"
os.environ["WANDB_PROJECT"] = "maven-article"

In [17]:
# Parsing Documents
model_name = 'text-davinci-003'

from langchain.document_loaders import DirectoryLoader

def find_md_files(directory):
    'find all markdown files in a directory'
    d1 = DirectoryLoader(directory, glob="**/*.txt")
    return d1.load()

documents = find_md_files('./')
len(documents)

1

In [18]:
documents

[Document(metadata={'source': 'sample_data/Open AI in Education, the Responsib.txt'}, page_content='Open AI in Education, the Responsible and Ethical Use of ChatGPT Towards\n\nLifelong Learning\n\nDavid Mhlanga\n\nUniversity of Johannesburg, South Africa\n\nCollege of Business and Economics\n\nPO Box 524, Auckland Park, 2006\n\ndmhlanga67@gmail.com\n\nAbstract\n\nSignificant changes have been brought about in society, the economy, and the environment as a\n\nresult of the quick development of technology and the interconnection of the world. Artificial\n\nintelligence has advanced significantly in recent years, which has sparked the creation of ground\n\nbreaking technologies like Open AI\'s ChatGPT. Modern technology like the ChatGPT language\n\nmodel has the potential to revolutionize the educational landscape. This article\'s goals are to\n\npresent a thorough analysis of the responsible and ethical usage of ChatGPT in education, as well\n\nas to encourage further study and debate on

In [19]:
# we will need to count tokens in the document and for that we need the tokenizer
toknizer = tiktoken.encoding_for_model(model_name)

def count_tokens(text):
    return [len(toknizer.encode(document.page_content)) for document in text]

count_tokens(documents)

[14435]

In [22]:
# split the document usiing markdowntextsplitter
from langchain.text_splitter import MarkdownTextSplitter

md_text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=200)
document_section = md_text_splitter.split_documents(documents)

len(document_section), max(count_tokens(document_section))

(81, 369)

In [26]:
Markdown(document_section[0].page_content)

In [31]:
#get a hugging face api key
if os.getenv("huggingface_api_key") is None:
  if any(['VSCODE' in x for x in os.environ.keys()]):
    print('Please enter password in the VS Code prompt at the top of your VS Code window!')
  os.environ["huggingface_api_key"] = getpass("")

assert os.getenv("huggingface_api_key", "").startswith("hf"), "This doesn't look like a valid huggingface API key"
print("huggingface API key configured")

huggingface API key configured


In [42]:
# Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2')
vectorstore = Chroma.from_documents(document_section, embeddings)

retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

query = 'Describe about what this document about?'
docs = retreiver.get_relevant_documents(query)
docs

[Document(metadata={'source': 'sample_data/Open AI in Education, the Responsib.txt'}, page_content='Figure 1: Articles used from Various Sites\n\nGoogle Scholar\n\nScopus\n\nWeb of Science\n\nResearch Gate\n\nFigure 1 above is outlining the distribution of articles across various sites like google scholar,\n\nScopus, and research gate. Figure 2 explains the process followed to include and exclude other\n\narticles.\n\nElectronic copy available at: https://ssrn.com/abstract=4354422\n\nSource: Author’s Analysis\n\nFigure 2 outlines the steps that were used to determine which articles to include and which to\n\nomit. It details the websites, such as Google Scholar, Web of Science, and ResearchGate, from\n\nwhich the original documents were collected, as well as the screenings that were performed\n\ninitially and ultimately.\n\nTable 1: Selected Articles Consulted in the Study\n\nStudy Focus Year\n\nRodgers, W., Murray, J. M.,\n\nStefanidis, A., Degbey, W. Y., &\n\nTarba, S. Y.\n\nAn artif

In [43]:
# Stuff Prompt
from langchain.prompts import PromptTemplate
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

context = '\n\n'.join([doc.page_content for doc in docs])

prompt = PROMPT.format(context=context, question=query)

In [44]:
# get a groq api
if os.getenv("GROQ_API_KEY") is None:
  if any(['VSCODE' in x for x in os.environ.keys()]):
    print('Please enter password in the VS Code prompt at the top of your VS Code window!')
  os.environ["GROQ_API_KEY"] = getpass("")

assert os.getenv("GROQ_API_KEY", "").startswith("gsk"), "This doesn't look like a valid groq API key"
print("groq API key configured")

groq API key configured


In [45]:
from langchain_groq import ChatGroq

llm = ChatGroq(model_name= 'Gemma2-9b-It')

response = llm.invoke(prompt)
Markdown(response.content)

In [46]:
# using langchain retreivalqa
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain

qa = RetrievalQA.from_chain_type(llm=ChatGroq(model_name= 'Gemma2-9b-It'), chain_type="stuff", retriever = retreiver)

result = qa.run(query)

Markdown(result)

  result = qa.run(query)
