## Kownledge worker(By Sai Kumar)

In [46]:
# imports

import os
import glob
import gradio as gr

In [47]:

MODEL = "llama3.2"
db_name = "vector_db"

In [48]:
pip install -U langchain-community langchain-chroma

Note: you may need to restart the kernel to use updated packages.


In [49]:
# imports for langchain and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
import numpy as np
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [50]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [51]:
import wikipedia  # Wikipedia API
import requests   # To make HTTP requests
from bs4 import BeautifulSoup  # For web scraping

In [52]:
# Create base folders
folders = ["wikipedia", "cricinfo", "youtube", "books", "quotes"]
base_folder = "knowledge-base"
for folder in folders:
    os.makedirs(os.path.join(base_folder, folder), exist_ok=True)

In [53]:
# Save text to a file
def save_to_file(content, folder, filename):
    path = os.path.join(base_folder, folder, filename)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    print("Saved:", path)

In [54]:
# 1. Wikipedia content
def fetch_wikipedia():
    try:
        url = "https://en.wikipedia.org/wiki/MS_Dhoni"
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = "\n".join(p.text for p in paragraphs if p.text.strip() != "")
        save_to_file(text, "wikipedia", "ms_dhoni.md")
    except Exception as e:
        print("Wikipedia Error:", e)

In [55]:
# 2. Cricinfo content
def fetch_cricinfo():
    try:
        url = "https://www.espncricinfo.com/player/ms-dhoni-28081"
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = "\n".join(p.text for p in paragraphs if p.text.strip() != "")
        save_to_file(text, "cricinfo", "dhoni_stats.md")
    except Exception as e:
        print("Cricinfo Error:", e)

In [56]:
# # 3. YouTube transcript 
# def transcribe_youtube(url):
#     try:
#         subprocess.run(["yt-dlp", "-x", "--audio-format", "mp3", url, "-o", "yt_audio.%(ext)s"], check=True)
#         subprocess.run(["whisper", "yt_audio.mp3", "--model", "base", "--output_format", "txt"], check=True)
#         with open("yt_audio.txt", "r", encoding="utf-8") as f:
#             text = f.read()
#         save_to_file(text, "youtube", "interview_transcript.md")
#     except Exception as e:
#         print("YouTube Error:", e) # This is the optional part doesn't work out for some cases.

In [3]:
# 4. PDF to text (optional)
def extract_pdf_text(pdf_path, save_name="msd_book.md"):
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join([page.get_text() for page in doc])
        save_to_file(text, "books", save_name)
    except Exception as e:
        print("PDF Error:", e)

In [58]:
# 5. Add quotes
def add_custom_quotes():
    quotes = [
        "I don't mind repeating everything.",
        "You don't play for the crowd, you play for the country.",
        "Process is more important than the result."
    ]
    save_to_file("\n".join(quotes), "quotes", "inspirational_quotes.md")

In [59]:
# Run functions
fetch_wikipedia()
fetch_cricinfo()
add_custom_quotes()

Saved: knowledge-base\wikipedia\ms_dhoni.md
Saved: knowledge-base\cricinfo\dhoni_stats.md
Saved: knowledge-base\quotes\inspirational_quotes.md


In [5]:
#transcribe_youtube("https://www.youtube.com/watch?v=example")
extract_pdf_text("msd_book.pdf") ## System is getting some unwanted error, That's why i am not considering pdf part.

PDF Error: name 'fitz' is not defined


In [61]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledgebase

folders = glob.glob("knowledge-base/*")

def add_metadata(doc, doc_type):
    doc.metadata["doc_type"] = doc_type
    return doc

# With thanks to CG and Jon R, students on the course, for this fix needed for some users 
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

Created a chunk of size 1175, which is longer than the specified 1000
Created a chunk of size 1355, which is longer than the specified 1000
Created a chunk of size 1277, which is longer than the specified 1000
Created a chunk of size 2327, which is longer than the specified 1000
Created a chunk of size 1632, which is longer than the specified 1000
Created a chunk of size 1167, which is longer than the specified 1000
Created a chunk of size 2315, which is longer than the specified 1000
Created a chunk of size 2149, which is longer than the specified 1000
Created a chunk of size 1662, which is longer than the specified 1000
Created a chunk of size 1285, which is longer than the specified 1000
Created a chunk of size 2328, which is longer than the specified 1000
Created a chunk of size 2203, which is longer than the specified 1000
Created a chunk of size 2263, which is longer than the specified 1000
Created a chunk of size 2208, which is longer than the specified 1000
Created a chunk of s

Total number of chunks: 226
Document types found: {'wikipedia', 'cricinfo', 'books', 'quotes'}


In [62]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [63]:
# Create vectorstore
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 678 documents


## Time to use LangChain to bring it all together

In [65]:
from langchain_openai import ChatOpenAI

In [66]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the llama3.2 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [67]:
# Let's try a simple question

query = "what is the age of ms dhoni"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

According to the context provided, MS Dhoni was born on 7 July 1981, but his current age is not mentioned. However, based on this birthdate, we can calculate his age. As of my knowledge cutoff in December 2023, MS Dhoni's age would be approximately 42 years old (if he were still alive), but please note that I'm an AI and do not have real-time access to current events or the current age of individuals.


In [68]:
# set up a new conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: set up the conversation chain with the llama3.2 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

## Lets play with Gradio

In [70]:
def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

In [71]:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.


In [72]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG; k is how many chunks to use
retriever = vectorstore.as_retriever(search_kwargs={"k": 100})

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [73]:
def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

In [74]:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.
