## Download Data

In [2]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [3]:
!unzip -q new_articles.zip -d new_articles

## Load Environment Variables

In [None]:
import os
from dotenv import load_dotenv

# Pull key=value pairs from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message: assigning None into os.environ would
# otherwise raise an opaque TypeError when the variable is missing.
hf_token = os.getenv('HUGGINGFACE_API')
if not hf_token:
    raise EnvironmentError(
        "HUGGINGFACE_API is not set; add it to your .env file or export it "
        "before running this notebook."
    )

os.environ['HUGGINGFACE_API'] = hf_token
# LangChain's HuggingFaceHub wrapper looks for HUGGINGFACEHUB_API_TOKEN, so
# expose the same token under that name unless the user already set it.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', hf_token)

## Import Necessary Libraries

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

## Load Data

In [5]:
# Pick up only the text files in new_articles/ whose names start with "05-04",
# reading each one through TextLoader.
loader = DirectoryLoader(
    "new_articles/",
    glob="./05-04*.txt",
    loader_cls=TextLoader,
    # Explicit encoding so reading the articles does not crash on a decode error.
    loader_kwargs=dict(encoding="utf-8"),
)

In [6]:
# Read every file matched by the loader; the result is a list of Document
# objects (one per file in the recorded output), each with `metadata` and
# `page_content`.
document = loader.load()

# Display the loaded documents. NOTE(review): this prints the full article
# text; consider showing len(document) or a slice to keep the output small.
document

 Document(metadata={'source': 'new_articles\\05-04-hugging-face-and-servicenow-release-a-free-code-generating-model.txt'}, page_content='AI startup Hugging Face and ServiceNow Research, ServiceNow’s R&D division, have released StarCoder, a free alternative to code-generating AI systems along the lines of GitHub’s Copilot.\n\nCode-generating systems like DeepMind’s AlphaCode; Amazon’s CodeWhisperer; and OpenAI’s Codex, which powers Copilot, provide a tantalizing glimpse at what’s possible with AI within the realm of computer programming. Assuming the ethical, technical and legal issues are someday ironed out (and AI-powered coding tools don’t cause more bugs and security exploits than they solve), they could cut development costs substantially while allowing coders to focus on more creative tasks.\n\nAccording to a study from the University of Cambridge, at least half of developers’ efforts are spent debugging and not actively programming, which costs the software industry an estimated 

## Handle the Idea of Chunk Size and Chunk Overlap

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded articles into small pieces: chunk_size=100 with a
# chunk_overlap of 10, so neighbouring chunks share a little text
# (visible in the recorded output, e.g. "of “AI" / "OpenAI’s").
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=10,
    chunk_size=100,
)
chunks = text_splitter.split_documents(document)

# Show the resulting list of chunk Documents.
chunks

[Document(metadata={'source': 'new_articles\\05-04-cma-generative-ai-review.txt'}, page_content='Well that was fast. The U.K.’s competition watchdog has announced an initial review of “AI'),
 Document(metadata={'source': 'new_articles\\05-04-cma-generative-ai-review.txt'}, page_content='of “AI foundational models”, such as the large language models (LLMs) which underpin OpenAI’s'),
 Document(metadata={'source': 'new_articles\\05-04-cma-generative-ai-review.txt'}, page_content='OpenAI’s ChatGPT and Microsoft’s New Bing. Generative AI models which power AI art platforms such'),
 Document(metadata={'source': 'new_articles\\05-04-cma-generative-ai-review.txt'}, page_content='such as OpenAI’s DALL-E or Midjourney will also likely fall in scope.'),
 Document(metadata={'source': 'new_articles\\05-04-cma-generative-ai-review.txt'}, page_content='The Competition and Markets Authority (CMA) said its review will look at competition and consumer'),
 Document(metadata={'source': 'new_articles\\05-0

In [9]:
len(chunks)  # Number of chunks created by the splitter

407

In [10]:
# Inspect a single chunk (the second one) to see its metadata and text.
chunks[1]

Document(metadata={'source': 'new_articles\\05-04-cma-generative-ai-review.txt'}, page_content='of “AI foundational models”, such as the large language models (LLMs) which underpin OpenAI’s')

## Creating VectorDB -> ChromaDB

In [12]:
from langchain import embeddings  # NOTE(review): appears unused in this notebook — confirm and drop

# Directory handed to Chroma as its persist_directory.
persist_directory = "db"

# Pin the embedding model explicitly instead of relying on the library default.
# This is the model the no-argument constructor resolved to in the recorded
# output; naming it keeps a persisted DB and later queries on the same model
# even if the default changes.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Display the configured embedding object.
embedding

  embedding = HuggingFaceEmbeddings()
  embedding = HuggingFaceEmbeddings()





HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
# Embed every chunk with `embedding` and build a Chroma store that uses
# `persist_directory` as its storage location.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the DB to disk, then drop the in-memory reference; the next cell
# rebuilds `vectordb` from persist_directory.

vectordb.persist()
vectordb = None

In [None]:
# Re-open the store from persist_directory, supplying the same embedding
# object that was used to build it.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

## Make a Retriever

In [None]:
# Wrap the vector store in a retriever using its default search settings.
retriever = vectordb.as_retriever()

In [None]:
# Fetch the chunks the retriever considers relevant to a sample question.
docs = retriever.get_relevant_documents("How much money did Microsoft raise?")

# How many chunks came back with the default settings.
len(docs)

In [None]:
# Rebuild the retriever, passing k=2 through search_kwargs
# (presumably the number of chunks returned per query — verify against the length shown below).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
# Show which search strategy the retriever is configured with.
retriever.search_type

In [None]:
# Show the search keyword arguments currently set on the retriever.
retriever.search_kwargs

In [None]:
# Run the same sample question through the retriever that was rebuilt with k=2.
docs = retriever.get_relevant_documents("How much money did Microsoft raise?")

# Number of chunks returned this time.
len(docs)

## Make a Chain

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# RetrievalQA needs an LLM *instance*. Passing the bare HuggingFaceHub class
# (as before) fails when the chain is constructed. The token is passed
# explicitly because this notebook stores it under HUGGINGFACE_API.
# NOTE(review): repo_id and model_kwargs are a suggested starting point —
# confirm the model you actually want to query.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",
    huggingfacehub_api_token=os.getenv("HUGGINGFACE_API"),
    model_kwargs={"temperature": 0.1, "max_length": 256},
)

# "stuff" chain: the retrieved chunks are placed into a single prompt.
# return_source_documents=True keeps the retrieved chunks in the response
# under 'source_documents', which process_llm_response reads.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [None]:
# Question sent through the retrieval-QA chain (typo "mush" fixed so the text
# that is embedded and sent to the LLM matches the intended question).
query = "How much money did Microsoft raise?"

# Run retrieval and generation; the response is indexed by 'result' and
# 'source_documents' in process_llm_response.
llm_response = qa_chain(query)

llm_response

In [None]:
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source path of each cited document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer to print) and a
            'source_documents' entry (an iterable of objects whose
            ``metadata['source']`` is printed, one per line).
    """
    print(llm_response['result'])
    print('\n\nSources:')

    # Lazily pull the 'source' entry from each cited document and print it.
    source_paths = (cited.metadata['source'] for cited in llm_response['source_documents'])
    for path in source_paths:
        print(path)

In [None]:
# Pretty-print the answer and its source files for the response obtained above.
process_llm_response(llm_response)

## Deleting The DB

In [None]:
# Archive the db/ directory into db.zip before it is deleted below.
!zip -r db.zip ./db

In [None]:
# To clean up, delete the collection from the vector store and persist that change.
vectordb.delete_collection()
vectordb.persist()

# Remove the db/ directory from disk.
# NOTE(review): `rm -rf` is a POSIX shell command; the recorded outputs show
# Windows-style paths, so confirm this runs in the intended environment.
!rm -rf db/

## If You Want to Start Again and Reuse Your DB

In [None]:
# Extract db.zip (created in the "Deleting The DB" section) into the current directory.
!unzip db.zip