# Importing Libraries

In [89]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [159]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import SquadExample, pipeline

# Document Loading

In [92]:
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context" 

loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
data = loader.load()

data[:2]



[Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."'),
 Document(metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'}, page_content='""')]

# Document Transfomers

In [94]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000 , chunk_overlap = 150)

docs = text_splitter.split_documents(data)

In [95]:
docs[0]

Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."')

# Text Embedding

In [97]:
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device':'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name = modelPath,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

In [98]:
text = "This is a test doc"
query_res = embeddings.embed_query(text)
query_res[:3]

[-0.0014149884227663279, 0.05130577087402344, -0.0014118633698672056]

# Vector Stores

In [100]:
db = FAISS.from_documents(docs , embeddings)

In [101]:
question = "What is internet?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

of networking. An implementation of the layers for a particular application forms a protocol stack. From lowest to highest, the layers are the link layer, containing communication methods for data that remains within a single network segment (link); the internet layer, providing internetworking between independent networks; the transport layer, handling host-to-host communication; and the application layer, providing process-to-process data exchange for applications.\n\nThe technical standards underlying the Internet protocol suite and its constituent protocols are maintained by the Internet Engineering Task Force (IETF). The Internet protocol suite predates the OSI model, a more comprehensive reference framework for general networking systems."


# LLM Preparation

In [103]:
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

In [145]:
model_name = "Intel/dynamic_tinybert"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)
question_answerer = pipeline(
    task = "question-answering", 
    model=model_name, 
    tokenizer=tokenizer,
    return_tensors='pt'
)

llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

In [147]:
preds = question_answerer(
    question="What is the name of the repository?",
    context="The name of the repository is huggingface/transformers",
)

print(
    f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}"
)

score: 0.8342, start: 30, end: 54, answer: huggingface/transformers


# Retrievers

In [106]:
retrivers = db.as_retriever()

In [122]:
docs = retrivers.invoke("What is internet")
print(docs[0].page_content)

packet switching, a technology that sends a message in portions to its destination asynchronously without passing it through a centralized mainframe. A four-node network emerged on 5 December 1969, constituting the beginnings of the ARPANET, which by 1981 had grown to 213 nodes. ARPANET eventually merged with other networks to form the Internet. While Internet development was a focus of the Internet Engineering Task Force (IETF) who published a series of Request for Comments documents, other networking advancements occurred in industrial laboratories, such as the local area network (LAN) developments of Ethernet (1983) and Token Ring (1984)[citation needed].\n\nGrowth of transmission capacity\nThe effective capacity to exchange information worldwide through two-way telecommunication networks grew from 281 petabytes (pB) of optimally compressed information in 1986 to 471 pB in 1993 to 2.2 exabytes (eB) in 2000 to 65 eB in 2007. This is the informational equivalent of two newspaper


# Retrival QA Chain

In [125]:
retriever = db.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)

In [155]:
question = "Who is Thomas Jefferson?"
result = qa.run({"query": question})
print(result["result"])

ValueError: Context information is below. 
------------
"Thomas Jefferson (April 13, 1743 \u2013 July 4, 1826) was an American statesman, diplomat, lawyer, architect, philosopher, and Founding Father who served as the third president of the United States from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first United States secretary of state under George Washington and then the nation's second vice president under John Adams."
------------
Given the context information and not prior knowledge, answer the question: Who is Thomas Jefferson?
 argument needs to be of type (SquadExample, dict)

In [174]:
question = "Where is France?"
context = "France's capital city is Paris, known for its art, gastronomy, and culture."

squad_example = SquadExample(
    qas_id="0",
    question_text=question,
    context_text=context,
    title="France",
    answer_text=None,  # No answer text needed for the question-answering pipeline
    start_position_character=None  # No start position needed for the question-answering pipeline
)

# Initialize the question-answering pipeline
qa_pipeline = pipeline("question-answering")

result = qa_pipeline({
    'question': squad_example.question_text,
    'context': squad_example.context_text
})


# Print the result
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.9684469103813171, 'start': 25, 'end': 30, 'answer': 'Paris'}
