In [4]:
# Build a RAG
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
!pip install -U langchain-community

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



In [5]:
# Load the dataset
from datasets import load_dataset

dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"

dataset = load_dataset(dataset_name)
print(dataset.column_names)


{'train': ['instruction', 'context', 'response', 'category']}


In [6]:
data = dataset['train'][page_content_column]  
print(data[:2]) 

["Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", '']


In [7]:
class HuggingFaceDatasetLoader:
    def __init__(self, dataset_name, page_content_column):
        self.dataset_name = dataset_name
        self.page_content_column = page_content_column

    def load(self):
        dataset = load_dataset(self.dataset_name)
        
        data = dataset['train'][self.page_content_column]
        return data
    
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column) 
data = loader.load()
print(data[:2])


["Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", '']


In [8]:
# Split the documents
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

documents = [Document(page_content=doc) for doc in data]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

docs = text_splitter.split_documents(documents)
print(docs[0]) 

page_content='Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.'


In [9]:
# Embedding 
from langchain.embeddings import HuggingFaceEmbeddings
import tensorflow.keras as keras

modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device':'cpu'}
encode_kwargs = {'normalize_embeddings': False}

# Initialize HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
model_name=modelPath,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
)

In [10]:
# Test
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(query_result[:3])

[-0.038338541984558105, 0.12346471101045609, -0.02864299900829792]


In [11]:
# Create a vector store
from langchain.vectorstores import FAISS
db = FAISS.from_documents(docs, embeddings)
print("FAISS vector store created successfully.")

FAISS vector store created successfully.


In [12]:
# Prepare the LLM model
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from langchain import HuggingFacePipeline

# Load the tokenizer and question-answering model:
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

# Create a question-answering pipeline
model_name = "Intel/dynamic_tinybert"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)
Youtubeer = pipeline(
  "question-answering",
  model=model_name,
  tokenizer=tokenizer,
  return_tensors='pt'
)

# Create a Langchain pipeline wrapper
llm = HuggingFacePipeline(
  pipeline=Youtubeer,
  model_kwargs={"temperature": 0.7, "max_length": 512},
)

tokenizer_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Device set to use mps:0


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  llm = HuggingFacePipeline(


In [13]:
# Build the Retrieval QA Chain
from langchain.chains import RetrievalQA
retriever = db.as_retriever(search_kwargs={"k": 4}) 
# Optional: You can adjust k for number of documents retrieved

# Build the RetrievalQA chain
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)


In [20]:
# Test your RAG system
question = "What is cheesemaking?"
result = qa.run({"query": question})  # Pass the question in a dictionary format
print(result) 



ValueError: Context information is below. 
------------
The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow's milk is most commonly used worldwide. The cheesemaker's goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.

Some cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.
------------
Given the context information and not prior knowledge, answer the question: What is cheesemaking?
 argument needs to be of type (SquadExample, dict)