In [11]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu

In [94]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveJsonSplitter
import json
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document


# Test on an other text

In [40]:
## Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first 15 entries
data[:2]



Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

[Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}),
 Document(page_content='""', metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'})]

In [43]:
# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the text you want to split, split the text into documents using the text splitter.
docs = text_splitter.split_documents(data)
docs[0]

Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'})

In [44]:
# Define the path to the pre-trained model you want to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result[:3]



[-0.03833853825926781, 0.12346469610929489, -0.02864299900829792]

In [45]:
db = FAISS.from_documents(docs, embeddings)

In [48]:
question = "What is cheesemaking?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

"The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow's milk is most commonly used worldwide. The cheesemaker's goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.\n\nSome cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.\n\nCulturing\nCheese is made by bringing milk (possibly pasteurised) in the cheese vat to a temperature required to promote the growth of the bacteria that feed on lactose and thus ferment the lactose into lactic acid. These bacteria in the milk may be wild, as is the case with unpasteurised milk, added from a culture,


In [49]:
# Create a tokenizer object by loading the pretrained "Intel/dynamic_tinybert" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")

# Create a question-answering model object by loading the pretrained "Intel/dynamic_tinybert" model.
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

# Specify the model name you want to use
model_name = "Intel/dynamic_tinybert"

# Load the tokenizer associated with the specified model
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

# Define a question-answering pipeline using the model and tokenizer
question_answerer = pipeline(
    "question-answering", 
    model=model_name, 
    tokenizer=tokenizer,
    return_tensors='pt'
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

# Create a retriever object from the 'db' using the 'as_retriever' method.
# This retriever is likely used for retrieving data or documents from the database.
retriever = db.as_retriever()
docs = retriever.get_relevant_documents("What is Cheesemaking?")
print(docs[0].page_content)

"The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow's milk is most commonly used worldwide. The cheesemaker's goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.\n\nSome cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.\n\nCulturing\nCheese is made by bringing milk (possibly pasteurised) in the cheese vat to a temperature required to promote the growth of the bacteria that feed on lactose and thus ferment the lactose into lactic acid. These bacteria in the milk may be wild, as is the case with unpasteurised milk, added from a culture,


In [51]:
# Create a retriever object from the 'db' with a search configuration where it retrieves up to 4 relevant splits/documents.
retriever = db.as_retriever(search_kwargs={"k": 4})

# Create a question-answering instance (qa) using the RetrievalQA class.
# It's configured with a language model (llm), a chain type "refine," the retriever we created, and an option to not return source documents.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)

In [52]:
question = "Who is Thomas Jefferson?"
result = qa.run({"query": question})
print(result["result"])

ValueError: Context information is below. 
------------
"Thomas Jefferson (April 13, 1743 \u2013 July 4, 1826) was an American statesman, diplomat, lawyer, architect, philosopher, and Founding Father who served as the third president of the United States from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first United States secretary of state under George Washington and then the nation's second vice president under John Adams."
------------
Given the context information and not prior knowledge, answer the question: Who is Thomas Jefferson?
 argument needs to be of type (SquadExample, dict)

# Test on the Q&A_format.md

In [54]:
# Create a loader instance
loader = HuggingFaceDatasetLoader('Q&A_format.md')

# Load the data
with open('Q&A_format.md', 'r', encoding='utf-8') as file:
    data = file.read()

# Display the first 100 entries
data[:100]

'Question: What is a Reservoir Computing architecture?\nAnswer: A Reservoir Computing (RC) architectur'

In [99]:
# Load the data from the file
with open('Q&A_format.md', 'r', encoding='utf-8') as file:
    data = file.read()

# Manually split the document based on headers
questions_answers = data.split("Question: ")

# Remove empty first element if it exists
if questions_answers[0].strip() == "":
    questions_answers = questions_answers[1:]

# Further split each Q&A pair into question and answer
split_docs = []
for qa in questions_answers:
    parts = qa.split("Answer: ")
    if len(parts) == 2:
        question = parts[0].strip()
        answer = parts[1].strip()
        split_docs.append({"Question": question, "Answer": answer})

# Display the first few splits to verify
print(split_docs[:5])

[{'Question': 'What is a Reservoir Computing architecture?', 'Answer': 'A Reservoir Computing (RC) architecture is a type of recurrent neural network (RNN) where the recurrent layer, called the reservoir, consists of randomly and recurrently connected neurons. This reservoir projects input data into a high-dimensional space to encode temporal information. The only part of the network that is trained is the output layer, called the readout, typically using simple linear regression.'}, {'Question': 'What is an Echo State Network?', 'Answer': 'An Echo State Networks (ESNs) are a type of recurrent neural network architecture in reservoir computing (RC). They consist of two main components: **Feedback Connections**: Optionally, readout activations can be fed back to the reservoir to stabilize neuron activities. **Implementation**: Connections are stored as NumPy arrays or SciPy sparse matrices.'}, {'Question': 'What is a Feature?', 'Answer': 'A feature is an attribute associated with an inp

In [83]:
# Define the path to the pre-trained model you want to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device': 'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)



In [100]:
query_result = embeddings.embed_query(data)
query_result[:3]

[-0.06041721999645233, -0.057492200285196304, -0.028663907200098038]

In [102]:
# Prepare the documents in the required format for FAISS
documents = []
for item in split_docs:
    documents.append(Document(page_content=f"Question: {item['Question']}\nAnswer: {item['Answer']}"))

# Create FAISS index from documents
db = FAISS.from_documents(documents, embeddings)

In [134]:
question = "What is classification?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

Question: What is a classification task?
Answer: A classification task involves assigning input data to one of several predefined categories or classes. The goal is to predict the category to which new data points belong, based on the training data. Examples include identifying email as spam or not spam, classifying images of animals, or recognizing spoken words.


In [139]:
question = "What is classification?"
searchDocs = db.similarity_search_with_score(question)
list = []
for i in range(len(searchDocs)):
    if i > 5:
        break
    else:
        # Here the returned distance score is L2 distance. Therefore, a lower score is better. (https://python.langchain.com/v0.2/docs/integrations/vectorstores/faiss/)
        print("Similarity = ", searchDocs[i][1],"/", searchDocs[i][0])

Similarity =  0.62285554 / page_content='Question: What is a classification task?\nAnswer: A classification task involves assigning input data to one of several predefined categories or classes. The goal is to predict the category to which new data points belong, based on the training data. Examples include identifying email as spam or not spam, classifying images of animals, or recognizing spoken words.'
Similarity =  0.9217788 / page_content="Question: What is supervised learning?\nAnswer: This is when models learn form data that's already been labeled. It's like having an answer key."
Similarity =  0.94594586 / page_content="Question: What is the difference betwwen regression and classification model?\nAnswer: Regression models predict a continuous variable, such as rainfall amount or sunlight intensity. They can also predict probabilities, such as the probability that an image contains a cat. A probability-predicting regression model can be used as part of a classifier by imposing 

### Time to prepare a LLM Model
#### We will use here Intel/dynamic_tinybert which is a fine-tuned model for the purpose of question-answering.

In [140]:
# Create a tokenizer object by loading the pretrained "Intel/dynamic_tinybert" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")

# Create a question-answering model object by loading the pretrained "Intel/dynamic_tinybert" model.
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

In [142]:
# Specify the model name you want to use
model_name = "Intel/dynamic_tinybert"

# Load the tokenizer associated with the specified model
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

# Define a question-answering pipeline using the model and tokenizer
question_answerer = pipeline(
    "question-answering", 
    model=model_name, 
    tokenizer=tokenizer,
    return_tensors='pt'
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

In [143]:
# Create a retriever object from the 'db' using the 'as_retriever' method.
# This retriever is used for retrieving data or documents from the database.
retriever = db.as_retriever()

In [146]:
docs = retriever.get_relevant_documents("What is classification?")
print(docs[0].page_content)

Question: What is a classification task?
Answer: A classification task involves assigning input data to one of several predefined categories or classes. The goal is to predict the category to which new data points belong, based on the training data. Examples include identifying email as spam or not spam, classifying images of animals, or recognizing spoken words.
