In [None]:
pip install chromadb

In [None]:
pip install langchain

In [None]:
pip install sentence_transformers

In [None]:
import os
import chromadb
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_transformers import (
    EmbeddingsRedundantFilter,
    EmbeddingsClusteringFilter,
)
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Get the Embedding Model

In [None]:
# Embedding configuration: the English BGE-large checkpoint, run on CPU,
# with embedding normalisation switched off.
model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

# Build the embedding wrapper that every vector store below shares.
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)
print("Embedding Model Loaded..........")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Embedding Model Loaded..........


## Data Preprocessing

In [None]:
from langchain_community.document_loaders import TextLoader
def process_audio_text(audio_filename, chunk_size=1000, chunk_overlap=100):
    """Load a text file and split it into overlapping chunks.

    Args:
        audio_filename: Path of the text file to load with ``TextLoader``
            (this notebook passes files such as ``"audio_1.txt"``).
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the splitter.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        The chunk documents returned by ``split_documents``.
    """
    # Load the file as LangChain documents.
    loaded_documents = TextLoader(audio_filename).load()

    # Split the loaded documents with the requested chunking parameters.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(loaded_documents)

In [None]:
# Transcript files to index: audio_1.txt ... audio_5.txt.
audio_files = [f"audio_{number}.txt" for number in range(1, 6)]

In [None]:
# Chunk every transcript and key the result by its 1-based position:
# "text_content_audio_1", "text_content_audio_2", ...
text_contents = {
    f"text_content_audio_{position}": process_audio_text(path)
    for position, path in enumerate(audio_files, start=1)
}

In [None]:
text_contents['text_content_audio_1'][0]

Document(page_content="So welcome to this first lecture on this course, DMML. So today I will try to set some context for the course and also tell you a little bit about start the first topic. So I have set up some information just a few minutes back on the moodle page to give you information about the list of topics, roughly that we plan to cover and also about the assessment and all that. So if you have any questions on that, you can look it up and maybe next time we can discuss that. So I won't spend too much time right now on the administrative aspects of the course. We'll just start looking at what we are going to do in this course. So if you look at the title of the course, it clearly says two things. It says data mining and machine learning. In a sense, this is a kind of historical thing. That's how this course was initially created some many years back. So if you want to look at the two parts of the title in some detail. So data mining is a loose term which talks about identify

## Create and Store Vectors

In [None]:
# Embed the chunks of transcript 1 with `hf` and store them in a Chroma
# collection configured for cosine distance, persisted under "audio1_chroma_cosine".
audio1_store = Chroma.from_documents(
    documents=text_contents["text_content_audio_1"],
    embedding=hf,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="audio1_chroma_cosine",
)

In [None]:
 !zip -r /content/audio1_chroma_cosine.zip /content/audio1_chroma_cosine

  adding: content/audio1_chroma_cosine/ (stored 0%)
  adding: content/audio1_chroma_cosine/23e64686-b861-4f1e-8411-c279494758b4/ (stored 0%)
  adding: content/audio1_chroma_cosine/23e64686-b861-4f1e-8411-c279494758b4/header.bin (deflated 61%)
  adding: content/audio1_chroma_cosine/23e64686-b861-4f1e-8411-c279494758b4/length.bin (deflated 37%)
  adding: content/audio1_chroma_cosine/23e64686-b861-4f1e-8411-c279494758b4/link_lists.bin (stored 0%)
  adding: content/audio1_chroma_cosine/23e64686-b861-4f1e-8411-c279494758b4/data_level0.bin (deflated 100%)
  adding: content/audio1_chroma_cosine/chroma.sqlite3 (deflated 53%)


In [None]:
# Embed the chunks of transcript 2 with `hf` and store them in a Chroma
# collection configured for cosine distance, persisted under "audio2_chroma_cosine".
audio2_store = Chroma.from_documents(
    documents=text_contents["text_content_audio_2"],
    embedding=hf,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="audio2_chroma_cosine",
)

In [None]:
 !zip -r /content/audio2_chroma_cosine.zip /content/audio2_chroma_cosine

  adding: content/audio2_chroma_cosine/ (stored 0%)
  adding: content/audio2_chroma_cosine/39fe4e3b-be9c-44a2-84d4-5c8f189c301d/ (stored 0%)
  adding: content/audio2_chroma_cosine/39fe4e3b-be9c-44a2-84d4-5c8f189c301d/header.bin (deflated 61%)
  adding: content/audio2_chroma_cosine/39fe4e3b-be9c-44a2-84d4-5c8f189c301d/length.bin (deflated 30%)
  adding: content/audio2_chroma_cosine/39fe4e3b-be9c-44a2-84d4-5c8f189c301d/link_lists.bin (stored 0%)
  adding: content/audio2_chroma_cosine/39fe4e3b-be9c-44a2-84d4-5c8f189c301d/data_level0.bin (deflated 7%)
  adding: content/audio2_chroma_cosine/chroma.sqlite3 (deflated 50%)


In [None]:
# Embed the chunks of transcript 3 with `hf` and store them in a Chroma
# collection configured for cosine distance, persisted under "audio3_chroma_cosine".
audio3_store = Chroma.from_documents(
    documents=text_contents["text_content_audio_3"],
    embedding=hf,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="audio3_chroma_cosine",
)

In [None]:
 !zip -r /content/audio3_chroma_cosine.zip /content/audio3_chroma_cosine

  adding: content/audio3_chroma_cosine/ (stored 0%)
  adding: content/audio3_chroma_cosine/2cd5c8b3-2d84-4649-8d4a-0448b568a700/ (stored 0%)
  adding: content/audio3_chroma_cosine/2cd5c8b3-2d84-4649-8d4a-0448b568a700/header.bin (deflated 61%)
  adding: content/audio3_chroma_cosine/2cd5c8b3-2d84-4649-8d4a-0448b568a700/length.bin (deflated 98%)
  adding: content/audio3_chroma_cosine/2cd5c8b3-2d84-4649-8d4a-0448b568a700/link_lists.bin (stored 0%)
  adding: content/audio3_chroma_cosine/2cd5c8b3-2d84-4649-8d4a-0448b568a700/data_level0.bin (deflated 8%)
  adding: content/audio3_chroma_cosine/chroma.sqlite3 (deflated 53%)


In [None]:
# Embed the chunks of transcript 4 with `hf` and store them in a Chroma
# collection configured for cosine distance, persisted under "audio4_chroma_cosine".
audio4_store = Chroma.from_documents(
    documents=text_contents["text_content_audio_4"],
    embedding=hf,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="audio4_chroma_cosine",
)

In [None]:
 !zip -r /content/audio4_chroma_cosine.zip /content/audio4_chroma_cosine

  adding: content/audio4_chroma_cosine/ (stored 0%)
  adding: content/audio4_chroma_cosine/9af56ffe-cdde-4811-853d-d366cdcb2eee/ (stored 0%)
  adding: content/audio4_chroma_cosine/9af56ffe-cdde-4811-853d-d366cdcb2eee/header.bin (deflated 61%)
  adding: content/audio4_chroma_cosine/9af56ffe-cdde-4811-853d-d366cdcb2eee/length.bin (deflated 32%)
  adding: content/audio4_chroma_cosine/9af56ffe-cdde-4811-853d-d366cdcb2eee/link_lists.bin (stored 0%)
  adding: content/audio4_chroma_cosine/9af56ffe-cdde-4811-853d-d366cdcb2eee/data_level0.bin (deflated 94%)
  adding: content/audio4_chroma_cosine/chroma.sqlite3 (deflated 50%)


In [None]:
# Embed the chunks of transcript 5 with `hf` and store them in a Chroma
# collection configured for cosine distance, persisted under "audio5_chroma_cosine".
audio5_store = Chroma.from_documents(
    documents=text_contents["text_content_audio_5"],
    embedding=hf,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="audio5_chroma_cosine",
)

In [None]:
 !zip -r /content/audio5_chroma_cosine.zip /content/audio5_chroma_cosine

  adding: content/audio5_chroma_cosine/ (stored 0%)
  adding: content/audio5_chroma_cosine/c048879c-0cc0-4984-8f6b-74bc56de0af7/ (stored 0%)
  adding: content/audio5_chroma_cosine/c048879c-0cc0-4984-8f6b-74bc56de0af7/header.bin (deflated 61%)
  adding: content/audio5_chroma_cosine/c048879c-0cc0-4984-8f6b-74bc56de0af7/length.bin (deflated 37%)
  adding: content/audio5_chroma_cosine/c048879c-0cc0-4984-8f6b-74bc56de0af7/link_lists.bin (stored 0%)
  adding: content/audio5_chroma_cosine/c048879c-0cc0-4984-8f6b-74bc56de0af7/data_level0.bin (deflated 17%)
  adding: content/audio5_chroma_cosine/chroma.sqlite3 (deflated 54%)


## Load Vector Store

In [None]:
# Open the store persisted under "audio1_chroma_cosine", using `hf` as its embedding function.
load_audio1_store = Chroma(
    persist_directory="audio1_chroma_cosine",
    embedding_function=hf,
)

In [None]:
# Open the store persisted under "audio2_chroma_cosine", using `hf` as its embedding function.
load_audio2_store = Chroma(
    persist_directory="audio2_chroma_cosine",
    embedding_function=hf,
)

In [None]:
# Open the store persisted under "audio3_chroma_cosine", using `hf` as its embedding function.
load_audio3_store = Chroma(
    persist_directory="audio3_chroma_cosine",
    embedding_function=hf,
)

In [None]:
# Open the store persisted under "audio4_chroma_cosine", using `hf` as its embedding function.
load_audio4_store = Chroma(
    persist_directory="audio4_chroma_cosine",
    embedding_function=hf,
)

In [None]:
# Open the store persisted under "audio5_chroma_cosine", using `hf` as its embedding function.
load_audio5_store = Chroma(
    persist_directory="audio5_chroma_cosine",
    embedding_function=hf,
)

## Init Merge Retriever

In [None]:
def _similarity_retriever(store):
    """Wrap a vector store in a similarity-search retriever with k=3."""
    return store.as_retriever(search_type="similarity", search_kwargs={"k": 3})


# One retriever per transcript store, all configured identically.
retriever_audio1 = _similarity_retriever(load_audio1_store)
retriever_audio2 = _similarity_retriever(load_audio2_store)
retriever_audio3 = _similarity_retriever(load_audio3_store)
retriever_audio4 = _similarity_retriever(load_audio4_store)
retriever_audio5 = _similarity_retriever(load_audio5_store)


In [None]:
# Combine the five per-transcript retrievers into a single merged retriever.
lotr = MergerRetriever(
    retrievers=[
        retriever_audio1,
        retriever_audio2,
        retriever_audio3,
        retriever_audio4,
        retriever_audio5,
    ]
)

In [None]:
lotr

MergerRetriever(retrievers=[VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c4853f65900>, search_kwargs={'k': 3}), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c4853f65c00>, search_kwargs={'k': 3}), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c4853f67130>, search_kwargs={'k': 3}), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c4853f67a00>, search_kwargs={'k': 3}), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c4853f673d0>, search_kwargs={'k': 3})])

## Perform Semantic Search

In [None]:
query=" What are some challenges associated with data collection?"

In [None]:
docs = lotr.get_relevant_documents(query)
docs

[Document(page_content="entered by somebody and then converted to electronic forms. So that would be, there would be two levels of potential sources for errors. The person writing down the information and then the person typing in the information. Now, gradually these kind of electronic forms are spilled in directly, so at least the source of the error is reduced to one step. But still, people mistype things. I mean, there are any number of situations where people type their email address wrong and so notifications don't reach them and so on. So there is this data collection. How do you collect the data and how do you clean it? And the third thing is, how do you make it uniform? So when data is being collected by different people, they may collect different things. And for instance, if you look at the government, typically the government collects data in different forms. For instance, there is a public distribution system which the ration shops, so they collect some information about w

In [None]:
from langchain_community.document_transformers import LongContextReorder

# Reorder the chunks retrieved for the current query with LongContextReorder.
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)

# Inspect the reordered list: the most relevant chunks should now sit at the beginning and end.
reordered_docs

[Document(page_content="entered by somebody and then converted to electronic forms. So that would be, there would be two levels of potential sources for errors. The person writing down the information and then the person typing in the information. Now, gradually these kind of electronic forms are spilled in directly, so at least the source of the error is reduced to one step. But still, people mistype things. I mean, there are any number of situations where people type their email address wrong and so notifications don't reach them and so on. So there is this data collection. How do you collect the data and how do you clean it? And the third thing is, how do you make it uniform? So when data is being collected by different people, they may collect different things. And for instance, if you look at the government, typically the government collects data in different forms. For instance, there is a public distribution system which the ration shops, so they collect some information about w

In [None]:
query_2=" What are the advantages of the Apriori algorithm?"


In [None]:
docs = lotr.get_relevant_documents(query_2)
docs

[Document(page_content="yet, first of all, a truly learning problem. It's more an algorithmic problem, but still, it's an interesting problem nonetheless, because I just want to illustrate how it affects the way in which we calculate what might be trivial with small data becomes nontrivial with large data. So this is the problem. So given a set of items, capital n, which is large, and a, given a set of transactions m, which is again large, and given these two ratios between zero and one, find every pair x and y such that x implies y is a valid association. So we can break up this thing into two steps. I mean, we want to first check whether x implies y is worth looking at at all. So we first look at the support part. We want to know whether x, sorry, yeah, we want to know whether the count divided by m is bigger than the support, which is the same as taking this m to the other side and saying whether the count is bigger than a certain fraction of the total. So the first idea is to ident

In [None]:
reordered_docs = reordering.transform_documents(docs)

# Inspect the reordered list: the most relevant chunks should now sit at the beginning and end.
reordered_docs

[Document(page_content="yet, first of all, a truly learning problem. It's more an algorithmic problem, but still, it's an interesting problem nonetheless, because I just want to illustrate how it affects the way in which we calculate what might be trivial with small data becomes nontrivial with large data. So this is the problem. So given a set of items, capital n, which is large, and a, given a set of transactions m, which is again large, and given these two ratios between zero and one, find every pair x and y such that x implies y is a valid association. So we can break up this thing into two steps. I mean, we want to first check whether x implies y is worth looking at at all. So we first look at the support part. We want to know whether x, sorry, yeah, we want to know whether the count divided by m is bigger than the support, which is the same as taking this m to the other side and saying whether the count is bigger than a certain fraction of the total. So the first idea is to ident

In [None]:
query_3=" What distinguishes training data in supervised learning, and what is its purpose? "

In [None]:
docs = lotr.get_relevant_documents(query_3)
docs

[Document(page_content="a specification, but generating a concrete program from a generic program you can think of a model template as. So I just give you a line here which says a Y equal to Mx plus c. And here I want a specific m zero and C zero, which fits the given data in the best possible way, which is also part of the description of the algorithm. What does it mean for one line to be better than another line? So that's part of it. So the other side of this picture, as I mentioned, is when you don't have this training data, you don't have any examples that people have already labeled, but you are looking for some kind of patterns. So there is no guidance. So the thing about supervised learning is somebody should have gone through and given you this information before. Now, this information could have been collected over time. Maybe it's not manually collected, like the school marks. The school marks over time. Everybody has. I mean, the school keeps a record of what happens in the

In [None]:
reordered_docs = reordering.transform_documents(docs)

# Inspect the reordered list: the most relevant chunks should now sit at the beginning and end.
reordered_docs

[Document(page_content="a specification, but generating a concrete program from a generic program you can think of a model template as. So I just give you a line here which says a Y equal to Mx plus c. And here I want a specific m zero and C zero, which fits the given data in the best possible way, which is also part of the description of the algorithm. What does it mean for one line to be better than another line? So that's part of it. So the other side of this picture, as I mentioned, is when you don't have this training data, you don't have any examples that people have already labeled, but you are looking for some kind of patterns. So there is no guidance. So the thing about supervised learning is somebody should have gone through and given you this information before. Now, this information could have been collected over time. Maybe it's not manually collected, like the school marks. The school marks over time. Everybody has. I mean, the school keeps a record of what happens in the

In [None]:
query_4=" What is cross-validation?"

In [None]:
docs = lotr.get_relevant_documents(query_4)
docs

[Document(page_content="But there may be more subtle things. Somebody might be talking about events, some kind of pop music events, and another person may be talking about some spiritual gatherings. And one person may be interested in pop music events, and another person may not be. And another person may be interested in spiritual gatherings, and the first person may not be. So then you can train these things. So in that sense, which words signal junk and which words don't signal junk is also a parameter of the model. Just like in this linear fit, the shape of the line is a parameter. So this is the learning part. So what we are going to do is look at different types of models, as I said, and then we will look at this parameter adjustment. How does the data actually determine the model, the concrete model? How do we build the best model that we can get for the given data? That's the algorithm. So we have a kind of model template. So we have a model template on this side, we have train

In [None]:
reordered_docs = reordering.transform_documents(docs)

# Inspect the reordered list: the most relevant chunks should now sit at the beginning and end.
reordered_docs

[Document(page_content="But there may be more subtle things. Somebody might be talking about events, some kind of pop music events, and another person may be talking about some spiritual gatherings. And one person may be interested in pop music events, and another person may not be. And another person may be interested in spiritual gatherings, and the first person may not be. So then you can train these things. So in that sense, which words signal junk and which words don't signal junk is also a parameter of the model. Just like in this linear fit, the shape of the line is a parameter. So this is the learning part. So what we are going to do is look at different types of models, as I said, and then we will look at this parameter adjustment. How does the data actually determine the model, the concrete model? How do we build the best model that we can get for the given data? That's the algorithm. So we have a kind of model template. So we have a model template on this side, we have train

In [None]:
query_5="What is the process of building the decision tree classifier, and how is it trained on the dataset?"

In [None]:
docs = lotr.get_relevant_documents(query_5)
docs

[Document(page_content="the order, sometimes we don't. Addresses, of course, are written in a million different ways. So there are all kinds of issues with just getting the data to a format where you can work on it. So this is an entirely different ballgame. It's much more kind of. There are lots of tools and techniques to deal with this, but that's not really going to be the focus of the course. So, in a sense, if you want to think about it, this data mining aspect will be almost missing from this course, even though it's part of the title. So what we are really going to look at is the machine learning aspect. So learning means you are trying to understand something that you don't know before. And machine learning suggests that it's done automatically. It's done by now machine as a computer. So there is an algorithm which learns something about, again, it's always with respect to data. So it's something about the data. So what we are trying to do is learn some kind of mathematical mod

In [None]:
reordered_docs = reordering.transform_documents(docs)

# Inspect the reordered list: the most relevant chunks should now sit at the beginning and end.
reordered_docs

[Document(page_content="the order, sometimes we don't. Addresses, of course, are written in a million different ways. So there are all kinds of issues with just getting the data to a format where you can work on it. So this is an entirely different ballgame. It's much more kind of. There are lots of tools and techniques to deal with this, but that's not really going to be the focus of the course. So, in a sense, if you want to think about it, this data mining aspect will be almost missing from this course, even though it's part of the title. So what we are really going to look at is the machine learning aspect. So learning means you are trying to understand something that you don't know before. And machine learning suggests that it's done automatically. It's done by now machine as a computer. So there is an algorithm which learns something about, again, it's always with respect to data. So it's something about the data. So what we are trying to do is learn some kind of mathematical mod

In [None]:
for chunks in lotr.get_relevant_documents("What is supervised Learning"):
    print(chunks.page_content)

are looking for patterns, but you don't really have a clear idea beforehand what patterns they are. So a typical example of this might be that a company which is selling something might want to know some information about the demographics of its customers. So what are the groupings? I mean, which age groups? What is the proportion of people who are, say, between 20 and 30 who buy their products, people above 50 who buy their products, and so on. So this kind of thing is unsupervised because you don't know what you're going to get. But after you get this information, maybe you can put it to good use. So these are broadly the two types of things that are there. There is a third type of machine learning called reinforcement learning, which we are not going to talk about at all in this course. So to look at supervised learning a little more detail, we will of course do it in much more detail in the lectures to come. So we are trying to extrapolate. So basically it's a prediction problem,
a

In [None]:
pip install cohere

Collecting cohere
  Downloading cohere-4.56-py3-none-any.whl (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting fastavro<2.0,>=1.8 (from cohere)
  Downloading fastavro-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fastavro, cohere
Successfully installed cohere-4.56 fastavro-1.9.4


In [None]:
import os
from getpass import getpass

import cohere

# Read the API key from the COHERE_API_KEY environment variable, or prompt for it,
# instead of embedding the secret in the notebook.
# NOTE(review): the key that used to be hard-coded in this cell has been exposed
# and should be revoked and replaced.
cohere_api_key = os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: ")
co = cohere.Client(cohere_api_key)

#### Query 1 Merge Retriever with Cohere Reranker

In [None]:
query_1=" What are some challenges associated with data collection?"

In [None]:
# Retrieve candidates for query_1 in this cell rather than reusing the global `docs`:
# on a top-to-bottom run `docs` holds the results of the last retrieval cell (query_5),
# so the reranker would otherwise be scoring the wrong candidates.
docs_query1 = lotr.get_relevant_documents(query_1)
retrieved_docs_query1 = [doc.page_content for doc in docs_query1]

In [None]:
# Rerank the query-1 candidates with Cohere.
# Change top_n to change the number of results returned; if top_n is not passed, all results are returned.
results = co.rerank(
    query=query_1,
    documents=retrieved_docs_query1,
    top_n=3,
    model="rerank-english-v2.0",
)
for rank, hit in enumerate(results, start=1):
    print(f"Document Rank: {rank}, Document Index: {hit.index}")
    print(f"Document: {hit.document['text']}")
    print(f"Relevance Score: {hit.relevance_score:.2f}")
    print("\n")

Document Rank: 1, Document Index: 0
Document: entered by somebody and then converted to electronic forms. So that would be, there would be two levels of potential sources for errors. The person writing down the information and then the person typing in the information. Now, gradually these kind of electronic forms are spilled in directly, so at least the source of the error is reduced to one step. But still, people mistype things. I mean, there are any number of situations where people type their email address wrong and so notifications don't reach them and so on. So there is this data collection. How do you collect the data and how do you clean it? And the third thing is, how do you make it uniform? So when data is being collected by different people, they may collect different things. And for instance, if you look at the government, typically the government collects data in different forms. For instance, there is a public distribution system which the ration shops, so they collect so

#### Query 2

In [None]:
query_2=" What are the advantages of the Apriori algorithm?"

In [None]:
# Retrieve candidates for query_2 in this cell rather than reusing the global `docs`:
# on a top-to-bottom run `docs` holds the results of the last retrieval cell (query_5),
# so the reranker would otherwise be scoring the wrong candidates.
docs_query2 = lotr.get_relevant_documents(query_2)
retrieved_docs_query2 = [doc.page_content for doc in docs_query2]

In [None]:
# Rerank the query-2 candidates with Cohere.
# Change top_n to change the number of results returned; if top_n is not passed, all results are returned.
results = co.rerank(
    query=query_2,
    documents=retrieved_docs_query2,
    top_n=3,
    model="rerank-english-v2.0",
)
for rank, hit in enumerate(results, start=1):
    print(f"Document Rank: {rank}, Document Index: {hit.index}")
    print(f"Document: {hit.document['text']}")
    print(f"Relevance Score: {hit.relevance_score:.2f}")
    print("\n")

Document Rank: 1, Document Index: 3
Document: against some unknown data. So, one other strategy where you don't want to, I mean, one of the other disadvantages of doing this is that maybe that there are some minority anomalies in this data, and the choice of test data that you have made might have hidden all these anomalies, so you never see them, and maybe it's important to see them. So there may be many situations, or maybe you just don't have enough data as a whole to build a good model by only looking at 80%. So another strategy is to systematically do this with different subsets. So what you're really asking at some higher level is that machine learning approach. So, remember, we have seen only decision trees, but we are going to see many models. So there are clearly many ways to build models. And the reason, whenever you see that there are many ways to do something, it's only because there is no guarantee that a given way is the best one. So more or less what you want to validate

#### Query 3

In [None]:
query_3=" What distinguishes training data in supervised learning, and what is its purpose? "

In [None]:
# Retrieve candidates for query_3 in this cell rather than reusing the global `docs`:
# on a top-to-bottom run `docs` holds the results of the last retrieval cell (query_5),
# so the reranker would otherwise be scoring the wrong candidates.
docs_query3 = lotr.get_relevant_documents(query_3)
retrieved_docs_query3 = [doc.page_content for doc in docs_query3]

In [None]:
# Rerank the query-3 candidates with Cohere.
# Change top_n to change the number of results returned; if top_n is not passed, all results are returned.
results = co.rerank(
    query=query_3,
    documents=retrieved_docs_query3,
    top_n=3,
    model="rerank-english-v2.0",
)
for rank, hit in enumerate(results, start=1):
    print(f"Document Rank: {rank}, Document Index: {hit.index}")
    print(f"Document: {hit.document['text']}")
    print(f"Relevance Score: {hit.relevance_score:.2f}")
    print("\n")

Document Rank: 1, Document Index: 2
Document: whatever, income, stuff like that. So each row in the table will be one item, and there will be a special column which indicates what category, depending on the classification problem that we are dealing with, what category it is. And the goal of supervised learning is to now construct, given a new row, a new item, a new combination of these attributes, which is not seen before, try to predict what would be the most appropriate category to assign. So these items which are given with the labels. So these are called labels. So these items are called training data. So we have labeled training data, which consists of some information which has been collected in the past, some historical information about items and their classification. And we want to find a model that generalizes the training data. So since normally this categorization corresponds to some kind of classification, this is typically called a classification problem. So classificati

#### Query 4

In [None]:
query_4=" What is cross-validation?"

In [None]:
# Retrieve candidates for query_4 in this cell rather than reusing the global `docs`:
# on a top-to-bottom run `docs` holds the results of the last retrieval cell (query_5),
# so the reranker would otherwise be scoring the wrong candidates.
docs_query4 = lotr.get_relevant_documents(query_4)
retrieved_docs_query4 = [doc.page_content for doc in docs_query4]

In [None]:
# Rerank the query-4 candidates with Cohere.
# Change top_n to change the number of results returned; if top_n is not passed, all results are returned.
results = co.rerank(
    query=query_4,
    documents=retrieved_docs_query4,
    top_n=3,
    model="rerank-english-v2.0",
)
for rank, hit in enumerate(results, start=1):
    print(f"Document Rank: {rank}, Document Index: {hit.index}")
    print(f"Document: {hit.document['text']}")
    print(f"Relevance Score: {hit.relevance_score:.2f}")
    print("\n")

Document Rank: 1, Document Index: 3
Document: distribution changes slightly, one attribute might get better entropy than another attribute. And you will choose that question first. So the shape of the tree might vary a lot between these, so you may not get the same. Okay, so the question about cross validation, cross validation. What I'm saying is that you do not make a hard and fast choice once and for all to split your data this way. So you're taking your data and you're saying, hey, first let me leave out this 20% and build one model. So I am building multiple models. I'm building m one, m two, m three, m four, and m five. Each of these models is built using 80% of the test data, but a different 80%. So each of them leaves out some 20%. So I'm building five different models and comparing what I see in all these five. So the first thing is that if this approach is good, then in general, these five models should behave in a similar way with respect to the outcome. So if they all behav

#### Query 5

In [None]:
query="What is the process of building the decision tree classifier, and how is it trained on the dataset?"


In [None]:
# `retrieved_docs` is not defined anywhere earlier in the notebook, so build the
# candidate list here from a fresh retrieval for `query`.
retrieved_docs = [doc.page_content for doc in lotr.get_relevant_documents(query)]

# Change top_n to change the number of results returned; if top_n is not passed, all results are returned.
results = co.rerank(
    query=query,
    documents=retrieved_docs,
    top_n=3,
    model='rerank-english-v2.0',
)
for idx, r in enumerate(results):
    print(f"Document Rank: {idx + 1}, Document Index: {r.index}")
    print(f"Document: {r.document['text']}")
    print(f"Relevance Score: {r.relevance_score:.2f}")
    print("\n")

Document Rank: 1, Document Index: 9
Document: and is therefore picking up some peculiarities which don't necessarily exist within. So one of the things we mentioned in passing was that we like short trees for two reasons. One is because they are easier to explain. The second thing, which I claim without any justification, is that they generalize better. So here, priority, they are saying, let us not construct deep trees. So whatever tree we construct, we are going to stop it when it reaches. So it's a two step process, right? So what you do is you first say what are the parameters for the decision tree classifier? So he says, I want a decision tree classifier to be set up with this random state 42 and which will not grow to more than depth two. And then I have to actually construct the classifier for a particular data set. So that's the next thing. So I use this fit function, right? In some sense this creates a decision tree object with certain operating parameters, and then you pass i