In [29]:
!pip install -q chromadb spacy sentence_transformers langchain_text_splitters

In [30]:
# Mount Google Drive at /content/drive; every data path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
from chromadb.utils import embedding_functions
import chromadb
import numpy as np
import pandas as pd
import json
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [32]:
# Parse AI.json from Drive into `data1`.
json_file = '/content/drive/MyDrive/AI_RAG/Data/AI.json'

with open(json_file, encoding='utf-8') as fp:
    data1 = json.loads(fp.read())

In [33]:
# Parse hands_on_machine_learning.json from Drive into `data2`.
json_file = '/content/drive/MyDrive/AI_RAG/Data/hands_on_machine_learning.json'

with open(json_file, encoding='utf-8') as fp:
    data2 = json.loads(fp.read())

In [34]:
# data1 is a mapping of parts; flatten it into one list of chapter texts.
texts1 = [
    chap['text']
    for section in data1.values()
    for chap in section['chapters']
]

In [35]:
# Display the first entry of texts1 to sanity-check the load.
texts1[0]

'1\nINTRODUCTION\nIn which we try to explain why we consider artiﬁcial intelligence to be a subject\nmost worthy of study, and in which we try to decide what exactly it is, this being a\ngood thing to decide before embarking.\nWe call ourselves Homo sapiens—man the wise—because our intelligence is so important\nINTELLIGENCE\nto us. For thousands of years, we have tried to understand how we think; that is, how a mere\nhandful of matter can perceive, understand, predict, and manipulate a world far larger and\nmore complicated than itself. The ﬁeld of artiﬁcial intelligence, or AI, goes further still: it\nARTIFICIAL\nINTELLIGENCE\nattempts not just to understand but also to build intelligent entities.\nAI is one of the newest ﬁelds in science and engineering. Work started in earnest soon\nafter World War II, and the name itself was coined in 1956. Along with molecular biology,\nAI is regularly cited as the “ﬁeld I would most like to be in” by scientists in other disciplines.\nA student in

In [36]:
# data2 is iterated directly as a sequence of parts; flatten it into one list of chapter texts.
texts2 = [
    chap['text']
    for section in data2
    for chap in section['chapters']
]

In [37]:
# Display the first entry of texts2 to sanity-check the load.
texts2[0]



In [38]:
# One combined list: every texts1 entry followed by every texts2 entry.
sentences = [*texts1, *texts2]

## Text Pre-processing

In [39]:
# spaCy pipeline used by preprocess_data() for tokenising and lemmatising.
nlp = spacy.load('en_core_web_sm')

In [40]:
def preprocess_data(text):
    """Reduce ``text`` to a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. A token is
    kept only if its text is purely alphabetic and its lowercased text is not in
    ``STOP_WORDS``; each kept token contributes its lowercased lemma.

    Args:
        text: Raw text to clean.

    Returns:
        The kept lemmas joined by single spaces (empty string if none survive).
    """
    lemmas = [
        token.lemma_.lower()
        for token in nlp(text)
        if token.text.isalpha() and token.text.lower() not in STOP_WORDS
    ]
    return ' '.join(lemmas)

In [41]:
# sentences1=[preprocess_data(text) for text in sentences]

In [42]:
# sentences[0]

In [43]:
# output_file = '/content/drive/MyDrive/AI_RAG/Vector Database/AI_data.json'

# with open(output_file, 'w', encoding='utf-8') as f:
#     json.dump(sentences1, f, ensure_ascii=False, indent=4)

# print(f"Preprocessed data successfully saved to {output_file}")

In [44]:
# Reload the list saved as AI_data.json into `sentences1`.
json_file = '/content/drive/MyDrive/AI_RAG/Vector Database/AI_data.json'

with open(json_file, encoding='utf-8') as fp:
    sentences1 = json.loads(fp.read())

In [45]:
# sentences1[:2]

In [46]:
# Model name passed to the SentenceTransformer embedding function below.
EMBED_MODEL = "all-MiniLM-L6-v2"

# client = chromadb.Client()
# Use a client rooted at a Drive directory instead of the plain Client() above.
client = chromadb.PersistentClient(path="/content/drive/MyDrive/AI_RAG/Vector Database/vectordata1")



In [47]:
# Both collections share one embedding function built from EMBED_MODEL.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL,
)

# Collection for the AI book chunks; the index is configured for cosine space.
collection1 = client.get_or_create_collection(
    name='AI_data',
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)

# Collection for the machine-learning book chunks, configured the same way.
collection2 = client.get_or_create_collection(
    name='Machine-learning_data',
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)

In [48]:
# Confirm the combined corpus is a plain list.
type(sentences)

list

In [49]:
# Splitter producing chunks of up to 1000 characters with a 250-character overlap.
text_splitter1 = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)

# Chunk every texts1 entry and collect all chunks in one flat list.
all_splits1 = []
for chapter_text in texts1:
    all_splits1 += text_splitter1.split_text(chapter_text)


In [50]:
# Splitter for the texts2 corpus: chunks of up to 1000 characters, 250 overlap.
text_splitter2 = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)

# Flat list of every chunk produced from texts2.
all_splits2 = []

for sentence in texts2:
    # Use this cell's own splitter so that changes to text_splitter2's settings
    # take effect and the cell does not rely on text_splitter1 existing.
    splits = text_splitter2.split_text(sentence)
    all_splits2.extend(splits)

In [51]:
# Display the first chunk from each corpus side by side.
all_splits1[0], all_splits2[0]

('1\nINTRODUCTION\nIn which we try to explain why we consider artiﬁcial intelligence to be a subject\nmost worthy of study, and in which we try to decide what exactly it is, this being a\ngood thing to decide before embarking.\nWe call ourselves Homo sapiens—man the wise—because our intelligence is so important\nINTELLIGENCE\nto us. For thousands of years, we have tried to understand how we think; that is, how a mere\nhandful of matter can perceive, understand, predict, and manipulate a world far larger and\nmore complicated than itself. The ﬁeld of artiﬁcial intelligence, or AI, goes further still: it\nARTIFICIAL\nINTELLIGENCE\nattempts not just to understand but also to build intelligent entities.\nAI is one of the newest ﬁelds in science and engineering. Work started in earnest soon\nafter World War II, and the name itself was coined in 1956. Along with molecular biology,\nAI is regularly cited as the “ﬁeld I would most like to be in” by scientists in other disciplines.',
 'Chapter 

In [52]:
# ids1=[str(x) for x in range(len(sentences1))]
# Positional ids ("id_0", "id_1", ...), one per chunk; each collection has its own series.
ids1 = ["id_{}".format(i) for i, _ in enumerate(all_splits1)]
ids2 = ["id_{}".format(i) for i, _ in enumerate(all_splits2)]

In [53]:
def _upsert_in_batches(collection, documents, ids, batch_size=1000):
    """Upsert ``documents`` into ``collection`` in slices of at most ``batch_size``.

    Chroma rejects a single add/upsert call that exceeds the client's maximum
    batch size, so a book-sized corpus is sent in bounded slices instead of one
    call. The default of 1000 is a conservative choice, not a value read from
    the client.

    Args:
        collection: Chroma collection exposing ``upsert(documents=..., ids=...)``.
        documents: List of chunk strings.
        ids: List of ids, positionally aligned with ``documents``.
        batch_size: Maximum number of items per upsert call.

    Raises:
        ValueError: If ``documents`` and ``ids`` differ in length.
    """
    if len(documents) != len(ids):
        raise ValueError(
            f"documents ({len(documents)}) and ids ({len(ids)}) must have the same length"
        )
    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        collection.upsert(documents=documents[start:stop], ids=ids[start:stop])


_upsert_in_batches(collection1, all_splits1, ids1)
_upsert_in_batches(collection2, all_splits2, ids2)

In [54]:
# Sample question used to smoke-test retrieval.
query='What is Artificial Intelligence?'

In [57]:
def chroma_query(query, n_results=1):
    """Query both Chroma collections with the same text.

    The query is passed through unmodified (no lowercasing and no
    ``preprocess_data``): the collections are populated from the raw chunk
    text, so the query is embedded in the same raw form.

    Args:
        query: Query text forwarded as ``query_texts`` to each collection.
        n_results: Number of nearest chunks to request from each collection.
            Defaults to 1, matching the previous hard-coded value.

    Returns:
        A 2-tuple ``(result1, result2)`` holding the query result from
        ``collection1`` and ``collection2`` respectively; each result includes
        the matched documents and their distances.
    """
    return tuple(
        collection.query(
            query_texts=query,
            n_results=n_results,
            include=["documents", "distances"],
        )
        for collection in (collection1, collection2)
    )

In [58]:
# Run the sample query against both collections and display the raw results.
result=chroma_query(query)
result

({'ids': [['id_0']],
  'distances': [[0.3643718957901001]],
  'metadatas': None,
  'embeddings': None,
  'documents': [['1\nINTRODUCTION\nIn which we try to explain why we consider artiﬁcial intelligence to be a subject\nmost worthy of study, and in which we try to decide what exactly it is, this being a\ngood thing to decide before embarking.\nWe call ourselves Homo sapiens—man the wise—because our intelligence is so important\nINTELLIGENCE\nto us. For thousands of years, we have tried to understand how we think; that is, how a mere\nhandful of matter can perceive, understand, predict, and manipulate a world far larger and\nmore complicated than itself. The ﬁeld of artiﬁcial intelligence, or AI, goes further still: it\nARTIFICIAL\nINTELLIGENCE\nattempts not just to understand but also to build intelligent entities.\nAI is one of the newest ﬁelds in science and engineering. Work started in earnest soon\nafter World War II, and the name itself was coined in 1956. Along with molecular bi