# **Installing Dependencies**

In [None]:
!pip install langchain langchain-community transformers sentence-transformers
!pip install auto-gptq optim accelerate faiss-gpu

Collecting langchain
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain-core<0.4.0,>=0.3.12 (from langchain)
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.137-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<

# **Importing Libraries**

In [None]:
#Processing
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

#Importing Knowledge Base
from langchain.document_loaders import GitbookLoader
import nest_asyncio
import asyncio

from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from uuid import uuid4
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import optimum
import auto_gptq
from transformers import AutoModelForCausalLM,AutoTokenizer, pipeline
import accelerate

# **Loading Knowledge Base**

In [None]:
# Patch asyncio before loading (presumably needed because the loader fetches pages
# asynchronously inside the notebook's already-running event loop — confirm).
nest_asyncio.apply()
# load_all_paths=True asks the loader for every page of the site, not only the root URL.
loader = GitbookLoader("https://docs.gitbook.com", load_all_paths=True)
# List with one entry per fetched page.
all_pages_data = loader.load()

  k = self.parse_starttag(i)
Fetching pages: 100%|##########| 112/112 [01:22<00:00,  1.35it/s]


In [None]:
# Sanity check: report how many pages the loader returned.
print(f"fetched {len(all_pages_data)} documents.")

fetched 112 documents.


# **Loading the Llama 2 (GPTQ) Tokenizer**

In [None]:
# Hugging Face repo id of the checkpoint; only its tokenizer is loaded in this cell.
model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
# NOTE(review): not referenced by any other cell visible in this notebook — confirm before removing.
model_basename = "gptq_model-4bit-128g"

# The tokenizer is used below to measure text length in tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
# Interactive check of the container type returned by the loader.
type(all_pages_data)

list

In [None]:
# Number of tokenizer tokens in each loaded page, in page order.
token_counts = list(
    len(tokenizer(page.page_content, return_tensors='pt').input_ids[0])
    for page in all_pages_data
)

In [None]:
# Smallest, mean (truncated to an int) and largest per-page token count.
print(f"Min: {min(token_counts)}")
print(f"Avg: {int(sum(token_counts) / len(token_counts))}")
print(f"Max: {max(token_counts)}")

Min: 11
Avg: 238
Max: 1418


In [None]:
def tokenizer_len(text):
  """Return how many tokens the notebook-level `tokenizer` produces for `text`."""
  encoded = tokenizer(text, return_tensors='pt')
  first_sequence_ids = encoded.input_ids[0]
  return len(first_sequence_ids)

In [None]:
# Concatenate the text of every page into one newline-separated string.
doc_text = "\n".join([page.page_content for page in all_pages_data])

In [None]:
# Splitter whose chunk length is measured in tokens (via tokenizer_len), not characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 10,   # amount shared between consecutive chunks, in length_function units
    length_function=tokenizer_len,
    separators=['\n\n', '\n', ' ', '']
)

# Split the concatenated string into plain-string chunks.
# NOTE(review): the following cell rebinds `chunks` with the result of split_documents.
chunks = text_splitter.split_text(doc_text)

In [None]:
# Same token-measured splitter configuration as the previous cell.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 10,   # amount shared between consecutive chunks, in length_function units
    length_function=tokenizer_len,
    separators=['\n\n', '\n', ' ', '']
)

# Split page by page; this rebinds `chunks` from the previous cell's result.
chunks = text_splitter.split_documents(all_pages_data)

In [None]:
def clean_text(text):
    """Discard chunks that contain site-navigation boilerplate.

    Args:
        text: A plain string, or an object that exposes its text through a
            ``page_content`` attribute (the chunks produced by
            ``split_documents``).

    Returns:
        The input object unchanged when none of the excluded phrases occur
        in its text, otherwise ``None``.
    """
    # Phrases that mark navigation/header chunks rather than documentation.
    exclude_patterns = ["Homepage", "Community", "Pricing", "Blog", "Developer Documentation", "Ask or Search"]

    # The substring test must run on the chunk's text: testing against the
    # chunk object itself does not look inside `page_content`, which left the
    # filter without effect for document chunks.
    content = getattr(text, "page_content", text) or ""

    if any(pattern in content for pattern in exclude_patterns):
        return None  # Discard this chunk
    return text

In [None]:
# Keep only the chunks that clean_text does not reject (it returns its input or None).
clean_chunks = [chunk for chunk in chunks if clean_text(chunk) is not None]

In [None]:
def post_process_chunk(chunk):
    """Normalise whitespace in a text chunk and cut it at the page footer.

    Args:
        chunk (str): Raw chunk text.

    Returns:
        str: The text with U+2006 replaced by a space, whitespace runs
        collapsed to one space, everything from "Last updated" onwards
        removed, and surrounding whitespace stripped.
    """
    # Applied in order: the footer rule runs after the text is on one line.
    substitutions = (
        (r'\u2006', ' '),
        (r'\s+', ' '),
        (r'Last updated.*', ''),
    )
    for pattern, replacement in substitutions:
        chunk = re.sub(pattern, replacement, chunk)

    return chunk.strip()

In [None]:
!pip install chromadb
import re
from chromadb.api.types import Document

def post_process_chunk(chunk):
    """
    Processes a chunk of text to remove unwanted characters and patterns.

    Args:
        chunk: An object exposing its text through a ``page_content``
            attribute, or the text itself as ``str`` / ``bytes``.

    Returns:
        str: The processed text, or None when the chunk has no
        ``page_content`` attribute or its content is not text.
    """
    if isinstance(chunk, (str, bytes)):
        # Accept raw text as well, like the string-based variants in this notebook.
        chunk_text = chunk
    else:
        try:
            chunk_text = chunk.page_content
        except AttributeError:  # chunk doesn't have 'page_content'
            print(f"Error: Input chunk doesn't have a 'page_content' attribute: {chunk}")
            return None

    if isinstance(chunk_text, bytes):
        # The str patterns below cannot be applied to bytes (re raises TypeError),
        # so decode first instead of crashing.
        chunk_text = chunk_text.decode("utf-8", errors="replace")
    if not isinstance(chunk_text, str):
        return None

    chunk_text = re.sub(r'\u2006', ' ', chunk_text)  # U+2006 -> regular space
    chunk_text = re.sub(r'\s+', ' ', chunk_text)  # Collapse runs of spaces/newlines
    chunk_text = re.sub(r'Last updated.*', '', chunk_text)  # Drop the page footer onwards

    return chunk_text.strip()

Collecting chromadb
  Downloading chromadb-0.5.15-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.3-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.27.0-py3

In [None]:
# Run the text normaliser over every chunk that survived filtering.
processed_chunks = list(map(post_process_chunk, clean_chunks))

In [None]:
# Merge every processed chunk into a single space-separated string.
document = " ".join(processed_chunks)

Welcome GitBook is a platform for capturing and documenting technical knowledge — from product docs, to internal knowledge bases and APIs. Overview Edit pages, collections, content and more. Import Find out how to easily migrate your existing documentation — and which formats GitBook supports. There are two methods for importing content into GitBook: Using our import tool Using Git Sync Using our import tool You can migrate and unify existing documentation in GitBook using the import tool. You have the option to import single or multiple pages — although some limits apply, which we’ll explain below. Permissions Only users with editor permissions or higher can edit pages. Supported import formats GitBook supports imports from websites or files in the following formats: Markdown (.md or .markdown) HTML (.html) Microsoft Word (.docx) We also support imports from: Confluence Notion GitHub Wiki Quip Dropbox Paper Google Docs If you want to import multiple pages , you can upload a ZIP file c

In [None]:
# Interactive check of the merged text's type.
type(document)

str

In [None]:
# Embedding model wrapper passed to the vector stores built below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



In [None]:
# `Chroma` was not imported anywhere in the notebook, so this cell raised NameError.
from langchain.vectorstores import Chroma

# Index the document chunks with the embedding model defined above.
vectorstore = Chroma.from_documents(documents= chunks,
                                    embedding=embeddings)

In [None]:
# Retriever view of the Chroma store; k=3 is passed through as a search keyword argument.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
# `hub` was not imported anywhere in the notebook, so this cell raised NameError.
from langchain import hub

# Fetch the shared "rlm/rag-prompt" template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

In [None]:
# Re-create the embedding wrapper and build a FAISS index from the processed chunk strings.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(processed_chunks, embeddings)



In [None]:
# Read the user's question from stdin.
query = input('The chatbot will assist you with your queries. ')

# Embed the question with the same model wrapper passed to the index at build time.
query_embedding = embeddings.embed_query(query)

# Look up the 3 stored chunks closest to the query vector.
retrieved_docs = vector_store.similarity_search_by_vector(query_embedding, k=3)

# Show each retrieved document.
for doc in retrieved_docs:
    print(doc)

The chatbot will assist you with your queries. What is a Gitbook?
page_content='Welcome GitBook is a platform for capturing and documenting technical knowledge — from product docs, to internal knowledge bases and APIs. We want to help teams to work more efficiently with a simple but powerful platform that helps them share their knowledge . Our mission is to make a user-friendly and collaborative product for everyone to create, edit and share knowledge through documentation. Discover GitBook GitBook Product Demo GitBook Product Demo'
page_content='here. Security as a company value To find more information about how GitBook handles security head over to our Security FAQ.'
page_content='Only administrators can access an organization’s billing settings. What plans does GitBook offer? Plus Ideal for small teams that want to collaborate and document publicly. Pro Ideal for multi-role teams who want advanced publishing and collaboration options. Enterprise Ideal for teams of 20+ with more sec

In [None]:
# Load the causal language model used for answer generation.
model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map='auto',
                                             trust_remote_code=False,
                                             revision="main")

# Tokenizer for the same checkpoint (rebinds the `tokenizer` created earlier in the notebook).
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                          use_fast=True)

config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

Some weights of the model checkpoint at TheBloke/Llama-2-7b-Chat-GPTQ were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11.mlp.gate_p

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]



In [None]:
# Step 1: Query input
def get_query_input():
    """Prompt on stdin with "Enter your query: " and return what the user typed."""
    user_query = input("Enter your query: ")
    return user_query

# Step 2: Search for relevant information from FAISS vector store
def search_faiss_vectorstore(query, vector_store, embeddings, k=3):
    """Return the stored documents most similar to `query`, with their scores.

    Args:
        query (str): The user's question.
        vector_store: FAISS vector store to search.
        embeddings: Embedding model exposing ``embed_query``; it must be the
            model the store's vectors were built with.
        k (int): Number of results to return (default 3).

    Returns:
        list: ``(document, score)`` pairs as returned by the store.
    """
    # Embed once and search with that vector. Previously the embedding was
    # computed, discarded, and the store embedded the query a second time.
    query_embedding = embeddings.embed_query(query)
    return vector_store.similarity_search_with_score_by_vector(query_embedding, k=k)

# Step 3: Use a language model to generate a response based on the retrieved info
def generate_response(retrieved_docs, generator=None, max_new_tokens=200):
    """Generate text from the concatenated content of the retrieved documents.

    Args:
        retrieved_docs (list): ``(document, score)`` pairs; each document
            exposes its text as ``page_content``.
        generator: Optional text-generation callable. Pass a pre-built
            pipeline to avoid reloading the model on every call; when
            omitted, a GPT-2 text-generation pipeline is created.
        max_new_tokens (int): Upper bound on the number of generated tokens.

    Returns:
        str: The ``generated_text`` of the first returned sequence, or an
        empty string when `retrieved_docs` is empty.
    """
    if not retrieved_docs:
        return ""

    # Concatenate all retrieved documents into one prompt.
    context = " ".join([doc[0].page_content for doc in retrieved_docs])

    if generator is None:
        generator = pipeline("text-generation", model="gpt2")  # or use any other text-generation model

    # Bound the *new* tokens: `max_length` also counts the prompt, and the
    # concatenated context can already be longer than 200 tokens.
    response = generator(context, max_new_tokens=max_new_tokens, num_return_sequences=1)
    return response[0]['generated_text']

# Main function to tie everything together
def query_knowledge_base():
    """Ask for a query, retrieve matching chunks and print a generated answer.

    Uses the notebook-level `vector_store` and `embeddings`. Prints
    "No relevant information found." when the search returns nothing.
    """
    user_query = get_query_input()
    matches = search_faiss_vectorstore(user_query, vector_store, embeddings)

    if not matches:
        print("No relevant information found.")
        return

    print("Generated Response: ", generate_response(matches))

# Run one interactive question/answer round (prompts on stdin).
query_knowledge_base()

In [None]:
query = input('The chatbot will assist you with your queries. ')

# `similarity_search` expects the query *text*; handing it the embedding list
# caused "'list' object has no attribute 'replace'". Search by vector instead.
query_embedding = embeddings.embed_query(query)
context = vector_store.similarity_search_by_vector(
    query_embedding,
    k=3)

# Retrieved chunks, separated by blank lines, form the prompt context.
cleaned_context=''
for i in context:
  cleaned_context+=i.page_content+'\n\n'

prompt_template=f'''
Please answer the following question with respect to the below context.

Question: {query}

Context: {cleaned_context}
'''

print("\n\n*** Response:")

# The model is placed by device_map='auto'; the input ids must be on the same device.
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(model.device)
output = model.generate(inputs=input_ids,
                        temperature=0.7,
                        do_sample=True,
                        top_p=0.95,
                        top_k=40,
                        max_new_tokens=500)

print(tokenizer.decode(output[0],skip_special_tokens=True))

The chatbot will assist you with your queries. What is a Gitbook?


AttributeError: 'list' object has no attribute 'replace'

In [None]:
# `hashlib` was never imported and `hasher` is only created further down the notebook.
import hashlib

# One record per chunk: id, chunk text and the URL of the page it came from.
documents = []
for page in tqdm(all_pages_data):
  url = page.metadata['source']
  # Fresh hash per URL: a single hasher updated in the loop accumulates every
  # earlier URL, so ids would depend on page order.
  hasher = hashlib.md5(url.encode('utf-8'))
  uid = hasher.hexdigest()[:12]
  # Use a dedicated name so the notebook-level `chunks` is not overwritten.
  page_chunks = text_splitter.split_text(page.page_content)
  for i, chunk in enumerate(page_chunks):
    documents.append({
        'id':f'{uid}-{i}',
        'text': chunk,
        'source': url
    })

  0%|          | 0/108 [00:00<?, ?it/s]

In [None]:
# One row per chunk record, with columns id, text and source.
documents_df  = pd.DataFrame.from_records(documents)
documents_df

Unnamed: 0,id,text,source
0,6f6cda09b14c-0,Welcome\nGitBook is a platform for capturing a...,https://docs.gitbook.com/
1,51d82ce91626-0,"Overview\nEdit pages, collections, content and...",https://docs.gitbook.com/content-editor/overview
2,f53a3c26bb78-0,Import\nFind out how to easily migrate your ex...,https://docs.gitbook.com/content-editor/import
3,f53a3c26bb78-1,Only users with \neditor permissions or higher...,https://docs.gitbook.com/content-editor/import
4,261f9663ed70-0,Editor\nGitBook’s editor supports different wr...,https://docs.gitbook.com/content-editor/editor
...,...,...,...
196,7fdb1a72f100-1,Can I move a space between organizations?\nYes...,https://docs.gitbook.com/help-and-faq/faq/cont...
197,03001e553f73-0,Security FAQs\nFind out more about how GitBook...,https://docs.gitbook.com/help-and-faq/faq/faqs
198,113acedeecc8-0,Keyboard shortcuts\nHelping you to make change...,https://docs.gitbook.com/help-and-faq/keyboard...
199,113acedeecc8-1,"Tab\nTab\nIn a list, increase item indent leve...",https://docs.gitbook.com/help-and-faq/keyboard...


In [None]:
# Raw sentence-transformers model.
# NOTE(review): this is a different checkpoint from the "all-MiniLM-L6-v2" model wrapped by
# `embeddings` — confirm the two are never mixed within one index.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed each chunk's text, one row at a time, and store the vector next to it.
documents_df['embedding'] = documents_df['text'].apply(embeddings.embed_query)



In [None]:
# Stack the per-row vectors into one array and pair each vector with its chunk text.
embeddings_list = np.stack(documents_df['embedding'].values)
text_embeddings = list(zip(documents_df['text'], embeddings_list))

# Build the FAISS vector store from the precomputed vectors (no ids or metadata attached here).
vector_store = FAISS.from_embeddings(
    text_embeddings,  # Pass the list of (text, embedding) tuples
    embeddings,  # Pass the embedding model
    # Consider passing metadatas (ids or other data) if needed
)

In [None]:
# Pair every chunk's text with its precomputed vector.
embeddings_list = np.stack(documents_df['embedding'].values)
text_embeddings = list(zip(documents_df['text'].tolist(), embeddings_list))

# `from_embeddings` takes `metadatas` (one dict per text), not `metadata`, and
# needs the Embeddings wrapper that produced the vectors so later queries are
# embedded the same way — not the raw SentenceTransformer model.
vector_store = FAISS.from_embeddings(
    text_embeddings,
    embeddings,
    metadatas=[{'source': source} for source in documents_df['source']],
    ids=documents_df['id'].tolist(),
)

TypeError: FAISS.__init__() got an unexpected keyword argument 'metadata'

In [None]:
query = input('The chatbot will assist you with your queries. ')
# Embed with `embeddings`, the wrapper whose vectors populate the index;
# `embedding_model` is a raw SentenceTransformer assigned from a different checkpoint.
query_embedding = embeddings.embed_query(query)

# `similarity_search` expects text; an embedding must go through the *_by_vector variant.
search_results = vector_store.similarity_search_by_vector(query_embedding, k=3)
for search in search_results:
  print(search)

The chatbot will assist you with your queries. How to upload files in Gitbook?


AttributeError: 'list' object has no attribute 'replace'

In [None]:
# Read the user's question from stdin.
query = input('The chatbot will assist you with your queries. ')

# Embed the question with the `embeddings` wrapper.
query_embedding = embeddings.embed_query(query)

# Look up the 3 stored chunks closest to the query vector.
retrieved_docs = vector_store.similarity_search_by_vector(query_embedding, k=3)

# Show each retrieved document.
for doc in retrieved_docs:
    print(doc)

In [None]:
# Encode one sample chunk to read off the embedding dimension
# (the Pinecone index below is created with dimension 384).
temp = documents_df['text'][1]
embeds = embedding_model.encode(temp)
len(embeds)

384

In [None]:
!pip install -U "pinecone-client[grpc]"

Collecting pinecone-client[grpc]
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client[grpc])
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client[grpc])
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting lz4>=3.1.3 (from pinecone-client[grpc])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting protobuf<5.0,>=4.25 (from pinecone-client[grpc])
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting protoc-gen-openapiv2<0.0.2,>=0.0.1 (from pinecone-client[grpc])
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Never hard-code the API key in the notebook: read it from the environment, or
# prompt for it. The key that was previously embedded here should be revoked.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "))
index_name = 'ml-session'

# Create the index only when it is missing, so the cell can be re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension = 384,  # length of the vectors produced by the embedding model
        metric = 'dotproduct',
        spec = ServerlessSpec(
            cloud= 'aws',
            region = 'us-east-1'  # region ids use hyphens; 'us_east_1' returned 404
        )
    )

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-04', 'X-Cloud-Trace-Context': 'aa0b60aa945ea3a85d0d09c24aeb0fc0', 'Date': 'Wed, 16 Oct 2024 16:56:18 GMT', 'Server': 'Google Frontend', 'Content-Length': '103', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource cloud: aws region: us_east_1 not found"},"status":404}


In [None]:
# Get a handle on the index and display its current statistics.
index = pc.Index(index_name)
index.describe_index_stats()

In [None]:
# Chunk texts as a plain list.
# NOTE(review): `doc` is not read by the upsert cell below and is rebound by later loops — confirm it is needed.
doc = documents_df['text'].tolist()

In [None]:
batch_size = 100

metadatas = []

# Walk the DataFrame in consecutive slices of at most `batch_size` rows.
for start in tqdm(range(0, len(documents_df), batch_size)):
    stop = min(start + batch_size, len(documents_df))
    batch = documents_df.iloc[start:stop]
    # Metadata stored with each vector: the source URL and the chunk text.
    metadatas = [
        {'source': source, 'text': text}
        for source, text in zip(batch['source'], batch['text'])
    ]
    # Texts of this batch and their embeddings as plain Python lists.
    pinecone_documents = batch['text']
    embeds = embedding_model.encode(list(pinecone_documents)).tolist()
    print(len(embeds))
    # Record ids, aligned with the embeddings and metadata above.
    ids = batch['id']
    # Send (id, vector, metadata) triples to the index.
    index.upsert(vectors=zip(ids, embeds, metadatas))

In [None]:
# Concatenate the text of every document into one newline-separated string.
# NOTE(review): `docs` is not defined in any cell visible in this notebook — confirm where it
# comes from (the pages loaded above are stored in `all_pages_data`).
doc_text = "\n".join([doc.page_content for doc in docs])

In [None]:
# No length_function is passed here, unlike the token-measured splitter earlier in the notebook.
# This rebinds the notebook-level `text_splitter` and `chunks`.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)


# Split the concatenated text into plain-string chunks.
chunks = text_splitter.split_text(doc_text)

In [None]:
def clean_text(text):
    """Return `text` unless it contains a navigation phrase, in which case return None.

    Args:
        text (str): Chunk text to inspect.

    Returns:
        str | None: The unchanged text, or None when the chunk is discarded.
    """
    # Phrases whose presence marks a chunk as boilerplate.
    exclude_patterns = ["Homepage", "Community", "Pricing", "Blog", "Developer Documentation", "Ask or Search"]

    contains_boilerplate = any(pattern in text for pattern in exclude_patterns)
    return None if contains_boilerplate else text

# Keep only the chunks clean_text accepts (it returns its input unchanged, or None).
clean_chunks = [chunk for chunk in chunks if clean_text(chunk) is not None]

In [None]:
# `hashlib` was used without being imported.
import hashlib

# One record per chunk: id, chunk text and the URL of the page it came from.
documents = []

for doc in tqdm(docs):
  url = doc.metadata['source']
  # Fresh hash per URL: a single hasher updated in the loop accumulates every
  # earlier URL, so ids would depend on iteration order.
  hasher = hashlib.md5(url.encode('utf-8'))
  uid = hasher.hexdigest()[:12]
  # Use a dedicated name so the notebook-level `chunks` is not overwritten.
  page_chunks = text_splitter.split_text(doc.page_content)
  # Iterate this page's own chunks. The loop previously walked the global
  # `processed_chunks`, attaching the same text to every URL.
  for i, chunk in enumerate(page_chunks):
    documents.append({
        'id':f'{uid}-{i}',
        'text': chunk,
        'source': url
    })

len(documents)

  0%|          | 0/175 [00:00<?, ?it/s]

175

In [None]:
import pandas as pd
# One row per chunk record, with columns id, text and source.
documents_df  = pd.DataFrame.from_records(documents)
documents_df

Unnamed: 0,id,text,source
0,a70fa8f4cc51-0,"your favorite tools, while content insights he...",https://docs.gitbook.com
1,3ae641bc7f8c-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/
2,1bc91c97be57-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/content-editor/overview
3,72b4772623d6-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/content-editor/import
4,c633559dbfec-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/content-editor/editor
...,...,...,...
170,8bd537b7dff3-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/content-editor/editor...
171,d8f3d4b82dfc-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/content-editor/editin...
172,2d8702cf6d88-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/content-editor/import...
173,94c6206ac6cc-0,"your favorite tools, while content insights he...",https://docs.gitbook.com/content-editor/import...


In [None]:
def post_process_chunk(chunk):
    """Clean one chunk of text.

    Replaces U+2006 with a space, collapses each run of whitespace to a single
    space, removes everything from "Last updated" to the end, and strips the
    result.

    Args:
        chunk (str): Raw chunk text.

    Returns:
        str: The cleaned text.
    """
    without_six_per_em = re.sub(r'\u2006', ' ', chunk)
    single_spaced = re.sub(r'\s+', ' ', without_six_per_em)
    # Runs after whitespace collapsing, so the match extends to the end of the chunk.
    without_footer = re.sub(r'Last updated.*', '', single_spaced)
    return without_footer.strip()

# Clean every chunk that passed the boilerplate filter.
processed_chunks = list(map(post_process_chunk, clean_chunks))

In [None]:
# Print every processed chunk, one per line, for visual inspection.
for chunk in processed_chunks:
    print(chunk)

that helps them share their knowledge.Our mission is to make a user-friendly and collaborative product for everyone to create, edit and share knowledge through documentation.Discover GitBook GitBook Product DemoGitBook Product Demo
that helps them share their knowledge.Our mission is to make a user-friendly and collaborative product for everyone to create, edit and share knowledge through documentation.Discover GitBook GitBook Product DemoGitBook Product Demo
days agoOn this pageWas this helpful?Edit on GitHub The GitBook editor.EditorLearn more about GitBook’s navigation, content structure, and more.BlocksLearn about the different blocks you can insert into a GitBook page.Import existing contentImport content from Markdown, Confluence, Notion and more.Search your contentSearch the pages you write or ask GitBook AI questions about your content.Version controlView or restore pages from an earlier point in time.
Only users with editor permissions or higher can edit pages.Supported import

In [None]:
# Build the FAISS index queried in the Testing section from the processed chunk strings.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(processed_chunks, embeddings)

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# **Testing**

In [None]:
# Sample query input
# Read a sample question from stdin.
query = input('The chatbot will assist you with your queries. ')

# Embed the question with the model wrapper passed to the index at build time.
query_embedding = embeddings.embed_query(query)

# Look up the 3 stored chunks closest to the query vector.
retrieved_docs = vector_store.similarity_search_by_vector(query_embedding, k=3)

# Show each retrieved document.
for doc in retrieved_docs:
    print(doc)

The chatbot will assist you with your queries. Tell me about Gitbook Integration?
page_content='GitBook’s integration platform. Visit the developer documentation for more info.'
page_content='and services.Our listing page contains apps and integrations that GitBook and verified developers have created. If you’re interested in developing your own app or integration in GitBook, head to our developer documentation to learn more.PermissionsCreators and admins can install integrations for a space. Only admins can install integrations for an entire organization.'
page_content="https://docs.gitbook.com/. Our documentation uses one of GitBook's most useful features—Git Sync!Git Sync allows you to keep your GitBook site up to date with a remote repository either on GitHub or GitLab. In our case we have the our repository public-docs synced with https://docs.gitbook.com/. This means that any changes reviewed, approved, and merged into this directory will automatically be deployed!Head to our rep