In [None]:
! pip install llama_index
! pip install llama_index.embeddings.huggingface
!pip install faiss-cpu
! pip install llama-index-vector-stores-chroma
! pip install llama-index-vector-stores-faiss
!pip install faiss-cpu
! pip install llama-index-vector-stores-chroma
! pip install llama-index-vector-stores-faiss

In [None]:
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import gzip
import base64
import numpy as np
from google.colab import drive
import os
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import StorageContext
import faiss

In [None]:
# Mount Google Drive inside the Colab VM; the persist directory configured
# further down lives under this mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


# **FAISS Setup**

In [None]:
# Number of documents handed to the indexer per saved batch file.
batch_size = 100
# Folder (under the mounted Drive path) that receives the per-batch index files.
persist_directory = "/content/drive/My Drive/faiss_index_batches"

In [None]:
# Create the output folder (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(persist_directory, exist_ok=True)

In [None]:
# Embedding model used both when indexing documents and when embedding queries.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_embedding = HuggingFaceEmbedding(model_name=embedding_model_name)

In [None]:
# Vector length the FAISS index is built for.
# NOTE(review): 384 is presumably the output size of the all-MiniLM-L6-v2
# model configured for hf_embedding — confirm and update if the model changes.
embedding_dimension = 384
# Empty FAISS IndexFlatL2 sized for vectors of that length.
faiss_index = faiss.IndexFlatL2(embedding_dimension)

# **Data Extraction**

In [None]:
def extract_links(html_content, base_url):
    """Collect absolute URLs of the HTML pages linked from a page.

    Args:
        html_content: Raw HTML (str or bytes) of the page to scan.
        base_url: URL the page was fetched from; relative hrefs are resolved
            against it with ``urljoin``.

    Returns:
        list[str]: Absolute URLs whose path ends in ``.htm`` or ``.html``
        (case-insensitive), in first-seen order, with fragments removed and
        without duplicates.
    """
    # Local import: the notebook's import cell only brings in ``urljoin``.
    from urllib.parse import urldefrag, urlsplit

    soup = BeautifulSoup(html_content, 'html.parser')
    extracted_links = []
    seen = set()
    for tag in soup.find_all('a'):
        href = (tag.get('href') or '').strip()
        if not href:
            continue
        # Drop "#section" so several anchors into one page map to one URL.
        full_url = urldefrag(urljoin(base_url, href)).url
        # Test the URL path only, so a query string or an upper-case
        # extension does not hide an HTML page.
        if not urlsplit(full_url).path.lower().endswith(('.htm', '.html')):
            continue
        # A page linked several times must be ingested only once.
        if full_url in seen:
            continue
        seen.add(full_url)
        extracted_links.append(full_url)
    return extracted_links

In [None]:
def extract_text_content(link, timeout=30):
    """Download a page and return its visible text.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled host.

    Returns:
        str | None: The page text with ``<script>`` and ``<style>`` elements
        removed and the remaining text fragments stripped and joined by
        newlines. ``None`` when the request fails or the response status is
        not 200.
    """
    try:
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page should be skipped by the caller, not abort the
        # whole crawl.
        print(f"Request failed for {link}: {exc}")
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Remove non-visible content before extracting text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator='\n', strip=True)

In [None]:
# Fetch the index page and collect the HTML pages it links to.
url = 'http://hrlibrary.umn.edu/instree/ainstls1.htm'
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of scraping links out of an error page.
response.raise_for_status()
extracted_links = extract_links(response.content, url)

# **Ingestion**

In [None]:
# Download every extracted link and wrap each non-empty page text in a
# Document that records its source URL.
documents = []
for link in extracted_links:
    content = extract_text_content(link)
    if not content:
        # None (failed fetch) or empty text: nothing to index for this link.
        print(f"Skipping document with no content: {link}")
        continue
    documents.append(Document(text=content, metadata={"source": link}))
    print(f"Ingested document: {link}")

## FaissVectorStore: build and save the index in batches

In [None]:
# Index the documents in independent batches of `batch_size` and save each
# batch to its own files under `persist_directory`.
for batch_start in range(0, len(documents), batch_size):
    batch_no = batch_start // batch_size + 1
    batch_documents = documents[batch_start:batch_start + batch_size]

    # Start every batch from an empty index. Reusing the module-level
    # `faiss_index` would keep adding vectors to the same object, so each
    # saved file would also contain all earlier batches, and re-running this
    # cell would add every vector a second time.
    batch_faiss_index = faiss.IndexFlatL2(embedding_dimension)
    vector_store = FaissVectorStore(faiss_index=batch_faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Embed the batch and add its vectors to this batch's FAISS index.
    index = VectorStoreIndex.from_documents(
        batch_documents, storage_context=storage_context, embed_model=hf_embedding
    )

    # Save the raw FAISS index for this batch.
    batch_file_path = f"{persist_directory}/index_batch_{batch_no}.faiss"
    faiss.write_index(batch_faiss_index, batch_file_path)

    # Also save the llama-index storage context (docstore and index metadata).
    # The .faiss file alone holds only vectors, so search results could not be
    # mapped back to the text they came from.
    batch_storage_dir = f"{persist_directory}/storage_batch_{batch_no}"
    os.makedirs(batch_storage_dir, exist_ok=True)
    index.storage_context.persist(persist_dir=batch_storage_dir)

    print(f"Processed and saved batch {batch_no} at {batch_file_path}")

Processed and saved batch 1 at /content/drive/My Drive/faiss_index_batches/index_batch_1.faiss
Processed and saved batch 2 at /content/drive/My Drive/faiss_index_batches/index_batch_2.faiss
Processed and saved batch 3 at /content/drive/My Drive/faiss_index_batches/index_batch_3.faiss
Processed and saved batch 4 at /content/drive/My Drive/faiss_index_batches/index_batch_4.faiss
Processed and saved batch 5 at /content/drive/My Drive/faiss_index_batches/index_batch_5.faiss
Processed and saved batch 6 at /content/drive/My Drive/faiss_index_batches/index_batch_6.faiss
Processed and saved batch 7 at /content/drive/My Drive/faiss_index_batches/index_batch_7.faiss
Processed and saved batch 8 at /content/drive/My Drive/faiss_index_batches/index_batch_8.faiss
Processed and saved batch 9 at /content/drive/My Drive/faiss_index_batches/index_batch_9.faiss
Processed and saved batch 10 at /content/drive/My Drive/faiss_index_batches/index_batch_10.faiss
Processed and saved batch 11 at /content/drive/M

# **Query**

In [None]:
# Choose which saved batch to query and read its FAISS index back from disk.
batch_number = 1
batch_file_path = os.path.join(persist_directory, f"index_batch_{batch_number}.faiss")
faiss_index = faiss.read_index(batch_file_path)

In [None]:
# Embed the question with the same model used for indexing and look up its
# nearest neighbours in the loaded FAISS index.
query_text = "What are the main principles of human rights?"
k = 5  # Number of nearest neighbors to retrieve

# FAISS search takes a 2-D float32 array with one row per query vector.
query_embedding = hf_embedding.get_text_embedding(query_text)
query_embedding = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
distances, indices = faiss_index.search(query_embedding, k)

print("\nTop results:")
# The loop variable is deliberately not called `index`, so it cannot overwrite
# the VectorStoreIndex object of that name created by the indexing cell.
for rank, (distance, vector_id) in enumerate(zip(distances[0], indices[0]), start=1):
    if vector_id < 0:
        # FAISS pads with -1 when the index holds fewer than k vectors.
        continue
    print(f"Rank {rank}: Document Index = {vector_id}, Distance = {distance}")


Top results:
Rank 1: Document Index = 110, Distance = 0.7405030727386475
Rank 2: Document Index = 282, Distance = 0.8165687322616577
Rank 3: Document Index = 277, Distance = 0.8338398933410645
Rank 4: Document Index = 523, Distance = 0.8580731153488159
Rank 5: Document Index = 1202, Distance = 0.8694164752960205
