In [None]:
from google.colab import drive

# Mount Google Drive at the location the later cells use for the Chroma persist directory
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install gdown
!pip install chromadb huggingface_hub langchain
!pip install -U langchain-community
!pip install sentence-transformers
!pip install langchain requests
!pip install langdetect
!pip install beautifulsoup4

Collecting chromadb
  Downloading chromadb-0.5.15-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.3-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>

In [None]:
import requests

# Google Drive file IDs of the link lists, in F1..F5 order
file_ids = [
    "1nGS9gF20vKDzzf1X88MKtmr2AnUutBek",
    "1F44ktvCf-kThrW5yA5DV8YEi61lV24bo",
    "1WS6p7RNhwsUeHlLOuV4NZZ6bDV4Xs--B",
    "1hoGey_giBN183hg1Q_hW9zHDwTx3afSQ",
    "1W6WwLcst2rJpavhX8UojZmbuAUPWhMOn",
]

# Local file names, matched to file_ids by position: F1.txt .. F5.txt
file_names = [f"F{number}.txt" for number in range(1, len(file_ids) + 1)]

# Function to download files from Google Drive
def download_file(file_id, file_name, timeout=30):
    """Download one Google Drive file by ID and save it as ``file_name``.

    Args:
        file_id: Google Drive file ID, interpolated into the download URL.
        file_name: Local path the response body is written to (binary mode).
        timeout: Seconds to wait on the server before giving up, so an
            unresponsive connection cannot hang the cell indefinitely.

    Returns:
        True if the file was written; False if the request failed or the
        server answered with a status code other than 200.
    """
    url = f"https://drive.google.com/uc?export=download&id={file_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Report network failures the same way as bad status codes instead
        # of aborting the whole download loop with a traceback.
        print(f"Failed to download {file_name}. Error: {e}")
        return False

    if response.status_code != 200:
        print(f"Failed to download {file_name}. Status code: {response.status_code}")
        return False

    # NOTE(review): a 200 response is not checked to be the file itself;
    # confirm Drive does not answer with an HTML interstitial for these IDs.
    with open(file_name, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {file_name} successfully.")
    return True

# Fetch every link file, pairing each Drive ID with its local file name by position
for pair in zip(file_ids, file_names):
    drive_id, local_name = pair
    print(f"Downloading {local_name}...")
    download_file(drive_id, local_name)

Downloading F1.txt...
Downloaded F1.txt successfully.
Downloading F2.txt...
Downloaded F2.txt successfully.
Downloading F3.txt...
Downloaded F3.txt successfully.
Downloading F4.txt...
Downloaded F4.txt successfully.
Downloading F5.txt...
Downloaded F5.txt successfully.


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Update to use SBERT (all-mpnet-base-v2) for embeddings
def load_embeddings_model(model_name="sentence-transformers/all-mpnet-base-v2"):
    """Load a Hugging Face embedding model wrapped for LangChain.

    Args:
        model_name: Hugging Face model identifier passed to
            ``HuggingFaceEmbeddings``. Defaults to SBERT all-mpnet-base-v2,
            the model this notebook was written for.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [None]:
from langchain.vectorstores import Chroma

def create_chroma_db(persist_directory, embedding_model):
    """Open a Chroma vector store at ``persist_directory``.

    Args:
        persist_directory: Folder handed to ``Chroma`` as its persist directory.
        embedding_model: Embedding object handed to ``Chroma`` as its
            embedding function.

    Returns:
        The constructed ``Chroma`` instance.

    Raises:
        Exception: Whatever ``Chroma`` raised, re-raised after being printed.
    """
    try:
        store = Chroma(
            embedding_function=embedding_model,
            persist_directory=persist_directory,
        )
    except Exception as err:
        # Log for the notebook reader, then let the caller see the failure
        print(f"Error creating Chroma database: {err}")
        raise
    else:
        print(f"Chroma database created at: {persist_directory}")
        return store

In [None]:
from langdetect import detect
from langchain.docstore.document import Document

def load_text_file_from_url(link, timeout=30):
    """Fetch a URL and wrap its body in a Document if it is detected as English.

    Args:
        link: URL to fetch with an HTTP GET.
        timeout: Seconds to wait on the server before giving up, so one
            unresponsive host cannot stall the whole ingestion run.

    Returns:
        A one-element list holding a ``Document`` whose ``page_content`` is
        the response text, or None when the request fails, the language
        cannot be detected, or the detected language is not English.
    """
    # Imported locally so this function is self-contained; langdetect raises
    # this exception when the text has no usable features (e.g. an empty body).
    from langdetect.lang_detect_exception import LangDetectException

    try:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Check if the request was successful
    except requests.exceptions.RequestException as e:
        print(f"Error loading text file from URL {link}: {e}")
        return None

    # Treat the content as a plain text file (no HTML stripping is performed)
    text = response.text

    try:
        language = detect(text)
    except LangDetectException as e:
        # Skip this link instead of letting one undetectable page abort the
        # caller's loop over all links.
        print(f"Skipped text file with undetectable language from URL: {link} ({e})")
        return None

    # Check if the content is in English
    if language == 'en':
        print(f"Loaded English text file from URL: {link}")
        return [Document(page_content=text)]

    print(f"Skipped non-English text file from URL: {link}")
    return None  # Skip non-English content

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """Break documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        The list of chunks, or an empty list if anything raised while
        splitting (the error is printed, not re-raised).
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    try:
        pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
        print(f"Documents split into {len(pieces)} chunks")
    except Exception as err:
        print(f"Error splitting documents: {err}")
        return []
    return pieces

In [None]:
def read_links_from_file(file_path):
    """Read links from a text file, one per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of non-empty, stripped lines in file order.

    Raises:
        Exception: Any error raised while opening or reading the file is
            printed and then re-raised.
    """
    try:
        # utf-8-sig reads plain UTF-8 and also drops a leading byte-order
        # mark; str.strip() does not remove a BOM, so it would otherwise stay
        # glued to the first URL and make it unfetchable.
        with open(file_path, "r", encoding="utf-8-sig") as file:
            links = [stripped for stripped in (line.strip() for line in file) if stripped]
        print(f"Loaded {len(links)} links from file: {file_path}")
        return links
    except Exception as e:
        print(f"Error reading links from file: {e}")
        raise

In [None]:
import hashlib
import os
def _content_hash(text):
    """Return the SHA-256 hex digest used as a chunk's de-duplication key."""
    return hashlib.sha256(text.encode('utf-8')).hexdigest()


def _load_existing_hashes(chroma_db):
    """Hash every chunk already stored in the database; empty set if unreadable."""
    try:
        # Loads all stored chunk texts into memory at once.
        stored = chroma_db.get(include=["documents"])
    except Exception as e:
        # Best effort: fall back to de-duplicating only within this run
        # rather than aborting the ingestion.
        print(f"Could not read existing documents for duplicate detection: {e}")
        return set()
    return {
        _content_hash(text)
        for text in (stored.get("documents") or [])
        if text is not None
    }


def ingestion(links_list, persist_directory, embedding_model, chunk_size=1000, chunk_overlap=100):
    """Ingest documents into Chroma database, avoiding duplicates.

    A chunk counts as a duplicate when the SHA-256 of its text matches a
    chunk already stored in the database or one added earlier in this call,
    so re-running the same ingestion does not add the same chunks again.

    Args:
        links_list: Iterable of URLs to fetch with ``load_text_file_from_url``.
        persist_directory: Folder of the Chroma database; opened if it
            exists, otherwise created through ``create_chroma_db``.
        embedding_model: Embedding object used by the Chroma database.
        chunk_size: Forwarded to ``split_documents``.
        chunk_overlap: Forwarded to ``split_documents``.

    Returns:
        The Chroma database object after ingestion.
    """
    # Step 1: Check if the Chroma database exists and load it if it does
    if os.path.exists(persist_directory):
        print(f"Loading existing Chroma database from: {persist_directory}")
        chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
    else:
        print(f"Creating new Chroma database at: {persist_directory}")
        chroma_db = create_chroma_db(persist_directory, embedding_model)

    # Step 2: Seed the duplicate tracker with what the database already holds
    existing_hashes = _load_existing_hashes(chroma_db)

    # Step 3: Loop through each link and add new documents
    for link in links_list:
        # Load content from the URL
        documents = load_text_file_from_url(link)
        if documents is None:
            continue  # Skip if loading failed

        # Split documents into smaller chunks
        doc_chunks = split_documents(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        for doc in doc_chunks:
            # Generate a hash for the document's content
            doc_hash = _content_hash(doc.page_content)

            # Check if this document has already been added
            if doc_hash not in existing_hashes:
                # Add the document and its hash to the set
                existing_hashes.add(doc_hash)
                chroma_db.add_documents([doc])
                print(f"Added new document to the Chroma database from URL: {link}")
            else:
                print(f"Skipped duplicate document from URL: {link}")

    # Step 4: Persist the Chroma database
    try:
        chroma_db.persist()
        print(f"Chroma database persisted to: {persist_directory}")
    except Exception as e:
        # A failed persist is reported but does not discard the in-memory handle
        print(f"Error persisting Chroma database: {e}")

    return chroma_db


In [None]:
def create_retriever(chroma_db, search_type="similarity", threshold=0.55, k=4, lambda_mult=0.25):
    """Create a retriever for the Chroma database.

    The tuning values are passed through ``search_kwargs``, and only the
    ones that apply to the chosen ``search_type`` are included.

    Args:
        chroma_db: Vector store exposing ``as_retriever``.
        search_type: ``"similarity"``, ``"similarity_score_threshold"`` or
            ``"mmr"``; forwarded unchanged.
        threshold: Minimum relevance score; sent as ``score_threshold`` and
            only when ``search_type`` is ``"similarity_score_threshold"``.
        k: Max number of documents to retrieve; always sent.
        lambda_mult: Diversity of results for Maximal Marginal Relevance;
            sent only when ``search_type`` is ``"mmr"``.

    Returns:
        Whatever ``chroma_db.as_retriever`` returns.
    """
    search_kwargs = {"k": k}
    if search_type == "similarity_score_threshold":
        search_kwargs["score_threshold"] = threshold
    elif search_type == "mmr":
        search_kwargs["lambda_mult"] = lambda_mult

    return chroma_db.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

In [None]:
# Ingest the links listed in F1.txt into the Chroma database
if __name__ == "__main__":
    # Inputs: the text file containing the URLs and the persist folder
    links_file_path, persist_directory = (
        "/content/F1.txt",
        "/content/drive/My Drive/ChromaDB_Lang_Chain",
    )

    # Embedding model first, then the URL list, then the ingestion run
    embedding_model = load_embeddings_model()
    links_list = read_links_from_file(links_file_path)
    chromadb = ingestion(links_list, persist_directory, embedding_model)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded 100 links from file: /content/F1.txt
Creating new Chroma database at: /content/drive/My Drive/second_embedding_chroma_db


  chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)


Chroma database created at: /content/drive/My Drive/second_embedding_chroma_db
Skipped non-English text file from URL: http://hrlibrary.umn.edu/oasinstr/szoas3con.html
Skipped non-English text file from URL: http://hrlibrary.umn.edu/instree/RProt2onorgcrime.html
Loaded English text file from URL: http://hrlibrary.umn.edu/instree/f3scas.htm
Documents split into 26 chunks
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/f3scas.htm
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/f3scas.htm
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/f3scas.htm
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/f3scas.htm
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/f3scas.htm
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/f3scas.htm
Added new document to the Chroma database from URL:

  chroma_db.persist()


In [None]:
# Ingest the links listed in F2.txt into the Chroma database
if __name__ == "__main__":
    # Inputs: the text file containing the URLs and the persist folder
    links_file_path, persist_directory = (
        "/content/F2.txt",
        "/content/drive/My Drive/ChromaDB_Lang_Chain",
    )

    # Embedding model first, then the URL list, then the ingestion run
    embedding_model = load_embeddings_model()
    links_list = read_links_from_file(links_file_path)
    chromadb = ingestion(links_list, persist_directory, embedding_model)


Loaded 100 links from file: /content/F2.txt
Loading existing Chroma database from: /content/drive/My Drive/second_embedding_chroma_db
Loaded English text file from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Documents split into 34 chunks
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/oasinstr/cartagena1988.html
Added new do

In [None]:
# Ingest the links listed in F3.txt into the Chroma database
if __name__ == "__main__":
    # Inputs: the text file containing the URLs and the persist folder
    links_file_path, persist_directory = (
        "/content/F3.txt",
        "/content/drive/My Drive/ChromaDB_Lang_Chain",
    )

    # Embedding model first, then the URL list, then the ingestion run
    embedding_model = load_embeddings_model()
    links_list = read_links_from_file(links_file_path)
    chromadb = ingestion(links_list, persist_directory, embedding_model)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/F-iccrulesofprocedure.html
Added new document to the Chroma databa

In [None]:
# Ingest the links listed in F4.txt into the Chroma database
if __name__ == "__main__":
    # Inputs: the text file containing the URLs and the persist folder
    links_file_path, persist_directory = (
        "/content/F4.txt",
        "/content/drive/My Drive/ChromaDB_Lang_Chain",
    )

    # Embedding model first, then the URL list, then the ingestion run
    embedding_model = load_embeddings_model()
    links_list = read_links_from_file(links_file_path)
    chromadb = ingestion(links_list, persist_directory, embedding_model)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Skipped duplicate document from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Skipped duplicate document from URL: http://hrlibrary.umn.edu/Food-VoluntaryGuidelines.rtf
Added new d

In [None]:
# Main execution block
if __name__ == "__main__":
    links_file_path = "/content/F5.txt"  # Path to the text file containing the URLs
    # Same folder as the F1-F4 cells, so all five link lists end up in one
    # database (this cell previously still pointed at the old
    # "second_embedding_chroma_db" folder).
    # NOTE(review): confirm "ChromaDB_Lang_Chain" is the intended target.
    persist_directory = "/content/drive/My Drive/ChromaDB_Lang_Chain"  # Updated folder name

    # Step 1: Load the SBERT embedding model (all-mpnet-base-v2 from Hugging Face)
    embedding_model = load_embeddings_model()

    # Step 2: Read links from the text file
    links_list = read_links_from_file(links_file_path)

    # Step 3: Ingest documents and create Chroma DB
    chromadb = ingestion(
        links_list=links_list,
        persist_directory=persist_directory,
        embedding_model=embedding_model
    )


Loaded 83 links from file: /content/F5.txt
Loading existing Chroma database from: /content/drive/My Drive/second_embedding_chroma_db
Skipped non-English text file from URL: http://hrlibrary.umn.edu/oasinstr/fszoas3con.html
Skipped non-English text file from URL: http://hrlibrary.umn.edu/euro/fets128.html
Loaded English text file from URL: http://hrlibrary.umn.edu/instree/extraditionmodel.html
Documents split into 37 chunks
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/extraditionmodel.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/extraditionmodel.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/extraditionmodel.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/extraditionmodel.html
Added new document to the Chroma database from URL: http://hrlibrary.umn.edu/instree/extraditionmodel.html
Added new document to the Chroma datab