In [None]:
!pip install requests langchain-community beautifulsoup4 langchain chromadb sentence-transformers together

# **Scraping**

In [None]:
# scraping import
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# ingestion import
import chromadb
from chromadb import Client
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
import gensim.downloader
from google.colab import drive
import json
import requests
import base64
import gzip
import os

In [None]:
# Mount Google Drive at /content/drive; the Chroma persist directory used later
# in this notebook lives under this mount point.
drive.mount('/content/drive')

In [None]:
# Page whose .htm/.html links are scraped further down.
url = 'http://hrlibrary.umn.edu/instree/ainstls1.htm'
# requests applies no timeout by default; without one a stalled server would
# block this cell indefinitely.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    print("Successfully fetched the webpage.")
else:
    print(f"Failed to fetch webpage. Status code: {response.status_code}")

In [None]:
def extract_links(html_content, base_url):
    """Collect the absolute URLs of every .htm/.html link in an HTML page.

    Args:
        html_content: Raw HTML (str or bytes) to parse.
        base_url: URL the page came from; relative hrefs are resolved against it.

    Returns:
        list[str]: Absolute URLs in first-seen order, without duplicates.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # A dict is used as an ordered set: the same page is often linked several
    # times, and callers use each URL as a document ID and fetch/embed it, so
    # duplicates would only cause repeated work and ID collisions.
    unique_links = {}
    for tag in soup.find_all('a'):
        href = tag.get('href')  # None for anchors without an href attribute
        if href and href.endswith(('.htm', '.html')):
            # Resolve relative links against the page URL before de-duplicating,
            # so "a.htm" and "./a.htm" collapse into one entry.
            unique_links.setdefault(urljoin(base_url, href), None)
    return list(unique_links)

# Absolute URLs of the .htm/.html documents linked from the page fetched above;
# this list is the input of every ingestion function below.
extracted_links = list(extract_links(response.content,url))

# **Ingestion**

extract the content from each link

In [None]:
def extract_text_content(link):
    """Download a page and return its text content.

    Args:
        link: URL of the document to fetch.

    Returns:
        str | None: The page text with <script>/<style> elements removed,
        text fragments stripped and joined by newlines; None when the request
        fails or the server answers with a status other than 200.
    """
    try:
        # A timeout keeps one unresponsive server from stalling a whole ingestion run.
        response = requests.get(link, timeout=30)
    except requests.RequestException as exc:
        # Callers already treat None as "skip this document", so a network
        # error is reported like a bad status instead of aborting their loop.
        print(f"Failed to retrieve document at {link}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve document at {link}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Drop script and style elements so their source does not end up in the text.
    for element in soup(["script", "style"]):
        element.decompose()

    return soup.get_text(separator='\n', strip=True)

Embedding models initialization

In [None]:
# 1. Hugging Face Embedding
# Embedding wrapper for the BAAI/bge-base-en-v1.5 model; ingest_huggingface
# calls its embed_documents() method once per page.
hf_embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

In [None]:
# 2. Word2Vec Embedding
# Pretrained "word2vec-google-news-300" vectors fetched through gensim's
# downloader; ingest_word2vec looks tokens up in it one by one.
word2vec_model =  gensim.downloader.load("word2vec-google-news-300")

In [None]:
# 3. TF-IDF Vectorizer
# Created unfitted with default settings; it is fitted on the scraped
# documents inside ingest_tfidf.
tfidf_vectorizer = TfidfVectorizer()

functions to ingest data using the 3 embedding models

In [None]:
def ingest_huggingface(links):
    """Embed each page with the Hugging Face model and store it in Chroma.

    Every link is downloaded with ``extract_text_content``; pages that yield
    no text are reported and skipped. Each stored record uses the link both as
    its ID and as its ``source`` metadata value.

    Args:
        links: Iterable of URLs to ingest into the ``rag_with_HF`` collection.
    """
    hf_collection = client.get_or_create_collection(name="rag_with_HF")

    for link in links:
        page_text = extract_text_content(link)

        # Nothing to embed when the download failed.
        if page_text is None:
            print(f"Skipping document with no content: {link}")
            continue

        # embed_documents works on a batch: send a one-item batch and keep its only vector.
        vectors = hf_embedding_function.embed_documents([page_text])
        hf_collection.add(
            documents=[page_text],
            embeddings=[vectors[0]],
            metadatas=[{"source": link}],
            ids=[link],
        )
        print(f"Ingested document with Hugging Face embedding: {link}")

In [None]:
def ingest_word2vec(links):
    """Store each page in Chroma with an averaged Word2Vec embedding.

    The page text is split on whitespace, tokens unknown to the Word2Vec model
    are dropped, and the remaining token vectors are averaged into a single
    document vector. Pages that cannot be downloaded, or that contain no known
    token, are reported and skipped.

    Args:
        links: Iterable of URLs to ingest into the ``rag_with_w2v`` collection.
    """
    w2v_collection = client.get_or_create_collection(name="rag_with_w2v")

    for link in links:
        page_text = extract_text_content(link)
        if page_text is None:
            print(f"Failed to retrieve or extract content from document at {link}.")
            continue

        # Keep only the vectors of tokens present in the model vocabulary.
        known_vectors = []
        for word in page_text.split():
            if word in word2vec_model.key_to_index:
                known_vectors.append(word2vec_model[word])

        if not known_vectors:
            print(f"Skipping document with no valid tokens: {link}")
            continue

        # Document vector = element-wise mean over its token vectors.
        document_vector = np.mean(known_vectors, axis=0)
        w2v_collection.add(
            ids=[link],
            documents=[page_text],
            embeddings=[document_vector],
            metadatas=[{"source": link}],
        )
        print(f"Ingested document with Word2Vec embedding: {link}")

In [None]:
def ingest_tfidf(links):
    """Store every page in Chroma with a TF-IDF embedding.

    All pages are downloaded first because the vectorizer has to be fitted on
    the whole corpus before any document can be vectorized. The module-level
    ``tfidf_vectorizer`` is refitted on every call, so all links that should
    share one vector space must be passed in a single call.

    Args:
        links: Iterable of URLs to ingest into the ``rag_with_TF`` collection.
    """
    collection = client.get_or_create_collection(name="rag_with_TF")
    all_documents = []
    doc_ids = []

    for link in links:
        doc_content = extract_text_content(link)
        if doc_content is None:
            print(f"Failed to retrieve or extract content from document at {link}.")
            continue
        all_documents.append(doc_content)
        doc_ids.append(link)

    # Fitting on an empty corpus would fail, so stop early.
    if not all_documents:
        print("No valid documents to ingest.")
        return

    # fit_transform learns the vocabulary and vectorizes in a single pass over
    # the corpus instead of tokenizing every document twice (fit + transform).
    # The sparse result is densified because one dense row per document is stored.
    tfidf_embeddings = tfidf_vectorizer.fit_transform(all_documents).toarray()

    for link, doc_content, embedding in zip(doc_ids, all_documents, tfidf_embeddings):
        collection.add(
            documents=[doc_content],
            embeddings=[embedding],
            metadatas=[{"source": link}],
            ids=[link],
        )
        print(f"Ingested document with TF-IDF embedding: {link}")

ingestion process

In [None]:
# Initialize ChromaDB client
# Client(Settings(persist_directory=...)) without is_persistent=True builds an
# in-memory client and never writes to the directory, so the collections were
# lost with the runtime. PersistentClient stores them under the given path on
# the mounted Drive.
client = chromadb.PersistentClient(path="/content/drive/My Drive/Chromadb")

In [None]:
# Download, embed and store every scraped link in the "rag_with_HF" collection.
ingest_huggingface(extracted_links)

In [None]:
# Ingest the links in small slices so progress is reported regularly.
batch_size = 10
# Ceiling division: the previous `len // batch_size + 1` reported one batch too
# many whenever the number of links was an exact multiple of batch_size.
total_batches = -(-len(extracted_links) // batch_size)
for i in range(0, len(extracted_links), batch_size):
    batch_links = extracted_links[i:i + batch_size]
    ingest_word2vec(batch_links)
    print(f"Processed batch {i // batch_size + 1} of {total_batches}")

In [None]:
# All links go in one call because ingest_tfidf fits the vectorizer on the
# documents it receives in that call.
ingest_tfidf(extracted_links)

Functions to extract the data from each collection and upload it to GitHub as JSON files

In [None]:
def extract_data_from_collection(collection_name):
    """Read every record of a Chroma collection into a list of dicts.

    Args:
        collection_name: Name of the collection to read (it is created empty
            if it does not exist yet, because get_or_create_collection is used).

    Returns:
        list[dict]: One dict per stored record with the keys ``document``,
        ``embedding``, ``metadata`` (reduced to its ``source`` entry) and ``id``.
    """
    collection = client.get_or_create_collection(name=collection_name)

    # One bulk read returns parallel lists for all records; this replaces the
    # former pattern of fetching everything once and then issuing one extra
    # query per document ID.
    records = collection.get(include=['documents', 'embeddings', 'metadatas'])

    doc_ids = records['ids']
    if not doc_ids:
        # Empty collection: avoid touching the other fields at all.
        return []

    return [
        {
            "document": content,
            "embedding": embedding,
            "metadata": {"source": metadata['source']},
            "id": doc_id,
        }
        for doc_id, content, embedding, metadata in zip(
            doc_ids, records['documents'], records['embeddings'], records['metadatas']
        )
    ]

In [None]:
def upload_to_github(filename):
    """Upload a local file to a GitHub repository through the contents API.

    The repository ("owner/repo") and the access token are read from the
    notebook variables ``REPO_NAME`` / ``GITHUB_TOKEN`` when they are defined,
    otherwise from environment variables of the same names, so the token never
    has to be written into the notebook. Only HTTP 201 (file created) is
    reported as success; any other answer is printed as an error.

    Args:
        filename: Path of the local file; it is also used as the path of the
            file inside the repository.

    Raises:
        RuntimeError: If the repository name or the token cannot be found.
    """
    repo_name = globals().get("REPO_NAME") or os.environ.get("REPO_NAME")
    github_token = globals().get("GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
    if not repo_name or not github_token:
        raise RuntimeError(
            "Set REPO_NAME ('owner/repo') and GITHUB_TOKEN as notebook variables "
            "or environment variables before uploading."
        )

    # The API expects the file body as Base64 text, so read the raw bytes.
    with open(filename, "rb") as f:
        encoded_content = base64.b64encode(f.read()).decode('utf-8')

    url_g = f"https://api.github.com/repos/{repo_name}/contents/{filename}"

    # Prepare the data for GitHub API
    data = {
        "message": f"Add collection data batch {filename}",
        "content": encoded_content,
    }

    # Generous timeout: batches can be large, but the call must not hang forever.
    response = requests.put(
        url_g,
        headers={"Authorization": f"token {github_token}"},
        json=data,
        timeout=300,
    )

    if response.status_code == 201:
        print(f"File {filename} uploaded successfully!")
    else:
        # Error bodies are normally JSON; fall back to the raw text so a
        # non-JSON answer does not hide the failure behind a decode error.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        print("Error:", details)

In [None]:
def upload_data_in_batches(filename, all_data, batch_size=400):
    """Write records to numbered JSON files and upload each file to GitHub.

    Args:
        filename: Prefix of the generated files (``<filename>_<n>.json``, n starting at 1).
        all_data: List of dicts with the keys ``document``, ``embedding``,
            ``metadata`` and ``id``.
        batch_size: Maximum number of records written to one file.
    """
    def _jsonable(vector):
        # numpy arrays cannot be serialized by json; other values pass through unchanged.
        if isinstance(vector, np.ndarray):
            return vector.tolist()
        return vector

    # Number of files needed: every full batch plus one for a partial remainder.
    full_batches, leftover = divmod(len(all_data), batch_size)
    total_batches = full_batches + 1 if leftover != 0 else full_batches

    for i in range(total_batches):
        start = i * batch_size
        stop = (i + 1) * batch_size
        batch_data = all_data[start:stop]

        # Column-oriented layout: one list per field, aligned by position.
        collection_data = {
            "documents": [record["document"] for record in batch_data],
            "embeddings": [_jsonable(record["embedding"]) for record in batch_data],
            "metadatas": [record["metadata"] for record in batch_data],
            "ids": [record["id"] for record in batch_data],
        }

        current_filename = f"{filename}_{i + 1}.json"
        with open(current_filename, "w") as f:
            json.dump(collection_data, f)

        upload_to_github(current_filename)

This function is specialized for TF-IDF: its data is larger, so each batch is gzip-compressed before it is uploaded.

In [None]:
def upload_data_in_compressed_batches(filename, all_data, batch_size=400):
    """Write records to numbered gzip-compressed JSON files and upload each one.

    Args:
        filename: Prefix of the generated files
            (``<filename>_batch_<n>.json.gz``, n starting at 1).
        all_data: List of dicts with the keys ``document``, ``embedding``,
            ``metadata`` and ``id``.
        batch_size: Maximum number of records written to one file.
    """
    # Every full batch plus one extra file when a partial batch remains.
    whole, remainder = divmod(len(all_data), batch_size)
    total_batches = whole + (1 if remainder != 0 else 0)

    for i in range(total_batches):
        first = i * batch_size
        batch_data = all_data[first:(i + 1) * batch_size]

        # Build each column separately so the lists stay aligned by position.
        documents = [entry["document"] for entry in batch_data]
        embeddings = [
            # numpy arrays are converted because json cannot serialize them.
            entry["embedding"].tolist()
            if isinstance(entry["embedding"], np.ndarray)
            else entry["embedding"]
            for entry in batch_data
        ]
        metadatas = [entry["metadata"] for entry in batch_data]
        ids = [entry["id"] for entry in batch_data]

        collection_data = {
            "documents": documents,
            "embeddings": embeddings,
            "metadatas": metadatas,
            "ids": ids,
        }

        compressed_filename = f"{filename}_batch_{i + 1}.json.gz"

        # Text mode lets json.dump write straight into the gzip stream.
        with gzip.open(compressed_filename, "wt", encoding="utf-8") as f:
            json.dump(collection_data, f)

        upload_to_github(compressed_filename)

In [None]:
# Load every record of the Hugging Face collection into memory for export.
all_data_HF = extract_data_from_collection("rag_with_HF")

In [None]:
# Writes rag_with_HF_<n>.json files (default 400 records each) and uploads them.
upload_data_in_batches('rag_with_HF',all_data_HF)

In [None]:
# Load every record of the Word2Vec collection into memory for export.
all_data_w2v = extract_data_from_collection("rag_with_w2v")

In [None]:
# Writes rag_with_w2v_<n>.json files (default 400 records each) and uploads them.
upload_data_in_batches('rag_with_w2v',all_data_w2v)

In [None]:
# Load every record of the TF-IDF collection into memory for export.
all_data_TF = extract_data_from_collection("rag_with_TF")

In [None]:
# TF-IDF batches are written as rag_with_TF_batch_<n>.json.gz (gzip) before upload.
upload_data_in_compressed_batches("rag_with_TF", all_data_TF)