## Imports

In [1]:
import os
import json
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document

## Load data

In [2]:
def load_json(filename: str) -> dict:
    """
    Load a JSON file and return its parsed content.

    Args:
        filename (str): path of the JSON file to load

    Returns:
        dict: parsed content of the file (a dictionary when the top-level
            JSON value is an object)

    Raises:
        FileNotFoundError: if the file does not exist
        json.JSONDecodeError: if the file does not contain valid JSON
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # which garbles non-ASCII text on systems whose default is not UTF-8.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Keep only the content stored under the "Topics" key; its keys are the topic names.
topics = load_json('topics.json')['Topics']
print(f"The {len(topics)} topics are:")
# Prefix every topic with the bullet. Joining on "\n-" only placed the dash
# between items, so the first topic was printed without one.
print("\n".join(f"-{topic}" for topic in topics))

The 10 topics are:
Systems of linear equations, Gaussian elimination
-Vector equations, Matrix
-Solution sets and Linear independence
-Linear Transformations, Matrix algebra
-The Inverse of a Matrix
-Determinants, Perspective projections
-Vector Spaces
-Eigenvalues and Eigenvectors
-Diagonalization
-Orthogonality and Symmetric Matrices


## Preprocess data

In [3]:
# Lookup table from section number to section name, filled on first use so that
# 'section_names.json' is read once instead of once per document.
# Re-running this cell resets the cache and therefore picks up file changes.
_SECTION_NAMES: dict = {}


def get_section_name(section: str) -> str:
    """
    Get the section name for a section number.

    The lookup table is loaded from 'section_names.json' on the first call
    and reused afterwards.

    Args:
        section (str): number of the section

    Returns:
        str: looked up section name

    Raises:
        KeyError: if the section number is not present in the lookup table
    """
    if not _SECTION_NAMES:
        _SECTION_NAMES.update(load_json('section_names.json'))
    return _SECTION_NAMES[section]

In [4]:
def get_pages_string(pages: list) -> str:
    """
    Render a list of pages as a single readable string.

    Args:
        pages (list): pages to render; every item is converted with str()

    Returns:
        str: the pages separated by " and " (empty string for an empty list)
    """
    page_labels = map(str, pages)
    return " and ".join(page_labels)

In [5]:
def preprocess_metadata(metadata: dict) -> dict:
    """
    Preprocess the metadata to make it more readable.

    Adds a "section_name" entry looked up from the "section" number and
    replaces the "page" list with its string representation. The input
    dictionary is left untouched.

    Args:
        metadata (dict): metadata of the document; must contain the keys
            "section" and "page"

    Returns:
        dict: new dictionary with the preprocessed metadata

    Raises:
        KeyError: if "section" or "page" is missing, or the section is unknown
    """
    # Work on a shallow copy. Mutating the caller's dict made the calling cell
    # non-idempotent: on a second run the already-joined "page" string was
    # joined again, character by character.
    processed = dict(metadata)
    processed["section_name"] = get_section_name(metadata["section"])
    processed["page"] = get_pages_string(metadata["page"])
    return processed

In [6]:
# Flatten the nested chapter -> section -> document-list structure of `topics`
# into one flat list of LangChain documents.
documents = []

for sections in topics.values():
    for section_documents in sections.values():
        for document in section_documents:
            # str() replaces the former f-string whose nested double quotes are
            # only valid syntax on Python 3.12+.
            # TODO: maybe put the section name before the text for more context
            page_content = str(document["text"])
            metadata = preprocess_metadata(document["metadata"])
            documents.append(Document(page_content=page_content, metadata=metadata))

## Create embeddings

In [None]:
# Read the variables defined in a local .env file into the process environment,
# then look up the Google API key (None when the variable is not set).
load_dotenv()
GOOGLE_API = os.environ.get('GOOGLE_API_KEY')

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding models: https://ai.google.dev/gemini-api/docs/embeddings#embeddings-models
# and https://ai.google.dev/gemini-api/docs/models#text-embedding-and-embedding
# To switch models (e.g. "models/text-embedding-004" with
# "./vectordb/google_vectorDB-004"), change both constants together so each
# model keeps its own store.
EMBEDDING_MODEL = "models/embedding-001"
PERSIST_DIRECTORY = "./vectordb/google_vectorDB-001"

embedding = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL, google_api_key=GOOGLE_API)

# Open the persisted Chroma store first and only embed the documents when it is
# still empty. Calling Chroma.from_documents unconditionally re-embedded
# everything and appended duplicate entries on every run of this cell.
# Delete PERSIST_DIRECTORY to force a rebuild after `documents` has changed.
db_google = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding)
if not db_google.get(limit=1)["ids"]:
    db_google = Chroma.from_documents(documents, embedding, persist_directory=PERSIST_DIRECTORY)
