## Imports

In [1]:
import os
import json
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

## Load data

In [2]:
def load_json(filename: str) -> dict:
    """
    Load a JSON file and return its parsed content.

    Args:
        filename (str): path of the JSON file to load

    Returns:
        dict: parsed content of the file (json.load returns whatever the
            top-level JSON value is; the callers in this notebook index the
            result with string keys)
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() falls back to the platform default, which can garble non-ASCII
    # characters (e.g. math symbols) on some systems.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# The keys of the 'Topics' object are the topic names.
topics = load_json('topics.json')['Topics']
print(f"The {len(topics)} topics are:")
# Prefix every topic with a bullet; joining on "\n-" would leave the first
# topic without one.
print("\n".join(f"- {topic_name}" for topic_name in topics))

The 10 topics are:
Systems of linear equations, Gaussian elimination
-Vector equations, Matrix
-Solution sets and Linear independence
-Linear Transformations, Matrix algebra
-The Inverse of a Matrix
-Determinants, Perspective projections
-Vector Spaces
-Eigenvalues and Eigenvectors
-Diagonalization
-Orthogonality and Symmetric Matrices


## Preprocess data

In [7]:
def get_section_name(section: str) -> str:
    """
    Look up the name of a section by its number.

    Args:
        section (str): section number, used as the key into the content of
            'section_names.json'

    Returns:
        str: the name stored under that section number
    """
    # The mapping file is read on every call.
    name_by_section = load_json('section_names.json')
    section_name = name_by_section[section]
    return section_name

In [4]:
def get_pages_string(pages: list) -> str:
    """
    Render a list of pages as a single string, e.g. [140, 141] -> "140 and 141".

    Args:
        pages (list): pages to render; each item is converted with str()

    Returns:
        str: the pages joined by " and " (empty string for an empty list)
    """
    page_labels = map(str, pages)
    return " and ".join(page_labels)

In [5]:
def preprocess_metadata(metadata: dict) -> dict:
    """
    Build a more readable version of a document's metadata.

    The input dictionary is left untouched. Mutating it in place would make
    the calling cell non-idempotent: on a second run the "page" entry would
    already be a string and would be joined character by character.

    Args:
        metadata (dict): metadata of the document; must contain the keys
            "section" and "page"

    Returns:
        dict: shallow copy of the metadata with "section_name" added and
            "page" replaced by its string representation

    Raises:
        KeyError: if "section" or "page" is missing from the metadata
    """
    processed = dict(metadata)
    processed["section_name"] = get_section_name(processed["section"])
    processed["page"] = get_pages_string(processed["page"])
    return processed

In [8]:
# Flatten the nested topics structure (chapter -> section -> entries) into a
# single list of LangChain documents.
documents = []

for chapter_sections in topics.values():
    for section_entries in chapter_sections.values():
        for entry in section_entries:
            # TODO: maybe put the section name before the text for more context.
            # Plain indexing instead of f"{entry["text"]}": reusing the same
            # quote character inside an f-string is a syntax error before
            # Python 3.12.
            page_content = str(entry["text"])
            metadata = preprocess_metadata(entry["metadata"])
            documents.append(Document(page_content=page_content, metadata=metadata))

## Create embeddings

In [None]:
# Load environment variables from .env file
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAIEmbeddings class
embedding = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=openai_api_key)

# Directory in which the Chroma vector database is persisted
PERSIST_DIRECTORY = "./vectordb/openai_vectorDB"

# Pick "load" or "create" from the state on disk instead of commenting lines
# in and out, so the notebook also runs from a fresh checkout.
if os.path.isdir(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY):
    # Existing database: reuse the stored embeddings
    db_openai = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding)
else:
    # New database: embed all documents (this calls the OpenAI embeddings API)
    # and persist the result
    db_openai = Chroma.from_documents(documents, embedding, persist_directory=PERSIST_DIRECTORY)

In [30]:
def get_page_numbers(page_numbers: list[str]) -> list[str]:
    """
    Collect the individual page numbers referenced by a list of page strings.

    Each entry is either a single page ("140") or several pages joined by
    " and " ("140 and 141"), i.e. the format produced by get_pages_string.

    Args:
        page_numbers (list[str]): page strings taken from the metadata

    Returns:
        list[str]: the distinct page numbers as strings, sorted numerically

    Raises:
        ValueError: if a part of an entry cannot be parsed as an integer
    """
    unique_pages = set()
    for entry in page_numbers:
        # "140".split(" and ") yields ["140"], so single pages need no
        # special case. The set keeps a page cited by several documents
        # from being listed more than once.
        unique_pages.update(int(part) for part in entry.split(" and "))

    # Sort as integers (so "9" comes before "100"), then convert back to
    # strings because the caller joins them into a sentence.
    return [str(page) for page in sorted(unique_pages)]

In [None]:
# Define a query
query = "Can you state all conditions for a matrix to be invertible?"

# Use the retriever to find relevant documents
retrieved_docs = db_openai.similarity_search(query, k=4)

# Prepare context from retrieved documents
context = "\n\n".join(doc.page_content for doc in retrieved_docs)
references = get_page_numbers([doc.metadata['page'] for doc in retrieved_docs])

# Create a prompt for the LLM
prompt = (
    "You are an assistant for question-answering tasks. Use the following pieces of "
    "retrieved context to answer the question. If you don't know the answer, say that you "
    "don't know."
    "\n\nContext:\n" + context + "\n\nQuestion:\n" + query
)

# Use the LLM to generate an answer.
# The client is created without arguments, so it takes its credentials from
# the environment (populated earlier via load_dotenv()).
llm = OpenAI()

# The prompt (instructions + context + question) is sent as the system
# message; the bare question is sent again as the user message.
resp = llm.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": query}
    ],
)

response_message = resp.choices[0].message.content
print(response_message)

# Build the reference line for any number of pages. Indexing references[-1]
# unconditionally fails on an empty result and prints "pages , and N" for a
# single page.
if not references:
    references_line = "References: none"
elif len(references) == 1:
    references_line = f"References: page {references[0]}"
elif len(references) == 2:
    references_line = f"References: pages {references[0]} and {references[1]}"
else:
    leading_pages = ", ".join(references[:-1])
    references_line = f"References: pages {leading_pages}, and {references[-1]}"
print(references_line)

['140', '145', '205', '270']
The conditions for a matrix to be invertible are as follows:

1. The matrix is row-equivalent to the n x n identity matrix.
2. The matrix has n pivot positions.
3. The matrix is square and its determinant is not equal to 0.
4. The matrix's columns form a linearly independent set.
5. The mapping x ↦ Ax is one-to-one.
6. The equation Ax = b has at least one solution for each b in R^n.
7. The columns of the matrix span R^n.
8. The linear transformation x ↦ Ax maps R^n onto R^n.
9. There is an n x n matrix C such that CA = I (left inverse).
10. There is an n x n matrix D such that AD = I (right inverse).
11. The transpose of the matrix is also invertible.

Additionally, the following statements are equivalent to a matrix being invertible:
- The columns of the matrix form a basis of R^n.
- The column space of the matrix is R^n.
- The rank of the matrix is equal to n.
- The nullity of the matrix is 0.
- The null space of the matrix is {0}.
References: pages 1