In [None]:
!pip install --quiet -U langchain langchain-community langchain-aws langchain-core pgvector

import os
import pandas as pd
import numpy as np
import json
import psycopg2
import ast
import pgvector
import math
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
import boto3
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_aws import BedrockLLM
import time

print("Imported and installed dependencies!")

For now, we are going to simply read the CSV file on the server. Look at other notebook on how to get CSV file from S3 Bucket.

In [None]:
# Load your CSV file into a pandas DataFrame
df = pd.read_csv('website_extracts_server_side.csv')
df.head()

Need to create our own CSV file with URL and text columns only.

In [None]:
# Select only the columns we want
selected_columns_df = df[['doc_id', 'url', 'parent_titles', 'titles', 'text', 'links']]

# Remove rows where 'text' is null
selected_columns_df = selected_columns_df.dropna(subset=['text'])

# Function to transform links dictionary to list of URLs
def transform_links(links_str):
    try:
        links_dict = ast.literal_eval(links_str)
        urls_list = [url for url, _ in links_dict.values()]
        return urls_list
    except (SyntaxError, ValueError):
        return []

# Apply the transformation to the 'links' column
selected_columns_df['links'] = selected_columns_df['links'].apply(transform_links)

# Modify the titles column to include parent_titles followed by titles
def combine_titles(row):
    try:
        # Parse parent_titles if it is not NaN
        parent_titles = ast.literal_eval(row['parent_titles']) if pd.notna(row['parent_titles']) else []

        # Parse titles if it is not NaN
        titles = ast.literal_eval(row['titles']) if pd.notna(row['titles']) else []

        # Remove the first element if it's an empty string
        if titles and titles[0] == '':
            titles = titles[1:]

        # Combine parent_titles and titles, ensuring no duplication if the last of parent_titles is the first of titles
        if parent_titles and titles:
            combined_titles = parent_titles + titles if parent_titles[-1] != titles[0] else parent_titles + titles[1:]
        else:
            combined_titles = parent_titles + titles

        return combined_titles
    except (SyntaxError, ValueError):
        # Return the original titles if there's an error
        return row['titles']

# Apply the combine_titles function to the 'titles' column
selected_columns_df['titles'] = selected_columns_df.apply(combine_titles, axis=1)

# Define the path for the new CSV file
new_file_path = 'moded_1.csv'

# Check if the file already exists
if not os.path.exists(new_file_path):
    # Save the new CSV file
    selected_columns_df.to_csv(new_file_path, index=False)
    print(f"New CSV file created at: {new_file_path}")
else:
    print(f"CSV file already exists at: {new_file_path}")

Sanity Check to count how many rows with null "text" values

In [None]:
# Load the CSV file
check = pd.read_csv('website_extracts_server_side.csv')

# Count the number of rows with null values in the 'text' column
null_text_count = check['text'].isna().sum()
print(f"Number of rows with null values in the 'text' column: {null_text_count}")

For each row of the moded.csv file, we will create vector embeddings using the Amazon Titan Embeddings model.

In [None]:
# Convert query into embedding
def get_bedrock_embeddings(input_text, model_id="amazon.titan-embed-text-v2:0", region_name="us-west-2"):
    # Initialize the boto3 client for Bedrock
    bedrock = boto3.client(
        service_name='bedrock-runtime',
        region_name=region_name
    )

    # Prepare the prompt and request body
    body = json.dumps({
        "inputText": input_text,
        "dimensions": 1024,
        "normalize": True
    })

    # Set the model ID and content type
    accept = "*/*"
    content_type = "application/json"

    # Invoke the Bedrock model to get embeddings
    response = bedrock.invoke_model(
        body=body,
        modelId=model_id,
        accept=accept,
        contentType=content_type
    )

    # Read and parse the response
    response_body = json.loads(response['body'].read())
    embedding = response_body.get('embedding')

    return embedding

if not os.path.exists('moded_with_embeddings_1.csv'):
    # Load the CSV file
    file_path = 'moded.csv'
    data = pd.read_csv(file_path)

    # Initialize the Bedrock Embeddings model
    # embeddings = BedrockEmbeddings()

    # Initialize lists to store the embeddings
    text_embeddings_list = []
    title_embeddings_list = []

    # Iterate over each row in the DataFrame
    for index, row in data.iterrows():
        # Get text and titles
        text = row['text']

        titles = " ".join(ast.literal_eval(row['titles']))  # Convert titles list to a single string

        # Generate the embedding for text if it's not empty
        if text.strip():
            text_embedding = get_bedrock_embeddings(text)
        else:
            text_embedding = []  # Or some default value
        text_embeddings_list.append(text_embedding)

        # Generate the embedding for titles if it's not empty
        if titles.strip():
            title_embedding = get_bedrock_embeddings(titles)
        else:
            title_embedding = []  # Or some default value
        title_embeddings_list.append(title_embedding)

        if (index + 1) % 500 == 0:
            print(f"Processed {index + 1}/{len(data)} rows")

    # Add the embeddings to the DataFrame
    data['text_embedding'] = text_embeddings_list
    data['title_embedding'] = title_embeddings_list

    # Save the updated DataFrame to a new CSV file
    data.to_csv('moded_with_embeddings_1.csv', index=False)
else:
    print("moded_with_embeddings already exists")

Sanity check to see if CSV file with embeddings has correct data.

In [None]:
# Load the CSV file with embeddings
file_path_with_embeddings = 'moded_with_embeddings_1.csv'
data_with_embeddings = pd.read_csv(file_path_with_embeddings)

# Perform the sanity check to count the number of rows
number_of_rows = len(data_with_embeddings)

print(f"The number of rows in the 'moded_with_embeddings.csv' file is: {number_of_rows}")

# Print the first row of the CSV file
first_row = data_with_embeddings.iloc[0]
print("The first row in the 'moded_with_embeddings.csv' file is:")
print(first_row.to_dict())

Connect to RDS instance and install vector extension if not already there.

In [None]:
# Define the connection parameters
connection_params = {
    "your-database-parameters"
}

# Create the connection string
connection_string = " ".join([f"{key}={value}" for key, value in connection_params.items()])

print("Connection string:", connection_string)

# Connect to PostgreSQL database in Timescale using connection string
conn = psycopg2.connect(connection_string)
cur = conn.cursor()

#install pgvector
cur.execute("CREATE EXTENSION IF NOT EXISTS vector");
conn.commit()

print("Connected to RDS instance!")

Registers the vector type with psycopg2, enabling the handling of vector data types within Python.

In [4]:
# Register the vector type with psycopg2
register_vector(conn)

In [None]:
# Function to terminate process holding the lock
def terminate_locking_process(table_name):
    get_pid_query = f"""
    SELECT pid
    FROM pg_locks l
    JOIN pg_class t ON l.relation = t.oid AND t.relkind = 'r'
    WHERE t.relname = '{table_name}';
    """
    cur.execute(get_pid_query)
    locking_process = cur.fetchone()
    if locking_process:
        pid = locking_process[0]
        terminate_query = f"SELECT pg_terminate_backend({pid});"
        cur.execute(terminate_query)
        conn.commit()
        print(f"Terminated process {pid} holding lock on {table_name}.")

# Drop the table if it already exists
drop_table_command = "DROP TABLE IF EXISTS phase_2_embeddings;"

try:
    # Terminate the process holding the lock if any
    terminate_locking_process('phase_2_embeddings')

    # Drop the table if it exists
    cur.execute(drop_table_command)
    conn.commit()
    print("Table dropped if it existed.")
except psycopg2.Error as e:
    print(f"Error creating embeddings table: {e}")
    conn.rollback()

Create our vector table.

In [None]:
# Create table to store embeddings and metadata
table_create_command = """
CREATE TABLE phase_2_embeddings (
            id bigserial primary key,
            doc_id text,
            url text,
            titles jsonb,
            text text,
            links jsonb,
            text_embedding vector(1024),
            title_embedding vector(1024)
            );
            """

cur.execute(table_create_command)
cur.close()
conn.commit()

print("Table created!")

Populate phase_2_embeddings table.

In [None]:
# Load the CSV file with embeddings
print("Loading the CSV file...")
file_path_with_embeddings = 'moded_with_embeddings_1.csv'
data_with_embeddings = pd.read_csv(file_path_with_embeddings)

# Check the type of the embedding columns
text_embedding_type = type(first_row['text_embedding'])
title_embedding_type = type(first_row['title_embedding'])
print(f"The type of the text_embedding column is: {text_embedding_type}")
print(f"The type of the title_embedding column is: {title_embedding_type}")

# Function to convert string representation of list to numpy array
def parse_embedding(embedding_str):
    return np.array(eval(embedding_str))

# Apply the function to the embedding column if it's a string
if isinstance(first_row['text_embedding'], str):
    print("Converting the text_embedding column to numpy arrays...")
    data_with_embeddings['text_embedding'] = data_with_embeddings['text_embedding'].apply(parse_embedding)
    print("Conversion complete.")
else:
    print("Text Embeddings are not strings, they are a list of floats")

if isinstance(first_row['title_embedding'], str):
    print("Converting the title_embedding column to numpy arrays...")
    data_with_embeddings['title_embedding'] = data_with_embeddings['title_embedding'].apply(parse_embedding)
    print("Conversion complete.")
else:
    print("Title Embeddings are not strings, they are a list of floats")

# Verify the conversion
first_row_converted = data_with_embeddings.iloc[0]
print("The first row after converting the 'embedding' column:")
print(f"The first row after converting the 'text_embedding' and 'title_embedding' columns:\n{first_row_converted.to_dict()}")

# Convert 'titles' and 'links' columns to JSON format
data_with_embeddings['titles'] = data_with_embeddings['titles'].apply(json.dumps)
data_with_embeddings['links'] = data_with_embeddings['links'].apply(json.dumps)

# Function to ensure embeddings are lists, not empty, and pad with zeros to the correct dimensionality
def ensure_list_and_pad_embedding(embedding, expected_dim=1024):
    if isinstance(embedding, np.ndarray):
        embedding = embedding.tolist()
    if not embedding:
        embedding = [0] * expected_dim  # Replace empty embeddings with a list of zeros of the correct dimensionality
    elif len(embedding) < expected_dim:
        embedding.extend([0] * (expected_dim - len(embedding)))  # Pad with zeros
    return embedding

data_with_embeddings['text_embedding'] = data_with_embeddings['text_embedding'].apply(ensure_list_and_pad_embedding)
data_with_embeddings['title_embedding'] = data_with_embeddings['title_embedding'].apply(ensure_list_and_pad_embedding)

# Prepare the list of tuples for insertion in batches
batch_size = 500  # Set a smaller batch size
total_rows = len(data_with_embeddings)
num_batches = (total_rows // batch_size) + 1

print(f"Total rows: {total_rows}, Batch size: {batch_size}, Number of batches: {num_batches}")

# Use execute_values to perform batch insertion
for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = min(start_idx + batch_size, total_rows)
    batch_data = data_with_embeddings.iloc[start_idx:end_idx]

    data_list = [(row['doc_id'],
                  row['url'],
                  row['titles'],
                  row['text'],
                  row['links'],
                  row['text_embedding'],
                  row['title_embedding']) for index, row in batch_data.iterrows()]

    print(f"Inserting batch {batch + 1}/{num_batches}...")

    try:
        conn.rollback()
        cur = conn.cursor()
        execute_values(cur, "INSERT INTO phase_2_embeddings (doc_id, url, titles, text, links, text_embedding, title_embedding) VALUES %s", data_list)
    except Exception as e:
        print(f"Error when populating table in batch {batch + 1}: {e}")
        conn.rollback()

    # Commit after each batch
    conn.commit()
    print(f"Batch {batch + 1}/{num_batches} insertion complete!")

print("All batches inserted successfully!")

Perform sanity checks on embeddings table.

In [None]:
cur = conn.cursor()
num_records = 0
try:
    cur.execute("SELECT COUNT(*) as cnt FROM phase_2_embeddings;")
    num_records = cur.fetchone()[0]
    print("Number of vector records in table: ", num_records,"\n")
except:
    print("Error when counting number of rows in embeddings table!")
    conn.rollback()
finally:
    cur.close()

In [None]:
cur = conn.cursor()
try:
    # print the first record in the table, for sanity-checking
    cur.execute("SELECT * FROM phase_2_embeddings LIMIT 1;")
    records = cur.fetchall()
    print("First record in table: ", records)
except:
    print("Error when printing first row in embeddings table!")
    conn.rollback()
finally:
    cur.close()

Apply Indexing

In [None]:
# Drops existing indexes on the embedding column
def drop_existing_indexes():
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT indexname 
            FROM pg_indexes 
            WHERE tablename = 'phase_2_embeddings' 
            AND indexdef LIKE '%embedding%' 
            AND indexdef NOT LIKE '%pkey%';
        """)
        indexes = cur.fetchall()
        for index in indexes:
            cur.execute(f"DROP INDEX IF EXISTS {index[0]};")
        conn.commit()
        print("Dropped existing indexes on embedding column!")
    except Exception as e:
        print("Error dropping existing indexes:", e)
        conn.rollback()
    finally:
        cur.close()

# Create an index on the data for faster retrieval
def create_index(index_method, distance_measure):
    drop_existing_indexes()
    try:
        cur = conn.cursor()
        if index_method == 'hnsw':
            cur.execute(f'CREATE INDEX ON phase_2_embeddings USING hnsw (text_embedding {distance_measure})')
            cur.execute(f'CREATE INDEX ON phase_2_embeddings USING hnsw (title_embedding {distance_measure})')
        elif index_method == 'ivfflat':
            num_lists = num_records / 1000
            if num_lists < 10:
                num_lists = 10
            if num_records > 1000000:
                num_lists = math.sqrt(num_records)

            cur.execute(f'CREATE INDEX ON phase_2_embeddings USING ivfflat (text_embedding {distance_measure}) WITH (lists = {num_lists});')
            cur.execute(f'CREATE INDEX ON phase_2_embeddings USING ivfflat (title_embedding {distance_measure}) WITH (lists = {num_lists});')

        conn.commit()
        print("Created Index!")
    except:
        print("Error when indexing embeddings table!")
        conn.rollback()
    finally:
        cur.close()

create_index('hnsw', 'vector_l2_ops')

Sanity Check to see if index is on embeddings table.

In [None]:
# Perform sanity check to print all indexes on phase_2_embeddings
try:
    cur = conn.cursor()
    cur.execute("SELECT indexname FROM pg_indexes WHERE tablename = 'phase_2_embeddings';")
    indexes = cur.fetchall()
    print("Indexes on phase_2_embeddings table:")
    for index in indexes:
        print(index[0])
except Exception as e:
    print("Error during sanity check:", e)
finally:
    cur.close()

Sanity Check to see details of index on embeddings table.

In [None]:
# Function to get details of an index
def get_index_details(index_name):
    index_details_query = f"""
    SELECT indexname, indexdef
    FROM pg_indexes
    WHERE indexname = '{index_name}';
    """

    try:
        cur = conn.cursor()
        cur.execute(index_details_query)
        index_details = cur.fetchone()
        if index_details:
            print(f"Details of index '{index_name}':")
            print(f"Index Name: {index_details[0]}")
            print(f"Index Definition: {index_details[1]}")
        else:
            print(f"No details found for index '{index_name}'.")
    except:
        print("Error when checking indexing details!")
        conn.rollback()
    finally:
        cur.close()

# Verify details of the created index
get_index_details('phase_2_embeddings_text_embedding_idx')
get_index_details('phase_2_embeddings_title_embedding_idx')

Retrieval Step.

In [None]:
# Get most similar documents from the database
def get_docs(query_embedding, number):
    embedding_array = np.array(query_embedding)
    # Register pgvector extension
    register_vector(conn)
    top_docs = []
    try:
        cur = conn.cursor()
        # Get the top N most similar documents using the KNN <=> operator
        cur.execute("""
                        SELECT doc_id, url, titles, text, links, text_similarity, title_similarity, (text_similarity + title_similarity) as total_similarity
                        FROM (
                            SELECT doc_id, url, titles, text, links, 
                                   text_embedding <=> %s AS text_similarity,
                                   title_embedding <=> %s AS title_similarity
                            FROM phase_2_embeddings
                        ) sub
                        ORDER BY total_similarity
                        LIMIT %s
                    """, (embedding_array, embedding_array, number))
        results = cur.fetchall()
        for result in results:
            doc_dict = {"doc_id": result[0],
                        "url": result[1],
                        "titles": result[2],
                        "text": result[3],
                        "links": result[4],
                        "text_similarity": result[5],
                        "title_similarity": result[6],
                        "total_similarity": result[7]}
            top_docs.append(doc_dict)
        cur.close()
    except Exception as e:
        print(f"Error when retrieving: {e}")
        conn.rollback()
    finally:
        cur.close()
    return top_docs

# Initialize the Bedrock Embeddings model
embeddings = BedrockEmbeddings()

text = "Does physics 100 count for the arts requirement?"
# text = "What are all the faculties at UBC?"
docs = get_docs(embeddings.embed_query(text), 5)
for idx, doc in enumerate(docs, 1):
    print(f"\nDocument {idx}:\n{doc['doc_id']}\n{doc['url']}\n{doc['titles']}\n{doc['text']}\n{doc['links']}\n{doc['text_similarity']}\n{doc['title_similarity']}\n{doc['total_similarity']}")

In [None]:
# Get most similar documents from the database
def get_docs(query_embedding, number, embedding_column):
    embedding_array = np.array(query_embedding)
    # Register pgvector extension
    register_vector(conn)
    top_docs = []
    try:
        cur = conn.cursor()
        # Get the top N most similar documents using the KNN <=> operator
        cur.execute(f"""
                        SELECT doc_id, url, titles, text, links, {embedding_column} <=> %s AS similarity
                        FROM phase_2_embeddings
                        ORDER BY similarity
                        LIMIT %s
                    """, (embedding_array, number))
        results = cur.fetchall()
        for result in results:
            doc_dict = {"doc_id": result[0],
                        "url": result[1],
                        "titles": ast.literal_eval(result[2]),
                        "text": result[3],
                        "links": ast.literal_eval(result[4]),
                        "score": result[5]}
            top_docs.append(doc_dict)
        cur.close()
    except Exception as e:
        print(f"Error when retrieving: {e}")
        conn.rollback()
    finally:
        cur.close()
    return top_docs

def get_combined_docs(query_embedding, number):
    text_docs = get_docs(query_embedding, number, embedding_column='text_embedding')
    title_docs = get_docs(query_embedding, number, embedding_column='title_embedding')

    # Combine documents, avoiding duplicates
    combined_docs = {}
    for doc in text_docs:
        doc_id = doc['doc_id']
        if doc_id not in combined_docs:
            combined_docs[doc_id] = doc
        else:
            # Choose lower score if document already exists
            combined_docs[doc_id]['score'] = min(combined_docs[doc_id]['score'], doc['score'])

    for doc in title_docs:
        doc_id = doc['doc_id']
        if doc_id not in combined_docs:
            combined_docs[doc_id] = doc
        else:
            # Choose lower score if document already exists
            combined_docs[doc_id]['score'] = min(combined_docs[doc_id]['score'], doc['score'])

    # Sort documents by score in ascending order (since lower score indicates higher similarity)
    sorted_docs = sorted(combined_docs.values(), key=lambda x: x['score'])

    return sorted_docs[:number]

# Initialize the Bedrock Embeddings model
embeddings = BedrockEmbeddings()

text = "Does physics 100 count for the arts requirement?"
# text = "What are all the faculties at UBC?"
docs = get_combined_docs(embeddings.embed_query(text), 5)
for idx, doc in enumerate(docs, 1):
    print(f"\nDocument {idx}:\n{doc['doc_id']}\n{doc['url']}\n{doc['titles']}\n{doc['text']}\n{doc['links']}\n{doc['score']}")

Putting everything together.

In [None]:
# Defining Constants
LLAMA_3_8B = "meta.llama3-8b-instruct-v1:0"
LLAMA_3_70B = "meta.llama3-70b-instruct-v1:0"
MISTRAL_7B = "mistral.mistral-7b-instruct-v0:2"
MISTRAL_LARGE = "mistral.mistral-large-2402-v1:0"
LLAMA_3_1_8B = "meta.llama3-1-8b-instruct-v1:0"
LLAMA_3_1_70B = "meta.llama3-1-70b-instruct-v1:0"

# Convert query into embedding
def get_bedrock_embeddings(input_text, model_id="amazon.titan-embed-text-v2:0", region_name="us-west-2"):
    # Initialize the boto3 client for Bedrock
    bedrock = boto3.client(
        service_name='bedrock-runtime',
        region_name=region_name
    )

    # Prepare the prompt and request body
    body = json.dumps({
        "inputText": input_text,
        "dimensions": 1024,
        "normalize": True
    })

    # Set the model ID and content type
    accept = "*/*"
    content_type = "application/json"

    # Invoke the Bedrock model to get embeddings
    response = bedrock.invoke_model(
        body=body,
        modelId=model_id,
        accept=accept,
        contentType=content_type
    )

    # Read and parse the response
    response_body = json.loads(response['body'].read())
    embedding = response_body.get('embedding')

    # Print the embedding
    print(embedding)
    return embedding

# Format all texts in the doc as one string when we pass prompt to LLM
def format_docs(docs):
    formatted_docs = "\n".join([f"Document {idx}:\n{doc['text']}" for idx, doc in enumerate(docs, 1)])
    return formatted_docs

# Get most similar documents from the database
def get_docs(query_embedding, number, embedding_column):
    embedding_array = np.array(query_embedding)
    # Register pgvector extension
    register_vector(conn)
    top_docs = []
    try:
        cur = conn.cursor()
        # Get the top N most similar documents using the KNN <=> operator
        cur.execute(f"""
                        SELECT doc_id, url, titles, text, links, {embedding_column} <=> %s AS similarity
                        FROM phase_2_embeddings
                        ORDER BY similarity
                        LIMIT %s
                    """, (embedding_array, number))
        results = cur.fetchall()
        for result in results:
            doc_dict = {"doc_id": result[0],
                        "url": result[1],
                        "titles": ast.literal_eval(result[2]),
                        "text": result[3],
                        "links": ast.literal_eval(result[4]),
                        "score": result[5]}
            top_docs.append(doc_dict)
        cur.close()
    except Exception as e:
        print(f"Error when retrieving: {e}")
        conn.rollback()
    finally:
        cur.close()
    return top_docs

def get_combined_docs(query_embedding, number):
    text_docs = get_docs(query_embedding, number, embedding_column='text_embedding')
    title_docs = get_docs(query_embedding, number, embedding_column='title_embedding')

    # Combine documents, avoiding duplicates
    combined_docs = {}
    for doc in text_docs:
        doc_id = doc['doc_id']
        if doc_id not in combined_docs:
            combined_docs[doc_id] = doc
        else:
            # Choose lower score if document already exists
            combined_docs[doc_id]['score'] = min(combined_docs[doc_id]['score'], doc['score'])

    for doc in title_docs:
        doc_id = doc['doc_id']
        if doc_id not in combined_docs:
            combined_docs[doc_id] = doc
        else:
            # Choose lower score if document already exists
            combined_docs[doc_id]['score'] = min(combined_docs[doc_id]['score'], doc['score'])

    # Sort documents by score in ascending order (since lower score indicates higher similarity)
    sorted_docs = sorted(combined_docs.values(), key=lambda x: x['score'])

    return sorted_docs

# Split documents based on character limit, 8,000 tokens is roughly 32,000 characters
def split_docs(docs, max_chars=25000):
    total_length = len(format_docs(docs))
    print(total_length)
    removed_docs = []

    while total_length > max_chars and docs:
        removed_doc = docs.pop()
        removed_docs.append(removed_doc)
        total_length = len(format_docs(docs))
    return {"docs": docs, "removed_docs": removed_docs}

def check_if_documents_relates(docs, user_prompt, llm):

    system_prompt = "Provide a short explaination if the document is relevant to the question or not."

    doc_relates = []
    for doc in docs:
        if llm.model_id == LLAMA_3_8B or llm.model_id == LLAMA_3_70B or llm.model_id == LLAMA_3_1_8B or llm.model_id == LLAMA_3_1_70B:
            prompt = f"""
                <|begin_of_text|>
                <|start_header_id|>system<|end_header_id|>
                {system_prompt}
                <|eot_id|>
                <|start_header_id|>question<|end_header_id|>
                {user_prompt}
                <|eot_id|>
                <|start_header_id|>document<|end_header_id|>
                {doc['text']}
                <|eot_id|>
                <|start_header_id|>assistant<|end_header_id|>
                """
        else:
            prompt = f"""Here is a queston that a user asked: {user_prompt}.
                Here is the text from a document: {doc['text']}.
                {system_prompt}. Only generate one human readable answer that is concise.
                """
        response = llm.invoke(prompt).strip()

        doc_info = {"doc_id": doc['doc_id'],
                    "url": doc['url'],
                    "titles": doc['titles'],
                    "text": doc['text'],
                    "links": doc['links'],
                    "relate": response}
        doc_relates.append(doc_info)

    return doc_relates

def answer_prompt(user_prompt, number_of_docs):

    # Record the start times
    total_start_time = time.time()
    answer_start_time = time.time()

    # Initialize the Bedrock Embeddings model
    # embeddings = BedrockEmbeddings()
    embedding = get_bedrock_embeddings(user_prompt)

    docs = get_combined_docs(embedding, number_of_docs)

    divided_docs = split_docs(docs)
    print(len(divided_docs["docs"]))

    documents = format_docs(divided_docs["docs"])

    # Get the LLM we want to invoke
    llm = BedrockLLM(
                        model_id=LLAMA_3_8B
                    )

    system_prompt = "You are a helpful UBC student advising assistant who answers with kindness while being concise."
    # system_prompt = "You are a helpful UBC student advising assistant who answers with kindness while being concise. If the question does not relate to UBC, respond with 'IDK.'"
    # system_prompt = """You are a helpful UBC student advising assistant. 
    #                    Using the documents given to you, consicely answer the user's prompt with kindness. 
    #                    If the question does not relate to UBC, respond with 'IDK.'"""

    if llm.model_id == LLAMA_3_8B or llm.model_id == LLAMA_3_70B or llm.model_id == LLAMA_3_1_8B or llm.model_id == LLAMA_3_1_70B:
        prompt = f"""
            <|begin_of_text|>
            <|start_header_id|>system<|end_header_id|>
            {system_prompt}
            <|eot_id|>
            <|start_header_id|>user<|end_header_id|>
            {user_prompt}
            <|eot_id|>
            <|start_header_id|>documents<|end_header_id|>
            {documents}
            <|eot_id|>
            <|start_header_id|>assistant<|end_header_id|>
            """
    else:
        prompt = f"""{system_prompt}. Provide your answer as if you are talking to a student.
            Here is the question: {user_prompt}.
            Here are the source documents: {documents}
            """

    answer = llm.invoke(prompt)

    # Record the end time and find duration of answer only
    answer_end_time = time.time()
    answer_duration = answer_end_time - answer_start_time

    check_docs = check_if_documents_relates(divided_docs["docs"], user_prompt, llm)
    check_additional_docs = check_if_documents_relates(divided_docs["removed_docs"], user_prompt, llm)

    # Record the end time and find duration of the total time of checking over each document
    total_end_time = time.time()
    total_duration = total_end_time - total_start_time

    return {"answer": answer, "docs": check_docs, "additional_docs": check_additional_docs, "answer_time": answer_duration, "total_time": total_duration}

# Neatly prints dictionary returned by answer_prompt
def neat_print(response):
    print(f"Answer: {response['answer']}\n")
    
    print("Documents:")
    for doc in response['docs']:
        print(f"doc_id: {doc['doc_id']}")
        print(f"URL: {doc['url']}")
        print(f"Titles: {doc['titles']}")
        print(f"Text:\n{doc['text']}")
        print(f"Links:{doc['links']}")
        print(f"Relevance: {doc['relate']}")
    
    print("_________________________________________________________________________________________________________")
    
    print("Additional Documents:")
    for doc in response['additional_docs']:
        print(f"doc_id: {doc['doc_id']}")
        print(f"URL: {doc['url']}")
        print(f"Titles: {doc['titles']}")
        print(f"Text:\n{doc['text']}")
        print(f"Links:{doc['links']}")
        print(f"Relevance: {doc['relate']}")
    
    print(f"answer_time: {response['answer_time']}\n")
    print(f"total_time: {response['total_time']}\n")

response = answer_prompt("Does physics 100 count for the arts requirement?", 5)

neat_print(response)

In [17]:
response = answer_prompt("What are all the specializations in the Faculty of Applied Sciences?", 5)

neat_print(response)

[-0.040755432, 0.034010693, -0.0144222565, -0.04391254, -0.030279564, -0.017794624, 0.02138225, 0.04190347, -0.01112164, 0.025113381, 0.003157111, 0.0057402016, 0.026835442, 0.02195627, 0.027983483, -0.057115003, -0.00050002534, -0.028988017, 0.010691125, 0.03788533, -0.01937318, -0.015857307, 0.010691125, -0.009830095, -0.006816489, 0.008072158, 0.026404927, 0.011049888, 0.09930549, -0.008502673, 0.02453936, 0.009148446, -0.07749272, -0.007426386, 0.060272116, 0.052235834, 0.0027445338, -0.024108846, 0.0456346, 0.011623908, 0.025113381, -0.008215663, 0.0005785047, 0.018942665, 0.06285521, 0.0099736005, 0.07864076, 0.064003244, -0.025974412, -0.0067088604, -0.0027445338, -0.013919989, 0.020808231, 0.06371624, 0.048504703, 0.031284098, 0.0032468014, -0.0047356663, 0.06658634, -0.09586137, -0.051087793, -0.027122453, -0.0004977831, 0.03329317, -0.050513774, -0.02138225, -0.0076416433, 0.0037849455, 0.008646178, 0.0077851485, -0.013130711, -0.0067088604, -0.046208624, -0.030566573, -0.034