In [1]:
import os
import json
import glob
from typing import List
from tqdm import tqdm
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from chromadb.config import Settings

# Define the folder for storing the database
PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY', 'db')

# Define the Chroma settings
CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False
)

# Configuration
source_directory = os.environ.get('SOURCE_DIRECTORY', 'External_Database')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME', 'all-MiniLM-L6-v2')

# Optimized parameters for text splitting
chunk_size = 300
chunk_overlap = 30
max_files_to_process = 5  # Limit for debugging

# Function to load a single PDF document
def load_single_pdf(file_path: str) -> List[Document]:
    print(f"Loading file: {file_path}")
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    print(f"Finished loading file: {file_path}")
    return documents

# Function to load all PDF documents from the source directory
def load_pdf_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    all_pdf_files = glob.glob(os.path.join(source_dir, "**/*.pdf"), recursive=True)[:max_files_to_process]
    filtered_files = [file_path for file_path in all_pdf_files if file_path not in ignored_files]

    print(f"Found {len(filtered_files)} PDF files to process.")

    results = []
    for file_path in tqdm(filtered_files, desc='Loading PDF documents', ncols=80):
        results.extend(load_single_pdf(file_path))

    return results

# Function to process and split documents into text chunks
def process_documents(ignored_files: List[str] = []) -> List[Document]:
    print(f"Loading PDF documents from {source_directory}")
    documents = load_pdf_documents(source_directory, ignored_files)
    if not documents:
        print("No new documents to load")
        exit(0)
    print(f"Loaded {len(documents)} PDF documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts

# Function to generate embeddings and save them along with the text content
def generate_and_save_embeddings(file_path: str):
    # Load the embeddings model
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    
    # Process your documents to get the text chunks
    texts = process_documents()
    text_contents = [text.page_content for text in texts]
    
    # Generate embeddings for the text contents
    embeddings_list = embeddings.embed_documents(text_contents)
    
    # Create a data structure to save both embeddings and text
    data_to_save = [
        {"embedding": embedding, "text": text}
        for embedding, text in zip(embeddings_list, text_contents)
    ]
    
    # Save the data to a JSON file
    with open(file_path, "w") as file:
        json.dump(data_to_save, file)
    
    print(f"Saved {len(data_to_save)} embeddings and texts to {file_path}")

# Specify the file path for saving the embeddings
file_path = "embeddings_output.json"

# Call the function to generate and save embeddings
generate_and_save_embeddings(file_path)


  embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
  from tqdm.autonotebook import tqdm, trange



Loading PDF documents from External_Database
Found 3 PDF files to process.


Loading PDF documents:   0%|                              | 0/3 [00:00<?, ?it/s]

Loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf


Loading PDF documents: 100%|██████████████████████| 3/3 [00:00<00:00,  8.73it/s]

Finished loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Loading file: External_Database\Job_description.pdf
Finished loading file: External_Database\Job_description.pdf
Loading file: External_Database\Student NL Sept 19, 2024.pdf
Finished loading file: External_Database\Student NL Sept 19, 2024.pdf
Loaded 14 PDF documents from External_Database
Split into 71 chunks of text (max. 300 tokens each)





Saved 71 embeddings and texts to embeddings_output.json


In [2]:
import os
import glob
from typing import List
from multiprocessing import Pool
from tqdm import tqdm
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from chromadb.config import Settings

# Define the folder for storing the database
PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY', 'db')

# Define the Chroma settings
CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False
)

# Configuration
source_directory = os.environ.get('SOURCE_DIRECTORY', 'External_Database')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME', 'all-MiniLM-L6-v2')

# Optimized parameters for debugging
chunk_size = 300
chunk_overlap = 30
max_files_to_process = 5  # Only process a few files for testing

# Function to check if a vector store exists
def does_vectorstore_exist(persist_directory: str) -> bool:
    if os.path.exists(os.path.join(persist_directory, 'index')):
        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
            if len(list_index_files) > 3:
                return True
    return False

# Function to load a single PDF document with debug info
def load_single_pdf(file_path: str) -> List[Document]:
    print(f"Loading file: {file_path}")
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    print(f"Finished loading file: {file_path}")
    return documents

# Function to load all PDF documents from the source directory with debug info
def load_pdf_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    all_pdf_files = glob.glob(os.path.join(source_dir, "**/*.pdf"), recursive=True)[:max_files_to_process]
    filtered_files = [file_path for file_path in all_pdf_files if file_path not in ignored_files]

    print(f"Found {len(filtered_files)} PDF files to process.")

    # Temporarily disable multiprocessing for debugging
    results = []
    for file_path in tqdm(filtered_files, desc='Loading PDF documents', ncols=80):
        results.extend(load_single_pdf(file_path))  # Direct call without multiprocessing

    return results

# Function to process and split documents into chunks
def process_documents(ignored_files: List[str] = []) -> List[Document]:
    print(f"Loading PDF documents from {source_directory}")
    documents = load_pdf_documents(source_directory, ignored_files)
    if not documents:
        print("No new documents to load")
        exit(0)
    print(f"Loaded {len(documents)} PDF documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts

# Main function to create or update the vector store
def main():
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

    if does_vectorstore_exist(PERSIST_DIRECTORY):
        print(f"Appending to existing vector store at {PERSIST_DIRECTORY}")
        db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
        collection = db.get()
        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
    else:
        print("Creating new vector store")
        texts = process_documents()
    
    # Print a random chunk of text
    import random
    if texts:
        random_chunk = random.choice(texts)
        print("Sample extracted text chunk:", random_chunk.page_content)
    
    print(f"Creating embeddings. May take some minutes...")
    db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY)
    db.persist()
    db = None

    

if __name__ == "__main__":
    main()


Creating new vector store
Loading PDF documents from External_Database
Found 3 PDF files to process.


Loading PDF documents: 100%|██████████████████████| 3/3 [00:00<00:00, 37.18it/s]

Loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Finished loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Loading file: External_Database\Job_description.pdf
Finished loading file: External_Database\Job_description.pdf
Loading file: External_Database\Student NL Sept 19, 2024.pdf
Finished loading file: External_Database\Student NL Sept 19, 2024.pdf
Loaded 14 PDF documents from External_Database
Split into 71 chunks of text (max. 300 tokens each)
Sample extracted text chunk: credit-hour program. By the time you graduate, your toolbox will allow you to pick the best approach
to solve the problems you face.
Career Possibilities
If it seems like you're seeing a lot of job postings for data scientists and business intelligence analysts,
Creating embeddings. May take some minutes...



  db.persist()


In [3]:
# Chunk to show the embeddings
def show_embeddings():
    # Load the embeddings model
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

    # Load the vector store
    db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
    
    # Fetch some documents and extract the text content
    texts = process_documents()  # Ensure this function provides the chunks you want
    text_contents = [text.page_content for text in texts]

    # Generate embeddings for the text contents
    embeddings_list = embeddings.embed_documents(text_contents)

    # Display the first few embeddings for inspection
    for i, embedding in enumerate(embeddings_list[:5]):
        print(f"Embedding {i + 1}: {embedding[:10]}...")  # Print the first 10 values for brevity

# Call the function to show the embeddings
show_embeddings()


  db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)


Loading PDF documents from External_Database
Found 3 PDF files to process.


Loading PDF documents: 100%|██████████████████████| 3/3 [00:00<00:00, 31.37it/s]


Loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Finished loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Loading file: External_Database\Job_description.pdf
Finished loading file: External_Database\Job_description.pdf
Loading file: External_Database\Student NL Sept 19, 2024.pdf
Finished loading file: External_Database\Student NL Sept 19, 2024.pdf
Loaded 14 PDF documents from External_Database
Split into 71 chunks of text (max. 300 tokens each)
Embedding 1: [0.04865654557943344, 0.03969587758183479, -0.04392899572849274, 0.04689142480492592, -0.03219174966216087, -0.02440021000802517, 0.07106217741966248, -0.016942642629146576, -0.07020010054111481, 0.012026670388877392]...
Embedding 2: [-0.02117951214313507, -0.0628172755241394, 0.05858604982495308, -0.0014558365801349282, 0.006836882792413235, -0.037297822535037994, -0.016918009147047997, 0.10872379690408707, -0.10490531474351883, 0.07856155186891556

In [4]:
import json

# Chunk to save the embeddings with their associated text content to a file
def save_embeddings_to_file(filename="embeddings_with_text.json"):
    # Load the embeddings model
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

    # Load the vector store
    db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
    
    # Fetch some documents and extract the text content
    texts = process_documents()  # Ensure this function provides the chunks you want
    text_contents = [text.page_content for text in texts]

    # Generate embeddings for the text contents
    embeddings_list = embeddings.embed_documents(text_contents)

    # Create a list of dictionaries to store embeddings and text content
    embeddings_with_text = [
        {"embedding": embedding, "text": text}
        for embedding, text in zip(embeddings_list, text_contents)
    ]

    # Save to a JSON file
    with open(filename, "w") as file:
        json.dump(embeddings_with_text, file)
    print(f"Embeddings and text content saved to {filename}")

# Call the function to save the embeddings to a file
save_embeddings_to_file()


Loading PDF documents from External_Database
Found 3 PDF files to process.


Loading PDF documents: 100%|██████████████████████| 3/3 [00:00<00:00, 28.13it/s]

Loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Finished loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Loading file: External_Database\Job_description.pdf
Finished loading file: External_Database\Job_description.pdf
Loading file: External_Database\Student NL Sept 19, 2024.pdf
Finished loading file: External_Database\Student NL Sept 19, 2024.pdf
Loaded 14 PDF documents from External_Database
Split into 71 chunks of text (max. 300 tokens each)





Embeddings and text content saved to embeddings_with_text.json


In [5]:
import json

def show_embeddings_and_save():
    # Load the embeddings model
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

    # Load the vector store
    db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
    
    # Fetch some documents and extract the text content
    texts = process_documents()  # Ensure this function provides the chunks you want
    text_contents = [text.page_content for text in texts]

    # Generate embeddings for the text contents
    embeddings_list = embeddings.embed_documents(text_contents)

    # Save the embeddings to a file
    with open("embeddings_output.json", "w") as file:
        json.dump(embeddings_list, file)
    
    # Display the first few embeddings for inspection
    for i, embedding in enumerate(embeddings_list[:5]):
        print(f"Embedding {i + 1}: {embedding[:10]}...")  # Print the first 10 values for brevity

# Call the function to show and save the embeddings
show_embeddings_and_save()


Loading PDF documents from External_Database
Found 3 PDF files to process.


Loading PDF documents: 100%|██████████████████████| 3/3 [00:00<00:00, 24.41it/s]

Loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Finished loading file: External_Database\clarkson_edu_academics_majors_minors_data_science_grad.pdf
Loading file: External_Database\Job_description.pdf
Finished loading file: External_Database\Job_description.pdf
Loading file: External_Database\Student NL Sept 19, 2024.pdf
Finished loading file: External_Database\Student NL Sept 19, 2024.pdf
Loaded 14 PDF documents from External_Database
Split into 71 chunks of text (max. 300 tokens each)





Embedding 1: [0.04865654557943344, 0.03969587758183479, -0.04392899572849274, 0.04689142480492592, -0.03219174966216087, -0.02440021000802517, 0.07106217741966248, -0.016942642629146576, -0.07020010054111481, 0.012026670388877392]...
Embedding 2: [-0.02117951214313507, -0.0628172755241394, 0.05858604982495308, -0.0014558365801349282, 0.006836882792413235, -0.037297822535037994, -0.016918009147047997, 0.10872379690408707, -0.10490531474351883, 0.07856155186891556]...
Embedding 3: [-0.020607495680451393, 0.01029456127434969, 0.009476795792579651, 0.055719681084156036, 0.0006447816849686205, -0.053860221058130264, -0.03251653537154198, 0.02800251543521881, -0.02673499658703804, 0.03464403748512268]...
Embedding 4: [-0.030184490606188774, -0.038931507617235184, 0.02843565307557583, 0.06997009366750717, 0.01904807612299919, -0.09459130465984344, -0.0783746987581253, 0.0028394702821969986, -0.0036320751532912254, 0.03443096578121185]...
Embedding 5: [0.018685881048440933, 0.00066222867462784

In [None]:
from langchain_groq import ChatGroq
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json

# Load the embeddings and text content from the JSON file
def load_embeddings_from_file(filename="embeddings_with_text.json"):
    with open(filename, "r") as file:
        embeddings_with_text = json.load(file)
    return embeddings_with_text

# Function to retrieve the top relevant text chunks
def retrieve_relevant_text(sample_query, embeddings_with_text, embeddings_model):
    # Generate embedding for the sample query
    query_embedding = embeddings_model.embed_documents([sample_query])[0]

    # Extract embeddings and text content from the loaded data
    all_embeddings = [item["embedding"] for item in embeddings_with_text]
    all_texts = [item["text"] for item in embeddings_with_text]

    # Compute cosine similarity between the query embedding and all stored embeddings
    similarities = cosine_similarity([query_embedding], all_embeddings)[0]

    # Find the indices of the top 3 most similar text chunks
    top_indices = np.argsort(similarities)[-3:][::-1]

    # Collect the top relevant text chunks
    relevant_texts = [all_texts[index] for index in top_indices]
    return relevant_texts

# Load the embeddings and text content
embeddings_with_text = load_embeddings_from_file()

# Load your embeddings model
embeddings_model = HuggingFaceEmbeddings(model_name=embeddings_model_name)

# Initialize your LLM
llm = ChatGroq(
    temperature=0,
    groq_api_key='',
    model_name="llama3-8b-8192"
)

# Sample query
sample_query = "how to make appoitment for international center"

# Retrieve relevant text chunks using embeddings
relevant_text_chunks = retrieve_relevant_text(sample_query, embeddings_with_text, embeddings_model)

# Format the context to pass to the LLM
context = "\n".join(relevant_text_chunks)

# Use the LLM with the retrieved context
response = llm.invoke(f"Context: {context}\n\nQuestion: {sample_query}")
print(response.content)


To make an appointment for the International Center, you can follow these steps:

1. Go to Handshake: The International Center uses Handshake, a career and internship platform, to schedule appointments. You can access Handshake through the Clarkson University website or by searching for "Handshake" in your university's portal.
2. Search for Appointments: In Handshake, search for "International Center" or "Study Abroad" to find available appointments.
3. Choose an Appointment Type: Select the type of appointment you want to schedule, such as a one-on-one appointment or a drop-in hour.
4. Choose a Date and Time: Select a date and time that works for you from the available options.
5. Fill Out the Appointment Form: Fill out the appointment form with your name, email, and any additional information requested.
6. Confirm Your Appointment: Once you've submitted the appointment form, you'll receive a confirmation email with the details of your appointment.

Alternatively, you can also schedul

In [None]:
from langchain_groq import ChatGroq
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json

# Load the embeddings and text content from the JSON file
def load_embeddings_from_file(filename="embeddings_with_text.json"):
    with open(filename, "r") as file:
        embeddings_with_text = json.load(file)
    return embeddings_with_text

# Function to retrieve the top relevant text chunks
def retrieve_relevant_text(sample_query, embeddings_with_text, embeddings_model):
    # Generate embedding for the sample query
    query_embedding = embeddings_model.embed_documents([sample_query])[0]

    # Extract embeddings and text content from the loaded data
    all_embeddings = [item["embedding"] for item in embeddings_with_text]
    all_texts = [item["text"] for item in embeddings_with_text]

    # Compute cosine similarity between the query embedding and all stored embeddings
    similarities = cosine_similarity([query_embedding], all_embeddings)[0]

    # Find the indices of the top 3 most similar text chunks
    top_indices = np.argsort(similarities)[-3:][::-1]

    # Collect the top relevant text chunks
    relevant_texts = [all_texts[index] for index in top_indices]
    return relevant_texts

# Load the embeddings and text content
embeddings_with_text = load_embeddings_from_file()

# Load your embeddings model
embeddings_model = HuggingFaceEmbeddings(model_name=embeddings_model_name)

# Initialize your LLM
llm = ChatGroq(
    temperature=0,
    groq_api_key='..',
    model_name="llama3-8b-8192"
)

# Sample query
sample_query = "who is boris?"

# Retrieve relevant text chunks using embeddings
relevant_text_chunks = retrieve_relevant_text(sample_query, embeddings_with_text, embeddings_model)

# Format the context to pass to the LLM
context = "\n".join(relevant_text_chunks)

# Use the LLM with the retrieved context
response = llm.invoke(f"Context: {context}\n\nQuestion: {sample_query}")
print(response.content)


Boris Jukic is the Director of Business Analytics at Clarkson University.
