In [2]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [3]:
import os
import nest_asyncio
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings, SimpleDirectoryReader, Document, VectorStoreIndex, get_response_synthesizer, PromptTemplate
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter, MarkdownNodeParser
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from IPython.display import display, Markdown

  from tqdm.autonotebook import tqdm


In [4]:
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

In [27]:
client = OpenAI(model="gpt-4o-mini", temperature=0)
embedding = OpenAIEmbedding(model="text-embedding-ada-002")
Settings.llm = client
Settings.embed_model = embedding
Settings.chunk_size_limit = 1536

In [28]:
documents = SimpleDirectoryReader('../data_collected/raw', recursive=True).load_data()
documents[0]

Document(id_='02856445-7aaa-4e8e-9890-4f7f7ab0c7e8', embedding=None, metadata={'file_path': 'c:\\Users\\timytapilla\\Documents\\BYU-Idaho\\BYUI 2025 Term2\\CSE499-Senior Project\\cuschatai\\notebooks\\..\\data_collected\\raw\\business_plan.md', 'file_name': 'business_plan.md', 'file_size': 4422, 'creation_date': '2025-03-29', 'last_modified_date': '2025-03-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\nDunder Mifflin Business Plan\n\r', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [29]:
df = pd.DataFrame([d.metadata for d in documents])
df.head(5)

Unnamed: 0,file_path,file_name,file_size,creation_date,last_modified_date
0,c:\Users\timytapilla\Documents\BYU-Idaho\BYUI ...,business_plan.md,4422,2025-03-29,2025-03-29
1,c:\Users\timytapilla\Documents\BYU-Idaho\BYUI ...,business_plan.md,4422,2025-03-29,2025-03-29
2,c:\Users\timytapilla\Documents\BYU-Idaho\BYUI ...,business_plan.md,4422,2025-03-29,2025-03-29
3,c:\Users\timytapilla\Documents\BYU-Idaho\BYUI ...,business_plan.md,4422,2025-03-29,2025-03-29
4,c:\Users\timytapilla\Documents\BYU-Idaho\BYUI ...,business_plan.md,4422,2025-03-29,2025-03-29


In [30]:
df2 = pd.DataFrame(documents)
df2.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,"(id_, 02856445-7aaa-4e8e-9890-4f7f7ab0c7e8)","(embedding, None)","(metadata, {'file_path': 'c:\Users\timytapilla...","(excluded_embed_metadata_keys, [file_name, fil...","(excluded_llm_metadata_keys, [file_name, file_...","(relationships, {})","(text, \n\nDunder Mifflin Business Plan\n\r)","(mimetype, text/plain)","(start_char_idx, None)","(end_char_idx, None)","(text_template, {metadata_str}\n\n{content})","(metadata_template, {key}: {value})","(metadata_seperator, \n)"
1,"(id_, 27ecd6a0-d056-4585-b933-23d1db174c28)","(embedding, None)","(metadata, {'file_path': 'c:\Users\timytapilla...","(excluded_embed_metadata_keys, [file_name, fil...","(excluded_llm_metadata_keys, [file_name, file_...","(relationships, {})","(text, \n\n1. Executive Summary\nDunder Miffli...","(mimetype, text/plain)","(start_char_idx, None)","(end_char_idx, None)","(text_template, {metadata_str}\n\n{content})","(metadata_template, {key}: {value})","(metadata_seperator, \n)"
2,"(id_, 8d495b91-6194-40ec-b4d6-1e05f7e1face)","(embedding, None)","(metadata, {'file_path': 'c:\Users\timytapilla...","(excluded_embed_metadata_keys, [file_name, fil...","(excluded_llm_metadata_keys, [file_name, file_...","(relationships, {})","(text, \n\n2. Company Description\nCompany Nam...","(mimetype, text/plain)","(start_char_idx, None)","(end_char_idx, None)","(text_template, {metadata_str}\n\n{content})","(metadata_template, {key}: {value})","(metadata_seperator, \n)"
3,"(id_, 66b2d7ee-840e-4149-b8f7-ef4b10c942cd)","(embedding, None)","(metadata, {'file_path': 'c:\Users\timytapilla...","(excluded_embed_metadata_keys, [file_name, fil...","(excluded_llm_metadata_keys, [file_name, file_...","(relationships, {})","(text, \n\n3. Market Analysis\n)","(mimetype, text/plain)","(start_char_idx, None)","(end_char_idx, None)","(text_template, {metadata_str}\n\n{content})","(metadata_template, {key}: {value})","(metadata_seperator, \n)"
4,"(id_, cf6197e0-b0dc-4f54-9a19-8c5ae840235e)","(embedding, None)","(metadata, {'file_path': 'c:\Users\timytapilla...","(excluded_embed_metadata_keys, [file_name, fil...","(excluded_llm_metadata_keys, [file_name, file_...","(relationships, {})","(text, \n\nIndustry Overview\nThe paper indust...","(mimetype, text/plain)","(start_char_idx, None)","(end_char_idx, None)","(text_template, {metadata_str}\n\n{content})","(metadata_template, {key}: {value})","(metadata_seperator, \n)"


In [31]:
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)

In [33]:
existing_indices = pinecone_client.list_indexes()
print("Existing indices:", existing_indices)

index_name = "cuschatai"
if index_name not in existing_indices:
    pinecone_client.create_index(index_name, dimension=1536, spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )) 

Existing indices: [{
    "name": "hints",
    "dimension": 768,
    "metric": "cosine",
    "host": "hints-xnv36h9.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}, {
    "name": "chatbot-index",
    "dimension": 1536,
    "metric": "cosine",
    "host": "chatbot-index-xnv36h9.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}]


In [34]:
pinecone_index = pinecone_client.Index("cuschatai")
vector_store = PineconeVectorStore(pinecone_index )

In [35]:
pipeline = IngestionPipeline(transformations=[SentenceSplitter(chunk_size=1536, chunk_overlap=20), embedding], vector_store=vector_store)

In [36]:
pipeline.run(documents=documents)

Upserted vectors: 100%|██████████| 143/143 [00:02<00:00, 56.25it/s]


[TextNode(id_='7dc40fc2-1943-4813-aa36-09c296b6d205', embedding=[-0.020523877814412117, -0.006796408444643021, -0.01376999169588089, -0.008008280768990517, -0.01529369130730629, 0.030559033155441284, -0.03191973268985748, -0.02408153936266899, -0.009291023947298527, -0.010339895263314247, 0.032373297959566116, 0.030672425404191017, -0.008844544179737568, 0.021289270371198654, -0.0066015166230499744, 0.012288813479244709, 0.0026930503081530333, -0.010276112705469131, 0.014145600609481335, -0.013614078052341938, -0.013897556811571121, 0.010226503945887089, 0.011984073556959629, -0.005754623096436262, 0.00040661514503881335, -0.001336780609562993, 0.010269026271998882, -0.024889454245567322, -0.005899906158447266, 0.017759958282113075, 0.008901240304112434, -0.008383890613913536, -0.0032936714123934507, -0.027653373777866364, -0.010545417666435242, -0.008447673171758652, -0.007189735770225525, -0.01576143130660057, 0.034414347261190414, -0.010346982628107071, -0.018964743241667747, -0.004

In [37]:
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

In [41]:
prompt_template = (
    "You are a helpful and friendly chatbot specialized in providing customer support and scheduling appointments. 😊 "
    "You assist customers by answering their inquiries clearly and concisely, and you help them schedule appointments based on their needs. "
    "Please ensure the conversation is engaging and informative! 😄\n\n"
    "Context:\n"
    "#####################################\n"
    "{context_str}\n"
    "Answer the user's question: {query_str}\n\n"
    "If the question is related to our services or products, provide a detailed answer along with a summary. If the customer wants to schedule an appointment, "
    "assist them in finding a suitable time and book it for them.\n\n"
    "For appointment scheduling, please consider the following:\n"
    "- **Available Time Slots**: {available_times}\n"
    "- **Location**: {location}\n"
    "- **Required Information**: {required_info}\n\n"
    "However, if the question is unrelated to the services or scheduling, provide a direct and concise answer without any summary or extra details.\n\n"
    "Don't forget to invite the customer to schedule an appointment by highlighting the value of seeing the products in person and experiencing them firsthand. "
    "Encourage the customer to book an appointment at our office, mentioning that it's a great opportunity to get personalized advice and explore all available options. "
    "For example, you can say: 'Would you like to visit our office and see the products in person? We have some excellent time slots available this week!'"
)
qa_template = PromptTemplate(template=prompt_template)
chain_type_kwargs = {"prompt": qa_template}
response_synthesizer = get_response_synthesizer(
    llm=client, text_qa_template=qa_template, response_mode="compact"
)
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

In [1]:
def get_model_response(query):
    return query_engine.query(query)

In [42]:
response = query_engine.query("What is Dunder Mifflin?")
display(Markdown(str(response)))

Dunder Mifflin Paper Company is a regional supplier of high-quality paper and office products, primarily catering to small and mid-sized businesses. Founded in 1949 and headquartered in Scranton, Pennsylvania, Dunder Mifflin has built a strong reputation for personalized customer service and fostering strong client relationships. The company operates in a competitive market but stands out through its relationship-driven sales approach and a unique workplace culture that emphasizes employee engagement and humor.

Would you like to visit our office and see the products in person? We have some excellent time slots available this week! It’s a great opportunity to get personalized advice and explore all available options. Let me know if you’d like to schedule an appointment! 😊