In [8]:
!pip install langchain_community langchain_pinecone langchain_openai unstructured langchainhub langchain_text_splitters




In [9]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import glob


In [11]:
from langchain.schema import Document
sample_docs = [
    Document(page_content="Q: How do I request access to SAP?\nA: You need to submit a request via the SAP Access Portal and get approval from your manager."),
    Document(page_content="Q: How do I reset my SAP password?\nA: Use the SAP Password Reset tool or contact IT support for assistance."),
    Document(page_content="Q: What are the required trainings before accessing SAP?\nA: Complete the mandatory SAP onboarding e-learning modules assigned by HR."),
    Document(page_content="Q: Who do I contact for SAP issues?\nA: Contact the SAP Support Desk via email or internal ticketing system.")
]


In [12]:
from pinecone import Pinecone, ServerlessSpec


pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
index_name = 'sap-onboarding-faq'


# Create index if not exists
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=3072,  # matches OpenAI embedding dimension
        metric='euclidean',
        deletion_protection='enabled',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )


index = pc.Index(index_name)
print(index.describe_index_stats())


{'dimension': 3072,
 'index_fullness': 0.0,
 'metric': 'euclidean',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [13]:
# Initialize embeddings and the vector store
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-large'
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
split_docs = text_splitter.split_documents(sample_docs)


# Create a vector store for the documents using the specified embeddings
vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name)

In [14]:
query = "How can I reset my SAP password?"
results = vectorstore.similarity_search(query, k=3)


In [15]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
index_name = "sap-onboarding-faq4"

# Connect to the existing index
index = pc.Index(index_name)
print("Index stats:", index.describe_index_stats())


Index stats: {'dimension': 3072,
 'index_fullness': 0.0,
 'metric': 'euclidean',
 'namespaces': {'': {'vector_count': 8}},
 'total_vector_count': 8,
 'vector_type': 'dense'}
