In [1]:
pip install pinecone-client[grpc] transformers datasets torch sentence-transformers




In [2]:
import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import os

In [15]:
# Step 1: Read the knowledge-item CSV into a DataFrame
file_path = '/content/synthetic_knowledge_items.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [16]:
# Inspect the first rows of the data. The printed output shows columns such as
# 'ki_topic', 'ki_text' and 'alt_ki_text'; 'ki_text' is the column that is
# embedded as the retrieval context later in the notebook.
print(data.head())

                                       ki_topic  \
0  Setting Up a Mobile Device for Company Email   
1                     Resetting a Forgotten PIN   
2     Configuring VPN Access for Remote Workers   
3  Troubleshooting Issues with Microsoft Office   
4   Setting Up a Conference Call on Cisco Webex   

                                             ki_text  \
0  **Setting Up a Mobile Device for Company Email...   
1  **Resetting a Forgotten PIN**\n\nIf you have f...   
2  **Configuring VPN Access for Remote Workers**\...   
3  **Troubleshooting Issues with Microsoft Office...   
4  To set up a conference call on Cisco Webex, fo...   

                                         alt_ki_text  \
0  To set up a mobile device for company email, f...   
1  If you have forgotten your PIN, you can reset ...   
2  To configure VPN access for remote workers at ...   
3  When troubleshooting issues with Microsoft Off...   
4  To set up a conference call on Cisco Webex, fo...   

                   

In [5]:
import os
from getpass import getpass

# Keep the real key out of the notebook source: reuse a key that is already
# exported in the environment, otherwise prompt for it without echoing it.
# The original unconditional assignment overwrote any real key with a literal.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

In [6]:
# Placeholder key kept as a plain module-level constant.
PINECONE_API_KEY = "your_key"

In [13]:
# Step 2: Initialize Pinecone client
import time

# The API key is read from the PINECONE_API_KEY environment variable.
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "dataset"

# Create the index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors that are upserted into the index.
        dimension=384,
        metric="cosine",
        # Referenced through the already-imported `pinecone` module: the bare
        # name `ServerlessSpec` was never imported and raised NameError here.
        spec=pinecone.ServerlessSpec(
            cloud="aws",  # Specify the cloud provider
            region="us-west-2"  # Specify the region
        )
    )
    # A freshly created index is not usable immediately; poll until Pinecone
    # reports it as ready before any cell tries to write to it.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Connect to the index
index = pc.Index(index_name)

In [14]:
import os

from pinecone import Pinecone

# Read the key from the environment instead of embedding it in the notebook;
# indexing os.environ fails loudly with KeyError when the variable is missing.
# NOTE: the key that used to be hard-coded on this line has been exposed in the
# notebook source and must be revoked/rotated in the Pinecone console.
pinecone_client = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
print(pinecone_client.list_indexes().names())


['rag-index', 'business-qa-index', 'qa-model', 'qa', 'dataset']


In [18]:
# Step 3: Preprocess and Embed Contexts
# Use SentenceTransformer for embedding
embedder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Extract the context from the ki_text column (this contains the relevant content)
contexts = data["ki_text"].tolist()

# Generate embeddings for the contexts
context_embeddings = embedder.encode(contexts, batch_size=32, show_progress_bar=True)

# Build one (id, values, metadata) record per context. The id is the row
# position as a string, so re-running the cell overwrites the same records
# instead of adding duplicates.
vectors = [
    (str(i), embedding.tolist(), {"context": context})
    for i, (context, embedding) in enumerate(zip(contexts, context_embeddings))
]

# Upload embeddings to Pinecone in batches: one request per batch instead of
# one request per vector.
UPSERT_BATCH_SIZE = 100
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors[start:start + UPSERT_BATCH_SIZE])

print(f"Uploaded {len(context_embeddings)} contexts to Pinecone.")


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Uploaded 100 contexts to Pinecone.


In [19]:
# Step 4: Define the Retrieval-Augmented Generation (RAG) Pipeline
# Retriever: embed the query, search the index, return the stored texts
def retrieve(query, top_k=5):
    """Return the stored context strings of the `top_k` best index matches.

    The query is encoded with the module-level `embedder`, the resulting
    vector is sent to the module-level `index`, and the "context" metadata
    entry of every returned match is collected in match order.
    """
    # encode() receives a one-element batch, so the first row is the query
    encoded_batch = embedder.encode([query])
    query_vector = encoded_batch[0].tolist()

    # Metadata must be requested explicitly, it carries the context text
    search_args = dict(vector=query_vector, top_k=top_k, include_metadata=True)
    response = index.query(**search_args)

    matched_contexts = []
    for match in response["matches"]:
        matched_contexts.append(match["metadata"]["context"])
    return matched_contexts

In [20]:
# Generator: Hugging Face text2text-generation pipeline running t5-small
generator = pipeline(task="text2text-generation", model="t5-small")

# Step 5: Ask Questions and Generate Answers
def rag_pipeline(question, top_k=5):
    """Answer `question` from the passages returned by `retrieve`.

    The `top_k` retrieved passages are joined with single spaces, wrapped in a
    "question: ... context: ..." prompt and passed to the module-level
    `generator`; the "generated_text" of its first result is returned.
    """
    # Retrieval: fetch the passages and flatten them into one context string
    combined_context = " ".join(retrieve(question, top_k=top_k))

    # Generation: build the prompt and keep only the first generated answer
    input_text = f"question: {question} context: {combined_context}"
    generations = generator(input_text, max_length=50, truncation=True)
    first_generation = generations[0]
    return first_generation["generated_text"]


In [21]:
# Step 6: Try the RAG pipeline on a forgotten-password question
question = "I forgot my password,what should i do now?"
answer = rag_pipeline(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: I forgot my password,what should i do now?
Answer: reset your password


In [22]:
# Step 6: Try the RAG pipeline on a VPN set-up question
question = "How do I set up VPN access on my laptop?"
answer = rag_pipeline(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: How do I set up VPN access on my laptop?
Answer: Click on the "Connect" button to establish the VPN connection


In [45]:
# Step 6: Try the RAG pipeline on a conference-call question
question = "How do I set up a conference call on Cisco Web?"
answer = rag_pipeline(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: How do I set up a conference call on Cisco Web?
Answer: Log in to Cisco Webex** Open a web browser and navigate to [company Webex URL]. Enter your company login credentials to access the Webex portal


In [24]:
# Step 6: Try the RAG pipeline on a network-security question
question = "How to setup a secure network?"
answer = rag_pipeline(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: How to setup a secure network?
Answer: plan your network infrastructure by identifying the number of users, devices, and access points required
