In [3]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm


import ast
import os
import pandas as pd

In [7]:
from dotenv import load_dotenv
import os
env_path = r'E:\YTReusable\.env'
load_dotenv(env_path)
# Access variables from the .env file
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [9]:
openai_client = OpenAI()
pinecone = Pinecone()

In [11]:
# Create or connect to Pinecone index
index_name = 'rag-example7'
if index_name not in pinecone.list_indexes():
   pinecone.create_index(name=index_name, dimension=1536, metric='cosine',spec=ServerlessSpec(cloud='aws', region='us-west-2'))

In [13]:
index = pinecone.Index(index_name)

In [15]:
# Sample Data: A small set of documents or FAQs (can be imported easily)
data = {
    "questions": [
        "What is AI?",
        "What is Pinecone?",
        "How do embeddings work?",
        "What is GPT?",
        "What is machine learning?"
    ],
    "answers": [
        "AI is the simulation of human intelligence in machines.",
        "Pinecone is a vector database that enables efficient search and retrieval.",
        "Embeddings are a numerical representation of text that captures semantic meaning.",
        "GPT is a transformer-based model that generates human-like text.",
        "Machine learning is a subset of AI where machines learn from data to make decisions."
    ]
}
df = pd.DataFrame(data)

In [17]:
df.head()

Unnamed: 0,questions,answers
0,What is AI?,AI is the simulation of human intelligence in ...
1,What is Pinecone?,Pinecone is a vector database that enables eff...
2,How do embeddings work?,Embeddings are a numerical representation of t...
3,What is GPT?,GPT is a transformer-based model that generate...
4,What is machine learning?,Machine learning is a subset of AI where machi...


In [19]:
# Function to get OpenAI embeddings
def get_embeddings(text):
    response = openai_client.embeddings.create(input=[text], model="text-embedding-ada-002")
    return response.data[0].embedding  # Correctly access the embedding

In [21]:
# Insert data into Pinecone
def insert_data_to_pinecone(df):
    vectors = []
    for idx, row in df.iterrows():
        # Get the embedding for the answer text using OpenAI API
        response = openai_client.embeddings.create(input=[row['answers']], model="text-embedding-ada-002")
        embedding = response.data[0].embedding
        
        
        embedding_list = list(embedding)  # Explicitly convert to a list
        print(embedding_list)
        vectors.append((str(idx), embedding_list, {'text': row['answers']}))  
    
    # Upsert data to Pinecone
    index.upsert(vectors=vectors)


In [25]:
insert_data_to_pinecone(df)
index.describe_index_stats()

[-0.02838454581797123, -0.020946109667420387, -0.006102150771766901, -0.012171387672424316, -0.004871188197284937, 0.012026568874716759, -0.010874598287045956, 0.030017051845788956, -0.010966756381094456, -0.021459558978676796, 0.005493252072483301, 0.02856886014342308, 0.006177851464599371, 0.0017690970562398434, -0.007793901022523642, -0.0020998769905418158, 0.018918640911579132, -0.0060198670253157616, 0.02917446754872799, -0.009459320455789566, -0.01102600060403347, 0.022762930020689964, 0.0004566738789435476, -0.021433228626847267, -0.009650218300521374, -0.004301785957068205, 0.015324494801461697, -0.04631578177213669, 0.008445587009191513, -0.014060618355870247, 0.018023395910859108, -0.005529456771910191, -0.0004743648460134864, -0.014626730233430862, -0.014534572139382362, 0.006395080126821995, -5.212150790612213e-05, -0.003452619072049856, 0.0011116511886939406, 0.003060949267819524, 0.018589507788419724, 0.012829656712710857, -0.007240955252200365, -0.011091827414929867, -0.

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [27]:
# Function to retrieve the most similar document from Pinecone
def retrieve_relevant_documents(query, top_k=3):
    query_embedding = get_embeddings(query)
    
    # Ensure the query embedding is a list of floats (not ndarray)
    query_embedding = list(query_embedding)  # Convert to list if necessary
    
    # Query Pinecone using the correct keyword arguments
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    return results['matches']


In [31]:
# Function to generate a rich response using GPT-3 based on retrieved documents
def generate_answer(query):
    # Step 1: Retrieve relevant documents from Pinecone
    matches = retrieve_relevant_documents(query)
    
    
    context = "\n".join([match['metadata'].get('text', 'No text found') for match in matches])
    
    
    
    messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"Here are some documents related to your question:\n{context}\n\nQuestion: {query}"},
    ]
    print(messages)
    # Step 4: Use GPT-3.5-turbo to generate a response based on the context and question
    response = openai_client.chat.completions.create(
        model="gpt-4-turbo",  
        messages=messages,      
        max_tokens=500,
        temperature=0.7        
    )
    
    return response.choices[0].message.content

In [35]:
# Test the RAG system
query = "What is machine learning - please explain in 300 wo"
answer = generate_answer(query)
print(f"Question: {query}\nAnswer: {answer}")

[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Here are some documents related to your question:\nMachine learning is a subset of AI where machines learn from data to make decisions.\nAI is the simulation of human intelligence in machines.\nGPT is a transformer-based model that generates human-like text.\n\nQuestion: What is machine learning?'}]
Question: What is machine learning?
Answer: Machine learning is a subset of AI where machines learn from data to make decisions.
