# Create Pinecone Index

In [5]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv) and read the Pinecone API key from them.
load_dotenv()
RAG_PINECONE_API_KEY = os.getenv("RAG_PINECONE_API_KEY")

pc = Pinecone(api_key=RAG_PINECONE_API_KEY)
index_name = "lore-bot"

# Create the index only when it is missing; otherwise just report that it is already there.
if pc.has_index(index_name):
    print("Pinecone index already exists")
else:
    # dimension=768 has to equal the length of the vectors that get upserted into this index.
    pc.create_index(
        name=index_name,
        vector_type="dense",
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        deletion_protection="disabled",
        tags={"environment": "development"},
    )

Pinecone index already exists


In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone

# Build a client handle for this cell, then show every index it can see.
pc = Pinecone(api_key=RAG_PINECONE_API_KEY)
index_list = pc.list_indexes()
print(index_list)

# Chunking Logic

In [None]:
import pandas as pd

# Read the lore CSV and discard its 'Unnamed: 4' column.
df = pd.read_csv('data/league_lore_df.csv')
dataset = df.drop(columns=['Unnamed: 4'])

# Dump the whole frame as plain text (to_string does not truncate rows or columns).
print(dataset.to_string())

In [None]:
# Print the lore text of every row, followed by blank lines as a separator.
for lore_text in dataset['Lore']:
    print(lore_text)
    print("\n")

In [None]:
# Display the lore table (after the 'Unnamed: 4' column was dropped).
dataset

In [7]:
# testSet contains only the last champion (Zyra) entry as its entire dataset.
# The final row is taken directly instead of dropping index labels 0..163 one at a
# time, so this keeps working when the number of rows in the CSV changes.
# .copy() makes testSet independent of `dataset`.
testSet = dataset.tail(1).copy()
testSet

Unnamed: 0,Link,Champion,Region,Lore
164,https://universe.leagueoflegends.com/en_AU/sto...,zyra,Ixtal,"Zyra’s memory is long, and runs as deep as the..."


In [8]:
# Upsert Debugging / Testing
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

# The same embedding model drives SemanticChunker and produces the stored vectors.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
text_splitter = SemanticChunker(embedding_model)

index = pc.Index("lore-bot")
unique_id = 0  # running counter; every stored chunk gets the next integer (as a string) as its id

# Walk the test dataset one row (a.k.a. champion) at a time.
for _, row in testSet.iterrows():
    lore = row['Lore']

    # pandas reads an empty CSV cell as NaN (a float), which cannot be split; skip such rows.
    if not isinstance(lore, str) or not lore.strip():
        continue

    # Chunk the champion's lore using SemanticChunker.
    chunks = text_splitter.split_text(lore)
    if not chunks:
        # Nothing to embed or store for this champion.
        continue

    # Embed all of this champion's chunks in a single call rather than one call per chunk.
    vectors = embedding_model.embed_documents(chunks)

    records = []
    for chunk, vector in zip(chunks, vectors):
        unique_id += 1
        records.append({
            "id": str(unique_id),
            "values": vector,
            # Row metadata is stored beside the vector so a query hit can return the text.
            "metadata": {
                "champion": row['Champion'],
                "region": row['Region'],
                "link": row['Link'],
                "lore": chunk,
            },
        })

    # One upsert request per champion instead of one network round trip per chunk.
    index.upsert(vectors=records)

copy.py test

In [17]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_query_embedder():
    """Create the BAAI/bge-base-en-v1.5 embedding model once and reuse it on later calls."""
    return HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")


def generate_response(query: str) -> str:
    """
    Retrieval: return the stored lore chunk that best matches ``query``.

    The query is embedded with the same model that embeds the lore vectors and is
    looked up in the "lore-bot" index through the module-level Pinecone client ``pc``.

    Args:
        query: The question to search the lore for.

    Returns:
        The ``lore`` metadata text of the single closest match, or an empty string
        when the index returns no matches.
    """
    index = pc.Index("lore-bot")

    # Embed the query; the model is cached so repeated calls do not reload it.
    query_vector = _load_query_embedder().embed_query(query)

    # Only the closest vector is requested (top_k=1), together with its metadata.
    results = index.query(vector=query_vector, top_k=1, include_metadata=True)

    matches = results['matches']
    if not matches:
        # An index with nothing to match would otherwise raise IndexError below.
        return ""

    return matches[0]['metadata']['lore']

In [20]:
# Smoke-test the retrieval function with a question about Zyra.
generate_response("Tell me about Zyra")

'Zyra’s memory is long, and runs as deep as the roots of the earth. Her kind was young when the Rune Wars raged, when mortal armies fought one another for the very keys of creation. Hidden in the jungles south of Kumungu, somewhere between the great rivers that divide eastern Shurima, lay the fabled Gardens of Zyr. Elemental magics had turned the soil there in strange and unpredictable ways, giving rise to fierce, carnivorous plants that preyed upon any creature that strayed within reach. They infested and they devoured, caring nothing for the squabbles of mortals, content merely to coil their vines through the forests and swamplands. In their own way, they were all Zyra… and nourishment was plentiful, even in the midst of war. A small company of soldiers, their allegiance long since lost to time, advanced through those lands in search of some now-forgotten prize. They were led by an ambitious sorceress—but they were far from home, bound to succumb to the noxious fumes and spores of th

end of test

In [9]:
# The question, embedded with the same model that embeds the lore vectors.
query = "Who is Zyra"
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
query_vector = embedding_model.embed_query(query)

# Query the database for the five stored vectors most similar to the query vector,
# asking for their metadata as well.
index = pc.Index("lore-bot")
results = index.query(vector=query_vector, top_k=5, include_metadata=True)

In [None]:
# Display the raw query response returned by index.query.
results

In [10]:
# Collect the lore text carried in each match's metadata, in the order the matches were returned.
retrieved_chunks = [hit['metadata']['lore'] for hit in results['matches']]

In [None]:
# Show the list of lore chunks pulled out of the query matches.
print(retrieved_chunks)

# Prompt Engineering

In [13]:
# Join the lore chunks with a blank line between them so the LLM can tell where one
# chunk ends and the next begins.
context = "\n\n".join(retrieved_chunks)

# Instruction, retrieved context, the question, then an open "Answer:" for the model to complete.
prompt = (
    "Answer the following question about League of Legends champion lore based on the provided context. Be accurate and concise.\n"
    "\n"
    f"Context:{context}\n"
    "\n"
    f"Question: {query}\n"
    "\n"
    "Answer:"
)

In [None]:
# Show the fully assembled prompt that will be sent to the generator.
print(prompt)

In [None]:
from transformers import pipeline

# BAAI/bge-base-en-v1.5 is an embedding (encoder) model and has no trained text-generation
# head, so it cannot write an answer. Generation uses the instruction-tuned
# sequence-to-sequence model google/flan-t5-base instead.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

# Generate the answer; truncation=True clips a prompt that exceeds the model's input length.
response = generator(prompt, max_new_tokens=512, truncation=True)
print(response[0]['generated_text'])


In [None]:
# context_chunks = []
# if results['matches']:
#     first_vector = results['matches'][0]['metadata']['lore']
#     context_chunks.append(first_vector)


# """
# Prompt Engineering
# """
# # Separate each lore chunk using \n\n so the LLM can understand where one chunk ends and where one begins

# context = "\n\n".join(retrieved_chunks)

# prompt = f"""Answer the following question about League of Legends champion lore based on the provided context. Be accurate and concise.

# Context:{context}

# Question: {query}

# Answer:"""

# generator = pipeline("text-generation", model="BAAI/bge-base-en-v1.5", max_length=512, truncation=True)

# # Generate the answer
# response = generator(prompt, max_new_tokens=512)
# return response[0]['generated_text']

# """
# Generation
# """

# ###
# # Initialize Flan-T5 for generation (best for lore Q&A)
# # Use flan-t5-large for better quality
# # Adjust based on your character limit
# # Lower temperature for more factual responses
# ###
# generator = pipeline(
#     "text2text-generation",
#     model="google/flan-t5-base",  
#     max_length=300,  
#     do_sample=True,
#     temperature=0.3,  
#     early_stopping=True
# )

# # Step 4: Generate response
# response = generator(prompt, max_length=200, num_return_sequences=1)
# return response[0]['generated_text']