In [None]:
import chromadb
# Create the Chroma client that every Chroma cell below goes through.
# NOTE(review): Client() is called with no arguments, so no storage path is
# configured here — presumably the data only lives for this kernel session; confirm.
chroma_client = chromadb.Client()

In [None]:
# Idempotent on re-run: create_collection raises when "my_collection" already
# exists in this client, which happens whenever this cell is executed twice
# without the delete cell in between. get_or_create_collection returns the
# existing collection instead.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [None]:
# Insert the documents, their metadata and their ids into the collection.
# NOTE(review): `documents`, `metadatas` and `ids` are not defined in any cell
# visible here — they must already exist in the kernel before this cell runs,
# otherwise it raises NameError on a fresh "Restart & Run All".
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

In [None]:
chroma_client.delete_collection(name="my_collection")

In [None]:
results = collection.query(
    query_texts=["Which champions are from Shurima"],
    n_results=1,
    include=['documents']
)

results

In [None]:
#CHUNKING LOGIC
import csv
from time import sleep
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
import os
from dotenv import load_dotenv

index_name = "lolbot"

# Load .env first so the API key comes from the environment and never has to
# be written into the notebook.
load_dotenv()
RAG_PINECONE_API_KEY = os.getenv("RAG_PINECONE_API_KEY")

pc = Pinecone(api_key=RAG_PINECONE_API_KEY)
sleep(1)  # NOTE(review): pause kept from the original; its purpose is not evident here
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the length of the vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Index creation is asynchronous: block until the index reports ready so
    # the upsert/query cells that follow do not hit an index that is not
    # serving yet.
    while not pc.describe_index(index_name).status["ready"]:
        sleep(1)
else:
    print("Pinecone index already exists")

# newline='' hands line-ending handling to the csv module, so line breaks that
# sit inside quoted fields are parsed as part of the field instead of being
# translated by the text layer first.
with open('league_lore2.csv', 'r', encoding='utf8', newline='') as file:
    csv_reader = csv.DictReader(file)
    # One dict per CSV row, keyed by the header line.
    championdata = list(csv_reader)

# Chunking parameters handed to chunk_section.
chunk_size = 300
chunk_overlap = 20

def chunk_section(championdata, chunk_size, chunk_overlap):
    """Split every champion's lore into chunk records.

    Args:
        championdata: Iterable of mappings providing the keys "Lore", "Link",
            "Champion" and "Region".
        chunk_size: Passed to the text splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Passed to the text splitter as ``chunk_overlap``.

    Returns:
        A flat list of dicts with the keys "Link", "Lore", "Champion" and
        "Region" — one dict per chunk, in the order the champions were given.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )

    records = []
    for entry in championdata:
        lore = entry["Lore"]
        entry_metadata = {
            "Link": entry["Link"],
            "Champion": entry["Champion"],
            "Region": entry["Region"],
        }
        documents = splitter.create_documents(
            texts=[lore],
            metadatas=[entry_metadata],
        )
        # Flatten each split document back into a plain dict record.
        for document in documents:
            records.append({
                "Link": entry["Link"],
                "Lore": document.page_content,
                "Champion": document.metadata["Champion"],
                "Region": document.metadata["Region"],
            })

    return records


In [None]:
from sentence_transformers import SentenceTransformer

# Turn the loaded champion rows into chunk records
chunks = chunk_section(championdata, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Instantiate the sentence-embedding model used by embed_texts
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Encode a list of strings with the module-level `embedder`
def embed_texts(texts):
    """Return the embeddings of ``texts`` as a tensor produced by ``embedder.encode``."""
    return embedder.encode(texts, convert_to_tensor=True)

# Embed the chunked texts
chunk_texts = [chunk["Lore"] for chunk in chunks]
embeddings = embed_texts(chunk_texts)

# Print to debug the embedding dimension
print(f"Embedding dimension: {embeddings.shape[1]}")

index = pc.Index(index_name)

# Convert the whole tensor to nested lists once instead of once per row.
embedding_rows = embeddings.tolist()

# Prepare data for insertion into Pinecone: one record per chunk, with the
# chunk text and its source fields stored as metadata so a query can return
# the lore directly.
pinecone_data = [
    {
        "id": f"{chunk['Champion']}_{i}",  # Unique ID for each chunk
        "values": values,
        "metadata": {
            "Link": chunk["Link"],
            "Champion": chunk["Champion"],
            "Region": chunk["Region"],
            "Lore": chunk["Lore"],
        },
    }
    for i, (chunk, values) in enumerate(zip(chunks, embedding_rows))
]

# Debug to check the first item to be upserted (skipped when there is nothing to upsert)
if pinecone_data:
    print("First item to be upserted:", pinecone_data[0])

# Insert data into Pinecone in bounded batches: a single request carrying
# every vector plus its metadata can exceed the service's per-request limits.
upsert_batch_size = 100
for batch_start in range(0, len(pinecone_data), upsert_batch_size):
    batch = pinecone_data[batch_start:batch_start + upsert_batch_size]
    try:
        index.upsert(vectors=batch, namespace='test')
    except Exception as e:
        print(f"Error during upsert: {e}")
        # Report where the run stopped so the index state is not a mystery.
        print(f"Upsert stopped at record offset {batch_start}")
        break

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Function to query Pinecone and retrieve relevant chunks
def query_pinecone(index, query_text, top_k=5, namespace='test'):
    """Embed ``query_text`` and return the nearest stored chunks.

    Args:
        index: Pinecone index handle to query.
        query_text: Natural-language question to embed.
        top_k: Number of matches to request.
        namespace: Namespace to search. Defaults to 'test', the namespace the
            upsert cell writes to; without it the query targets the default
            namespace and finds none of the upserted vectors.

    Returns:
        The query result from ``index.query`` with metadata included.
    """
    # Embed the query with the same model that embedded the stored chunks.
    query_embedding = embed_texts([query_text])[0].tolist()
    return index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,
    )

# Load the GPT-2 tokenizer and language model from the same checkpoint name
gpt2_checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
model = GPT2LMHeadModel.from_pretrained(gpt2_checkpoint)

# Function to generate a response from the GPT-2 model
def generate_response(context, query, model, tokenizer, max_new_tokens=150):
    """Generate an answer to ``query`` grounded in ``context``.

    Args:
        context: Retrieved text placed in the prompt ahead of the question.
        query: The user's question.
        model: Causal language model exposing ``generate`` (and, optionally,
            ``config.n_positions`` for its context window).
        tokenizer: Tokenizer matching ``model``; called on the prompt and used
            to decode the generated ids.
        max_new_tokens: Number of tokens to generate beyond the prompt.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    input_text = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    # Calling the tokenizer (instead of .encode) also yields the attention mask.
    encoded = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # The prompt plus the generated tokens must fit the model's context window.
    # Trim from the left so the question and the trailing "Answer:" cue survive
    # and only the oldest context is dropped.
    max_positions = getattr(getattr(model, "config", None), "n_positions", 1024)
    max_prompt_tokens = max(1, max_positions - max_new_tokens)
    if input_ids.shape[-1] > max_prompt_tokens:
        input_ids = input_ids[:, -max_prompt_tokens:]
        attention_mask = attention_mask[:, -max_prompt_tokens:]

    # Generate response
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # max_length would count the prompt too, leaving no room to generate
        # once the retrieved context is longer than the limit.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # top_k / top_p / temperature only take effect when sampling is on.
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        # GPT-2 defines no pad token; reuse EOS for padding.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated ids: splitting the full text on
    # "Answer:" picks the wrong segment when the context contains that marker.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Question used for the end-to-end demo
query_text = "Who is the darkin blade."

# Fetch the closest lore chunks for the question
results = query_pinecone(index, query_text)

# Join the lore text of every match into one context string
combined_context = "\n\n".join(
    match['metadata']['Lore'] for match in results['matches']
)

# Ask the language model to answer from that context and show the result
response = generate_response(combined_context, query_text, model, tokenizer)
print(response)

In [None]:
# TEST CHUNKING SIZE AND OVERLAP
import csv

# Define the chunk size and overlap
chunk_size = 300
chunk_overlap = 20

# Open and read the CSV file.
# newline='' hands line-ending handling to the csv module, so line breaks
# inside quoted fields are parsed as part of the field.
# NOTE(review): this cell reads 'league_lore.csv' while the chunking-logic cell
# reads 'league_lore2.csv', and it rebinds `championdata`, `chunk_size` and
# `chunk_overlap` — confirm which file is intended before relying on either.
with open('league_lore.csv', 'r', encoding='utf8', newline='') as file:
    csv_reader = csv.DictReader(file)
    championdata = list(csv_reader)

# Call the chunk_section method
chunks_result = chunk_section(championdata, chunk_size, chunk_overlap)

# Print the resulting chunks
for chunk in chunks_result:
    print(chunk)
