In [29]:
import json
import os

import numpy as np
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

ModuleNotFoundError: No module named 'pyserini'

In [12]:
import json
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model; load_and_embed_dataset reads this global and calls
# model.encode(...) on it to embed the dataset texts.
# NOTE(review): a later cell rebinds the global name `model` to the result of
# AutoModel.from_pretrained(...). Re-running load_and_embed_dataset after that
# cell would call .encode on that other object — confirm, and consider giving
# the two models distinct names.
model = SentenceTransformer("all-MiniLM-L6-v2")

def truncate_text_to_bytes(text, max_bytes=40960, max_chars=15000):
    """Truncate text so that its UTF-8 encoding fits within a byte budget.

    The text is first capped at ``max_chars`` characters, then, if its UTF-8
    encoding is still larger than ``max_bytes``, cut at the byte budget without
    ever splitting a multi-byte character.

    Args:
        text (str | None): Text to truncate. ``None`` is treated as "".
        max_bytes (int): Maximum size of the result once UTF-8 encoded.
            Zero or negative values yield "".
        max_chars (int | None): Character cap applied before the byte check.
            ``None`` disables the character cap.
    Returns:
        str: A prefix of ``text`` whose UTF-8 encoding is at most ``max_bytes``
        bytes long.
    """
    # A non-positive budget can only be satisfied by the empty string; returning
    # early also avoids looping forever on a negative budget.
    if text is None or max_bytes <= 0:
        return ""

    candidate = text[:max_chars]
    encoded = candidate.encode('utf-8')
    if len(encoded) <= max_bytes:
        return candidate

    # Cut on the byte budget in one step. The cut may land in the middle of a
    # multi-byte character; errors='ignore' drops that trailing partial sequence
    # so the result is always valid text.
    return encoded[:max_bytes].decode('utf-8', errors='ignore')

def load_and_embed_dataset(dataset_path, prompt_field="prompt", completion_field="completion"):
    """
    Load a JSONL dataset, truncate its text fields, and embed each record.

    Every non-blank line is parsed as a JSON object. Its prompt and completion
    are truncated with ``truncate_text_to_bytes`` and the text
    "<prompt> <completion>" is embedded with the module-level ``model``.

    Args:
        dataset_path (str): Path to the UTF-8 JSONL file.
        prompt_field (str): Field name for the prompt in each JSON object.
        completion_field (str): Field name for the completion in each JSON object.
    Returns:
        embeddings (np.ndarray): One embedded vector per record (an empty array
            when the file contains no records).
        metadata (list): List of dictionaries with the truncated "prompt" and
            "completion" of each record, in file order.
    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print("Loading dataset and embedding texts...")

    texts_to_embed = []
    metadata = []

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. spacing between records).
            if not line.strip():
                continue
            data = json.loads(line)

            # `or ""` also covers fields that are present but null in the JSON.
            # NOTE(review): each field is truncated against its own byte budget;
            # if the vector store limits the *combined* metadata size, confirm
            # that prompt + completion together still fit.
            prompt = truncate_text_to_bytes(data.get(prompt_field) or "")
            completion = truncate_text_to_bytes(data.get(completion_field) or "")

            # Prompt and completion are embedded together as one text
            texts_to_embed.append(f"{prompt} {completion}")
            metadata.append({"prompt": prompt, "completion": completion})

    if texts_to_embed:
        # Encode all texts in a single call so the model can batch the work
        # instead of running one forward pass per record.
        embeddings = np.asarray(model.encode(texts_to_embed))
    else:
        embeddings = np.array([])
    print("Embedding and truncation complete.")

    return embeddings, metadata


In [13]:
def create_pinecone_index(index_name: str, dimension: int, metric: str = 'cosine', api_key: str = None):
    """
    Connect to Pinecone and create a serverless index if it does not exist yet.

    Args:
        index_name (str): Name of the index to look up or create.
        dimension (int): Vector dimension used when the index is created.
        metric (str): Similarity metric used when the index is created.
        api_key (str, optional): Pinecone API key. When omitted, the
            PINECONE_API_KEY environment variable is used.
    Returns:
        The Pinecone client used to look up / create the index.
    Raises:
        RuntimeError: If no API key is passed and PINECONE_API_KEY is not set.
    """
    # Credentials must not be stored in the notebook: anyone who can read the
    # file (or its history) can use them. The key that used to be hard-coded
    # here should be treated as leaked and rotated.
    api_key = api_key or os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No Pinecone API key available: pass api_key=... or set the "
            "PINECONE_API_KEY environment variable."
        )

    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=api_key)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    # Only create the index when it is missing, so re-running the cell is safe.
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    print("Done!")
    return pc

In [None]:
# from pyserini.index.lucene import SimpleIndexer
# from pyserini.search.lucene import LuceneSearcher

# # Add sparse indexing function
# def index_sparse_with_pyserini(jsonl_file, sparse_index_path, batch_size=1000):
#     """
#     Index documents for sparse retrieval (BM25) using Lucene (Pyserini).
    
#     Args:
#         jsonl_file: Path to the JSONL file containing documents.
#         sparse_index_path: Directory path for the Lucene sparse index.
#         batch_size: Number of documents to process in each batch.
#     """
#     os.makedirs(sparse_index_path, exist_ok=True)
#     indexer = SimpleIndexer(sparse_index_path)
    
#     with open(jsonl_file, 'r') as f:
#         for line in f:
#             entry = json.loads(line)
#             prompt = entry.get("prompt", "")
#             completion = entry.get("completion", "")
#             document = f"Prompt: {prompt}\nCompletion: {completion}"
#             doc_id = f"{hash(prompt)}_{hash(completion)}"
#             indexer.add_document(doc_id, document)
    
#     indexer.close()
#     print("Sparse indexing with BM25 completed.")

In [15]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: list,
        prompt_field: str = 'prompt',
        completion_field: str = 'completion',
        batch_size: int = 128
):
    """
    Upsert vectors to a Pinecone index with metadata from a custom dataset.

    Record ``i`` is stored under the id ``str(i)`` with ``embeddings[i]`` as its
    vector and the prompt/completion of ``dataset[i]`` as its metadata.

    Args:
        index: The index handle to write to; only its ``upsert(vectors=...)``
            method is used (in this notebook, the object from ``pc.Index(name)``).
        embeddings: The embeddings to upsert, one row per dataset item.
        dataset: The dataset list, where each item is a dictionary with prompt and completion
        prompt_field: The field name for the prompt text in each data item
        completion_field: The field name for the completion text in each data item
        batch_size: The number of vectors sent per upsert call (must be positive)

    Returns:
        The same index object that was passed in.

    Raises:
        ValueError: If ``embeddings`` and ``dataset`` differ in length, or if
            ``batch_size`` is not positive.
        KeyError: If a dataset item lacks the prompt or completion field.
    """
    total = len(embeddings)
    # Pairing by position is only meaningful when both sides have the same
    # length; otherwise trailing records would be dropped without any error.
    if total != len(dataset):
        raise ValueError(
            f"embeddings ({total}) and dataset ({len(dataset)}) must have the same length"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    print("Upserting the embeddings to the Pinecone index...")

    # Positional ids: re-running with the same data overwrites the same records.
    ids = [str(i) for i in range(total)]

    # Build all metadata up front so a malformed item fails before any write.
    meta = [
        {prompt_field: item[prompt_field], completion_field: item[completion_field]}
        for item in dataset
    ]

    # Upsert in batches of (id, vector, metadata) tuples
    for start in tqdm(range(0, total, batch_size)):
        stop = min(start + batch_size, total)
        batch = [
            # Send plain Python float lists rather than NumPy rows.
            (ids[i], np.asarray(embeddings[i]).tolist(), meta[i])
            for i in range(start, stop)
        ]
        index.upsert(vectors=batch)

    print("Upsert completed.")
    return index



In [None]:
# Input JSONL file, read line by line by load_and_embed_dataset.
file_path = 'Trump_DB.jsonl'
# Name of the Pinecone index used by the following cells.
index_name = "trumpdb"  # Ensure this name is lowercase and uses only letters, numbers, or dashes
# embeddings: array with one vector per record; metadata: the matching list of
# {"prompt", "completion"} dicts. Both are reused by the index-creation and
# upsert cells.
embeddings, metadata = load_and_embed_dataset(file_path)

print("done loading and embed data")

In [None]:
# The index dimension is taken from the embeddings computed above, so the two
# always agree.
dimension = embeddings.shape[1]  # length of the second axis of the embeddings array
# Returns the Pinecone client; the index is created only if it does not exist.
pc = create_pinecone_index(index_name, dimension)
# Handle used by the upsert and query code in the other cells.
index = pc.Index(index_name)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch 
from trump_agent import TrumpAgent
from biden_agent import BidenAgent
from eval_agent import EvalAgent
# Load the tokenizer and transformer used by embed_text to embed query text.
# NOTE(review): this rebinds the global `model`, which an earlier cell set to a
# SentenceTransformer and which load_and_embed_dataset uses via model.encode.
# After this cell runs, re-running load_and_embed_dataset would call .encode on
# this object instead — confirm, and consider a distinct name for one of them.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


def load_debate_data(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path (str): Path to the UTF-8 encoded JSON file.
    Returns:
        The decoded JSON document (whatever structure the file contains).
    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; decode it explicitly rather than with the platform
    # default encoding, which is not UTF-8 on every system.
    with open(file_path, 'r', encoding='utf-8') as file:
        debate_data = json.load(file)
    return debate_data

def embed_text(text):
    """Embed ``text`` into one vector using the module-level tokenizer and model.

    The vectors stored in the index come from ``SentenceTransformer.encode``,
    which for all-MiniLM-L6-v2 averages the token embeddings (ignoring padding)
    and L2-normalises the result. Queries must be embedded the same way to be
    comparable with the stored vectors, so this function reproduces that
    pooling instead of using the transformer's ``pooler_output``.

    Args:
        text (str): The text to embed.
    Returns:
        np.ndarray: 1-D, unit-length embedding of ``text``.
    """
    # Tokenize; inputs longer than the tokenizer's limit are truncated
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state  # (batch, tokens, dim)

    # Mean-pool over real tokens only: the attention mask zeroes out padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    sentence_embeddings = torch.nn.functional.normalize(summed / token_counts, p=2, dim=1)

    return sentence_embeddings[0].cpu().numpy()  # convert to numpy array

def query_index(question, index, top_k=3):
    """Return the "completion" metadata of the ``top_k`` matches for ``question``.

    Args:
        question (str): Query text; embedded with ``embed_text``.
        index: Index handle exposing ``query(vector=..., top_k=..., include_metadata=...)``.
        top_k (int): Number of matches requested from the index.
    Returns:
        list: The ``completion`` value of each match, in the order the index
        returned them. An empty list if an AttributeError occurs while querying.
    """
    question_vector = embed_text(question)

    try:
        # Ask the index for the nearest vectors together with their metadata
        result = index.query(
            vector=question_vector.tolist(),
            top_k=top_k,
            include_metadata=True,
        )
        completions = []
        for hit in result['matches']:
            completions.append(hit['metadata']['completion'])
    except AttributeError as e:
        # Best effort: report the problem and fall back to no context
        print(f"Error querying index: {e}")
        completions = []

    return completions



def generate_debate_response(question, trump_agent):
    """Produce the agent's answer using context retrieved for ``question``.

    The question is used to retrieve completions from the notebook-level
    ``index`` (a global, not a parameter). The retrieved completions, joined
    with newlines, are the only thing passed to the agent.

    Args:
        question (str): The debate question used for retrieval.
        trump_agent: Object exposing ``generate_response(context)``.
    Returns:
        Whatever ``trump_agent.generate_response`` returns.
    """
    # NOTE(review): the agent receives the retrieved context but not the
    # question itself — confirm that this is intended.
    retrieved = query_index(question, index)
    return trump_agent.generate_response("\n".join(retrieved))



In [None]:

# def query_index(question, index, sparse_searcher, top_k=3, dense_weight=0.5, sparse_weight=0.5):
#     """
#     Perform a hybrid search using both Pinecone dense retrieval and Pyserini sparse retrieval.
    
#     Args:
#         question: Query question string.
#         index: Pinecone dense index instance.
#         sparse_searcher: Pyserini sparse searcher instance (BM25).
#         top_k: Number of top results to retrieve.
#         dense_weight: Weight for dense retrieval scores.
#         sparse_weight: Weight for sparse retrieval scores.
    
#     Returns:
#         List of top completions based on hybrid scoring.
#     """
#     # Dense retrieval with Pinecone
#     query_embedding = embed_text(question)
#     dense_response = index.query(vector=query_embedding.tolist(), top_k=top_k, include_metadata=True)
#     dense_results = {match['id']: dense_weight * match['score'] for match in dense_response['matches']}
    
#     # Sparse retrieval with Pyserini
#     sparse_hits = sparse_searcher.search(question, k=top_k)
#     sparse_results = {hit.docid: sparse_weight * hit.score for hit in sparse_hits}
    
#     # Combine dense and sparse scores
#     combined_scores = {}
#     for doc_id, score in dense_results.items():
#         combined_scores[doc_id] = combined_scores.get(doc_id, 0) + score
#     for doc_id, score in sparse_results.items():
#         combined_scores[doc_id] = combined_scores.get(doc_id, 0) + score
    
#     # Sort by combined score and retrieve top completions
#     sorted_docs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
#     top_completions = [sparse_searcher.doc(doc_id).raw() for doc_id, _ in sorted_docs]
    
#     return top_completions


In [18]:
# Write every embedded record, together with its prompt/completion metadata,
# to the index created above. The function returns the index, which is why the
# cell displays the index object as its result.
upsert_vectors(index=index, embeddings=embeddings, dataset=metadata)


Upserting the embeddings to the Pinecone index...


100%|██████████| 32/32 [01:14<00:00,  2.33s/it]

Upsert completed.





<pinecone.data.index.Index at 0x2a4638616a0>

In [19]:
# Sanity check after the upsert: show the index dimension and vector counts.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4005}},
 'total_vector_count': 4005}


Processing question 1 for Biden...
Generating response from Biden agent...

Running model: gpt-4o-mini
Biden Generated Response: {'gpt-4o-mini': 'Thank you for the question. Look, I understand the anxiety that so many families are feeling right now. I really do. When I walk through communities, when I talk to folks just like you, I hear the worries—grocery bills that are stretched thin, housing costs that seem to go up overnight. It’s a heavy burden, and it’s one that weighs on me every single day.\n\nWhen I took office, we faced unprecedented challenges. The pandemic left our economy in shambles, and many people were struggling just to make ends meet. But we rolled up our sleeves and got to work. The thing is, we didn’t just throw a blanket over these problems. The bipartisan Infrastructure Law, for instance, isn’t just about roads and bridges; it’s about creating good-paying jobs and ensuring that when you work hard, you can build a decent life for yourself and your family. And let’

KeyboardInterrupt: 