In [1]:
import chromadb
import json
from tqdm import tqdm
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.cuda.amp import autocast
import os
import csv
import pandas as pd

In [2]:
# Models under evaluation. The experiment cell iterates over the items of this
# mapping: the key is used as the label in log lines and output file names, the
# value is the model path handed to `from_pretrained`.
models_to_compare = dict([
    ('openai-community', 'gpt2-medium'),
    # ('FacebookAI', 'roberta-base'),
])

In [3]:
# Fall back to a placeholder Hugging Face token only when the environment does
# not already define one, so a real token exported before starting Jupyter is
# not overwritten. Do not paste a real token into the notebook; export
# HUGGINGFACE_TOKEN in the shell instead.
os.environ.setdefault("HUGGINGFACE_TOKEN", "<YOUR_TOKEN_HERE>")

In [4]:
# Create the Chroma client, then fetch (or create on first use) the collection
# named "documents" that the retrieval helpers in this notebook operate on.
# The "hnsw:M" value is passed through as collection metadata.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(
    metadata={"hnsw:M": 200},
    name="documents",
)

# Function to add documents to the collection
def add_documents_to_chromadb(documents, collection, batch_size=256):
    """Add `documents` to `collection`, using each document's position as its id.

    Documents are sent in batches instead of one `collection.add` call per
    document; the ids ("0", "1", ...) and stored texts are the same as with
    one-by-one insertion.

    Args:
        documents: Iterable of document strings. An empty iterable is a no-op.
        collection: Object exposing `add(ids=..., documents=...)`.
        batch_size: Maximum number of documents per `add` call.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise once so generators are supported and slicing is possible.
    documents = list(documents)
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        collection.add(
            ids=[str(doc_id) for doc_id in range(start, start + len(batch))],
            documents=batch,
        )
        

In [5]:
# Root folder of the chunked corpus; other cells join it with str(size) to
# reach the sub-folder for one chunk size.
chunked_documents_dir = '/home/ubuntu/Desktop/capstone/chunked_data'
# Chunk sizes under test: 128, 256, 512, 1024 and 2048 (2**7 through 2**11).
chunk_sizes = [2 ** exponent for exponent in range(7, 12)]

In [6]:
def retrieve_documents(query, collection, num_docs, n_results=1):
    """Query `collection` with `query` and return the matching documents as a flat list.

    Args:
        query: Query text, sent as the only entry of `query_texts`.
        collection: Object exposing `query(query_texts=..., n_results=...)` and
            returning a mapping with a 'documents' entry (one list per query).
        num_docs: Number of candidate documents the caller placed in the
            collection. Kept for call compatibility; it does NOT size the
            query, use `n_results` for that.
        n_results: How many top matches to request. Defaults to 1, which is the
            value that used to be hard-coded here.

    Returns:
        A flat list of the retrieved documents; empty when nothing came back.

    Raises:
        ValueError: If `n_results` is smaller than 1.
    """
    if n_results < 1:
        raise ValueError("n_results must be at least 1")

    results = collection.query(query_texts=[query], n_results=n_results)
    # 'documents' holds one list per query text; tolerate a missing/None entry.
    documents_per_query = results.get('documents') or []
    return [doc for docs in documents_per_query for doc in (docs or [])]

In [7]:
def reset_and_add_documents(correct_doc, all_docs, num_random_docs, collection, seed):
    """Rebuild `collection` so it holds `correct_doc` followed by random documents.

    The collection is emptied, the module-level `random` generator is re-seeded
    with `seed`, and `num_random_docs` documents drawn from `all_docs` are
    stored after `correct_doc`.

    Note: the sample is taken from `all_docs` as given, so `correct_doc` can be
    drawn again if it is also present in `all_docs`.
    """
    clear_collection(collection)
    random.seed(seed)
    sampled_docs = random.sample(all_docs, num_random_docs)
    add_documents_to_chromadb([correct_doc, *sampled_docs], collection)

In [8]:
def clear_collection(collection):
    """Delete every entry currently stored in `collection`.

    The ids are read with `collection.get()`; `delete` is only called when at
    least one id came back.
    """
    stored_ids = collection.get().get('ids', [])
    if len(stored_ids) == 0:
        return
    collection.delete(ids=stored_ids)
    

In [9]:
def _build_rag_prompt(context, query):
    """Format the retrieved context and the question into the model prompt."""
    # Adjacent string literals instead of a backslash continuation inside the
    # string, which used to embed the source indentation in the prompt.
    return (
        f"Context:\n{context}\n\nQuestion: {query}\n\n"
        "Please provide a detailed answer to the question above based on the context provided."
    )


def generate_answer_with_context(model, tokenizer, query, retrieved_docs, max_new_tokens=500, max_length=4000):
    """Generate an answer to `query` from the retrieved documents.

    The documents are joined with blank lines into one context, wrapped in the
    prompt template and passed to `model.generate`. If the tokenized prompt
    would exceed `max_length - max_new_tokens` tokens, the END of the context
    is cut so that the question and the instruction are kept.

    Args:
        model: Causal LM exposing `generate(...)` and a `device` attribute.
        tokenizer: Tokenizer matching `model`; must be callable, provide
            `decode` and `eos_token_id`.
        query: The question to answer.
        retrieved_docs: List of context strings.
        max_new_tokens: Maximum number of tokens to generate.
        max_length: Total token budget (prompt + generated tokens). Pass the
            model's context window here.

    Returns:
        The decoded first sequence returned by `generate` (special tokens
        removed). NOTE(review): for causal LMs this normally includes the
        prompt text as well as the answer; confirm downstream code expects it.

    Raises:
        ValueError: If `max_length` leaves no room for a prompt.
    """
    prompt_budget = max_length - max_new_tokens
    if prompt_budget <= 0:
        raise ValueError(
            f"max_length ({max_length}) must be greater than max_new_tokens ({max_new_tokens})"
        )

    context = "\n\n".join(retrieved_docs)
    # first run:
    # input_text = f"Read the following context and answer the question using only information within the context. Start your answer with 'Answer':\n\n{context}\n\nQuestion: {query}"
    # second run: also changed to top 3 returned from docs
    input_text = _build_rag_prompt(context, query)
    inputs = tokenizer(input_text, return_tensors='pt')

    overflow = inputs['input_ids'].shape[1] - prompt_budget
    if overflow > 0:
        # Trim only the context so the question/instruction at the end survive.
        context_ids = tokenizer(context, add_special_tokens=False)['input_ids']
        kept = max(len(context_ids) - overflow, 0)
        context = tokenizer.decode(context_ids[:kept], skip_special_tokens=True)
        input_text = _build_rag_prompt(context, query)
        # Decoding and re-encoding can shift the token count slightly, so keep
        # a hard cap as a backstop.
        inputs = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=prompt_budget)

    # Follow the model's device instead of assuming CUDA.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        # NOTE(review): top_p / top_k / temperature are sampling options; without
        # do_sample=True they appear to have no effect (greedy decoding) - confirm
        # the intended decoding mode before changing them.
        output_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_p=0.95,
            top_k=25,
            temperature=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# def generate_answer_with_context(model, tokenizer, query, retrieved_docs, max_new_tokens=100, max_length=1024, num_return_sequences=1):
#     context = "\n\n".join(retrieved_docs)
#     input_text = f"Context:\n{context}\n\nQuestion: {query}\n\nPlease provide a detailed answer to the question above based on the context provided."

#     # Tokenize the input and check the length
#     inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=False).to('cuda')
    
#     if inputs['input_ids'].shape[1] > max_length:
#         # Manually truncate the context to fit within max_length
#         context_tokens = tokenizer(context, return_tensors='pt')['input_ids'][0]
#         question_tokens = tokenizer(query, return_tensors='pt')['input_ids'][0]
#         max_context_length = max_length - len(question_tokens) - 20  # Allow space for question and special tokens
#         truncated_context_tokens = context_tokens[:max_context_length]
#         truncated_context = tokenizer.decode(truncated_context_tokens, skip_special_tokens=True)
#         input_text = f"Context:\n{truncated_context}\n\nQuestion: {query}\n\nPlease provide a detailed answer to the question above based on the context provided."
#         inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=False).to('cuda')

#     with torch.no_grad():
#         output_ids = model.generate(
#             input_ids=inputs['input_ids'], 
#             attention_mask=inputs['attention_mask'],
#             max_new_tokens=max_new_tokens, 
#             num_return_sequences=num_return_sequences, 
#             no_repeat_ngram_size=2, 
#             top_p=0.9, 
#             top_k=50,
#             temperature=0.7,
#             pad_token_id=tokenizer.eos_token_id
#         )

#     generated_texts = [tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids]
    
#     # Post-process the generated texts to remove the input context and question
#     answers = [generated_text.replace(input_text, '').strip() for generated_text in generated_texts]
#     return answers[0] if num_return_sequences == 1 else answers

In [10]:
# Pool of every chunk text across all chunk sizes; the experiment cell samples
# the extra (non-target) documents from this list.
all_documents = []
for size in chunk_sizes:
    size_dir = os.path.join(chunked_documents_dir, str(size))
    # os.listdir returns names in arbitrary order. Sort them so the pool order,
    # and with it the seeded random sampling done later, is reproducible.
    for filename in sorted(os.listdir(size_dir)):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
            all_documents.append(file.read())


In [11]:
seed = 498


def _load_existing_answers(path):
    """Return the entries already saved at `path`, or an empty list if the file does not exist."""
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def _save_answers(path, entries):
    """Write `entries` to `path` as JSON through a temporary file and a rename."""
    # An interrupt while writing then only damages the temporary file, not the
    # results that a later run reloads.
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as file:
        json.dump(entries, file, indent=4)
    os.replace(tmp_path, path)


# For every model and chunk size: read the generated questions, rebuild the
# collection with the target chunk plus random documents, retrieve, generate an
# answer for each collection size, and save after every question.
for model_name, model_path in models_to_compare.items():
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, token=os.getenv("HUGGINGFACE_TOKEN"))
    model = AutoModelForCausalLM.from_pretrained(model_path, token=os.getenv("HUGGINGFACE_TOKEN"), device_map="cuda", torch_dtype="auto")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"Model {model_name} loaded.")

    for size in chunk_sizes:
        size_dir = os.path.join(chunked_documents_dir, str(size))
        output_json = f'/home/ubuntu/Desktop/capstone/3-{model_name}_q_a_{size}.json'
        input_csv = f'/home/ubuntu/Desktop/capstone/cleaned_questions/generated_questions_{size}.csv'

        # Raise instead of calling exit(): inside a notebook kernel exit() is
        # not a reliable way to stop the running cell, and continuing without a
        # freshly loaded `df` would be wrong.
        if not os.path.exists(input_csv):
            raise FileNotFoundError(f"Question CSV not located! Expected: {input_csv}")
        df = pd.read_csv(input_csv)

        # Resume support: reload what an interrupted run already saved and skip
        # those (file, question) pairs instead of appending them a second time.
        output_data = _load_existing_answers(output_json)
        completed = {
            (saved.get('Filename'), saved.get('Generated Question'))
            for saved in output_data
        }

        # Generate answers for all entries
        for _, row in df.iterrows():
            filename = row['Filename']
            if not (isinstance(filename, str) and filename.endswith('.txt')):
                continue

            with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
                chunk = file.read()

            try:
                question = row['Generated Question']
                if (filename, question) in completed:
                    continue

                entry = {
                    'Filename': filename,
                    'Chunk Size': size,
                    'Generated Question': question,
                    'Answers': {}
                }

                for num_docs in [1, 3, 5, 10]:
                    # The target chunk is always present; the rest are random.
                    num_random_docs = num_docs - 1

                    # Reset and add documents to ChromaDB
                    reset_and_add_documents(chunk, all_documents, num_random_docs, collection, seed)

                    # Retrieve documents and generate the answer
                    retrieved_docs = retrieve_documents(question, collection, num_docs)
                    if not retrieved_docs:
                        print(f"No documents retrieved for {filename} with chunk size {size} using model {model_name} and {num_docs} docs")
                        continue

                    answer = generate_answer_with_context(model, tokenizer, question, retrieved_docs, max_length=1024)
                    entry['Answers'][f'{num_docs} Docs'] = answer
                    print(f"Generated answer for {filename} with chunk size {size} using model {model_name} and {num_docs} docs")

                output_data.append(entry)

                # Save the updated JSON after each answer
                _save_answers(output_json, output_data)

                torch.cuda.empty_cache()

            except Exception as e:
                print(f"Error generating answer for {filename} with model {model_name}: {e}")

        print(f"Updated answers saved to {output_json} for model {model_name}.")

Model openai-community loaded.




Generated answer for aa23-213a_chunk_10.txt with chunk size 128 using model openai-community and 1 docs
Generated answer for aa23-213a_chunk_10.txt with chunk size 128 using model openai-community and 3 docs
Generated answer for aa23-213a_chunk_10.txt with chunk size 128 using model openai-community and 5 docs
Generated answer for aa23-213a_chunk_10.txt with chunk size 128 using model openai-community and 10 docs
Generated answer for aa23-347a_chunk_17.txt with chunk size 128 using model openai-community and 1 docs
Generated answer for aa23-347a_chunk_17.txt with chunk size 128 using model openai-community and 3 docs
Generated answer for aa23-347a_chunk_17.txt with chunk size 128 using model openai-community and 5 docs
Generated answer for aa23-347a_chunk_17.txt with chunk size 128 using model openai-community and 10 docs
Generated answer for aa23-158a_chunk_33.txt with chunk size 128 using model openai-community and 1 docs
Generated answer for aa23-158a_chunk_33.txt with chunk size 12

KeyboardInterrupt: 

First run's prompt:
input_text = f"Read the following context and answer the question using only information within the context.\
Start your answer with 'Answer':\n\n{context}\n\nQuestion: {query}"
In the first run, retrieval used top n = num_docs, i.e. every document in the collection was passed to the model as context.