In [1]:
import chromadb
import json
from tqdm import tqdm
import random
import torch
from transformers import pipeline, GPT2ForQuestionAnswering, T5Tokenizer
from torch.cuda.amp import autocast
import os
import csv
import pandas as pd

In [2]:
# Models evaluated by the main loop. Only the values (Hugging Face model ids)
# are passed to the pipeline and used for output filenames; the keys look like
# the publishing organisation and serve as labels.
# Uncomment an entry to include that model in the comparison.
models_to_compare = {
    "openai-community": "gpt2-medium",
    # "FacebookAI": "roberta-base",
    # "huggingface-distilbert": "distilbert-base-uncased-distilled-squad",
    # "Intel": "Intel/dynamic_tinybert",
    # "google-t5": "google-t5/t5-base",
}

In [3]:
# Placeholder only -- never commit a real token in this cell.
# setdefault keeps a token that is already exported in the environment instead
# of overwriting it with the placeholder on every run.
# NOTE(review): nothing in the visible cells reads HUGGINGFACE_TOKEN; confirm
# which variable name the Hugging Face libraries in use actually expect.
os.environ.setdefault("HUGGINGFACE_TOKEN", "<YOUR_TOKEN_HERE>")

In [4]:
# Create the Chroma client and the single "documents" collection that every
# retrieval experiment below clears and refills.
# NOTE(review): "ef" lacks the "hnsw:" prefix that the "hnsw:M" key carries --
# confirm Chroma actually interprets it as an index setting.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(
    name="documents",
    metadata={"ef": 2048, "hnsw:M": 4096},
)

def add_documents_to_chromadb(documents, collection):
    """Add ``documents`` to ``collection`` with a single batched ``add`` call.

    Each document is stored under its position in ``documents`` rendered as a
    string ("0", "1", ...), exactly as the previous one-call-per-document loop
    did. Batching avoids one ``collection.add`` round-trip per document.

    Args:
        documents: Iterable of document texts. It is materialised into a list,
            so generators are accepted.
        collection: Object exposing ``add(ids=..., documents=...)``.

    Returns:
        None. When ``documents`` is empty no call is made at all.
    """
    documents = list(documents)
    if not documents:
        # Nothing to store; skip the call rather than send empty id/document lists.
        return
    # NOTE(review): very large inputs may need to be split into several calls --
    # check the client's maximum batch size before reusing this beyond small sets.
    ids = [str(doc_id) for doc_id in range(len(documents))]
    collection.add(ids=ids, documents=documents)
        

In [5]:
# Root folder of the pre-chunked corpus. Later cells join it with str(size),
# so it is expected to contain one sub-directory per chunk size.
chunked_documents_dir = "/home/ubuntu/Desktop/capstone/chunked_data"

# Chunk sizes to evaluate; each one is used as a sub-directory name and in the
# question-CSV / output-JSON filenames.
chunk_sizes = [
    128,
    256,
    512,
    1024,
    2048,
]
# Shorter alternative for quick runs:
# chunk_sizes = [512, 1024, 2048]

In [6]:
def retrieve_documents(query, collection, num_docs, top_k=2):
    """Return the documents in ``collection`` that best match ``query``.

    The earlier version overwrote ``num_docs`` with a hard-coded 2, so the
    argument was silently ignored. The cap is now an explicit keyword whose
    default keeps that behaviour, and ``num_docs`` bounds the request so it
    never asks for more results than the caller says are available.

    Args:
        query: Question text to search with.
        collection: Object exposing ``query(query_texts=..., n_results=...)``
            that returns a mapping with a ``'documents'`` list of lists.
        num_docs: Upper bound on the number of results (the main loop passes
            the number of documents it just loaded into the collection).
        top_k: Maximum number of documents to retrieve. Defaults to 2, the
            previously hard-coded value. Pass ``None`` to retrieve ``num_docs``
            documents.

    Returns:
        A flat list of document texts; empty when fewer than one result is
        requested.
    """
    n_results = num_docs if top_k is None else min(top_k, num_docs)
    if n_results < 1:
        # Nothing to ask for; callers already treat an empty list as "no hits".
        return []
    results = collection.query(query_texts=[query], n_results=n_results)
    # 'documents' holds one inner list per query text; flatten it.
    return [doc for docs_for_query in results['documents'] for doc in docs_for_query]

In [7]:
def reset_and_add_documents(correct_doc, all_docs, num_random_docs, collection, seed):
    """Refill ``collection`` with ``correct_doc`` plus randomly sampled documents.

    The collection is emptied first. The global ``random`` generator is then
    re-seeded with ``seed``, so for the same ``all_docs`` and
    ``num_random_docs`` the same documents are sampled on every call.
    ``correct_doc`` is always stored first (id "0").

    The sample is drawn from ``all_docs`` as-is, so it can contain
    ``correct_doc`` again if that text is present in ``all_docs``.

    Args:
        correct_doc: Document text that answers the current question.
        all_docs: Sequence of document texts to sample the others from.
        num_random_docs: How many documents to sample; ``random.sample``
            raises ``ValueError`` if this exceeds ``len(all_docs)``.
        collection: Collection to clear and repopulate.
        seed: Seed applied to the global ``random`` module before sampling.
    """
    clear_collection(collection)

    random.seed(seed)
    sampled_docs = random.sample(all_docs, num_random_docs)

    add_documents_to_chromadb([correct_doc, *sampled_docs], collection)

In [8]:
def clear_collection(collection):
    """Delete every document currently stored in ``collection``.

    Fetches the stored ids via ``collection.get()`` and removes them in one
    ``delete`` call. When the collection reports no ids, ``delete`` is not
    called at all.
    """
    stored_ids = collection.get().get('ids', [])
    if len(stored_ids) == 0:
        return
    collection.delete(ids=stored_ids)
    

In [9]:
def generate_answer_with_context(pipeline, query, retrieved_docs):
    """Generate an answer to ``query`` using ``retrieved_docs`` as context.

    The documents are joined with blank lines and placed after the question in
    a ``"question: ... context: ..."`` prompt, which is handed to ``pipeline``
    with a budget of 200 new tokens.

    Args:
        pipeline: Callable generation pipeline. Note that inside this function
            the name shadows the ``pipeline`` factory imported from
            ``transformers``. It must return a list whose first item has a
            ``'generated_text'`` entry.
        query: Question text.
        retrieved_docs: Iterable of context document texts.

    Returns:
        The first result's ``'generated_text'`` with surrounding whitespace
        removed.
    """
    # A blank line between documents keeps them distinguishable in the prompt.
    joined_docs = "\n\n".join(retrieved_docs)
    prompt = f"question: {query} context: {joined_docs}"

    outputs = pipeline(prompt, max_new_tokens=200)
    return outputs[0]['generated_text'].strip()
_

''

In [10]:
# Load the text of every .txt chunk, across all chunk sizes, into one list.
# This list is the pool the random extra documents are sampled from.
all_documents = []
for size in chunk_sizes:
    size_dir = os.path.join(chunked_documents_dir, str(size))
    # os.listdir returns entries in arbitrary order. Sorting makes the list
    # order (and therefore the seeded random.sample drawn from it) the same on
    # every machine and every run.
    for filename in sorted(os.listdir(size_dir)):
        if filename.endswith('.txt'):
            with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
                all_documents.append(file.read())


In [11]:
# Fixed seed: reset_and_add_documents re-seeds with it on every call, so the
# sampled extra documents are repeatable across questions and models.
seed = 498

# Use the first GPU when one is available; otherwise fall back to CPU (-1)
# instead of failing on hosts without CUDA.
pipeline_device = 0 if torch.cuda.is_available() else -1

for model_key, model_path in models_to_compare.items():
    model_name = model_path  # Use the value from the dictionary for filenames
    # Keep only the part after the last "/" so the name is safe in a filename.
    save_model = model_name.split("/")[-1]

    # NOTE(review): 'text2text-generation' targets encoder-decoder models
    # (e.g. T5); confirm it is the right task for every entry in
    # models_to_compare -- gpt2-medium is a decoder-only model.
    nlp_pipeline = pipeline('text2text-generation', model=model_path, tokenizer=model_path, device=pipeline_device)

    print(f"Pipeline for {model_name} loaded.")

    for size in chunk_sizes:
        size_dir = os.path.join(chunked_documents_dir, str(size))
        output_json = f'/home/ubuntu/Desktop/capstone/{save_model}_q_a_{size}.json'

        # Continue from an existing results file when there is one.
        # NOTE(review): entries are appended for every CSV row below, so
        # re-running a chunk size adds duplicates of rows that were already
        # answered -- confirm whether those rows should be skipped instead.
        if os.path.exists(output_json):
            with open(output_json, 'r', encoding='utf-8') as file:
                output_data = json.load(file)
        else:
            output_data = []

        # Pull questions from CSV so all models receive the same question
        input_csv = f'/home/ubuntu/Desktop/capstone/cleaned_questions/generated_questions_{size}.csv'
        if os.path.exists(input_csv):
            df = pd.read_csv(input_csv)
        else:
            # No questions for this size: an empty frame makes the loop a no-op.
            df = pd.DataFrame(columns=['Filename', 'Chunk Size', 'Generated Question', 'Generated Answer'])

        # Generate answers for each question
        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f'Processing {model_name} for chunk size {size}'):
            filename = row['Filename']
            # Skip rows whose filename is missing (NaN) or not a .txt chunk.
            if isinstance(filename, str) and filename.endswith('.txt'):
                try:
                    # Read the chunk inside the try so a missing or unreadable
                    # file is reported and skipped instead of aborting the run.
                    with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
                        chunk = file.read()

                    question = row['Generated Question']
                    entry = {
                        'Filename': filename,
                        'Chunk Size': size,
                        'Generated Question': question,
                        'Answers': {}
                    }
                    # Ask the question against collections of increasing size.
                    # for num_docs in [1, 3, 5, 10]:
                    for num_docs in [3, 5, 10]:
                        # The source chunk takes one slot; the rest are random.
                        num_random_docs = max(num_docs - 1, 0)

                        # Reset and add documents to ChromaDB
                        reset_and_add_documents(chunk, all_documents, num_random_docs, collection, seed)

                        # Retrieve documents and generate the answer
                        retrieved_docs = retrieve_documents(question, collection, num_docs)
                        if not retrieved_docs:
                            print(f"No documents retrieved for {filename} with chunk size {size} using model {save_model} and {num_docs} docs")
                            continue

                        answer = generate_answer_with_context(nlp_pipeline, question, retrieved_docs)
                        entry['Answers'][f'{num_docs} Docs'] = answer

                    output_data.append(entry)

                    # Rewrite the whole JSON after each question so progress
                    # survives an interrupted run.
                    with open(output_json, 'w', encoding='utf-8') as file:
                        json.dump(output_data, file, indent=4)

                except Exception as e:
                    # Best effort: report the failure and move on to the next row.
                    print(f"Error generating answer for {filename} with model {model_name}: {e}")

        print(f"Updated answers saved to {output_json} for model {model_name}.")

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pipeline for gpt2-medium loaded.


Processing gpt2-medium for chunk size 128:   0%|                                     | 1/3043 [00:00<28:55,  1.75it/s]

Error generating answer for aa23-213a_chunk_10.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 2/3043 [00:00<20:31,  2.47it/s]

Error generating answer for aa23-347a_chunk_17.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 3/3043 [00:01<18:07,  2.80it/s]

Error generating answer for aa23-158a_chunk_33.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 4/3043 [00:01<16:50,  3.01it/s]

Error generating answer for aa23-278a_chunk_6.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 5/3043 [00:01<16:01,  3.16it/s]

Error generating answer for aa23-278a_chunk_66.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 6/3043 [00:02<15:27,  3.27it/s]

Error generating answer for aa20-301a_chunk_23.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 7/3043 [00:02<15:10,  3.33it/s]

Error generating answer for aa24-109a_chunk_24.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 8/3043 [00:02<15:13,  3.32it/s]

Error generating answer for ar23-209a_chunk_3.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                     | 9/3043 [00:02<15:26,  3.27it/s]

Error generating answer for icsa-24-074-11_chunk_1.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|                                    | 10/3043 [00:03<15:23,  3.29it/s]

Error generating answer for ar23-243a_chunk_17.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|▏                                   | 11/3043 [00:03<14:35,  3.46it/s]

Error generating answer for ar23-243a_chunk_44.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|▏                                   | 13/3043 [00:03<12:34,  4.02it/s]

Error generating answer for aa23-263a_chunk_10.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment
Error generating answer for aa23-144a_chunk_36.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|▏                                   | 14/3043 [00:04<12:00,  4.20it/s]

Error generating answer for aa23-074a_chunk_35.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   0%|▏                                   | 15/3043 [00:04<12:06,  4.17it/s]

Error generating answer for aa22-320a_chunk_9.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   1%|▏                                   | 16/3043 [00:04<12:24,  4.07it/s]

Error generating answer for aa20-245a_chunk_22.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   1%|▏                                   | 17/3043 [00:04<12:00,  4.20it/s]

Error generating answer for aa21-062a_chunk_6.txt with model gpt2-medium: local variable 'truncated_context' referenced before assignment


Processing gpt2-medium for chunk size 128:   1%|▏                                   | 17/3043 [00:05<15:16,  3.30it/s]


KeyboardInterrupt: 

Prompt used in the first runs:
input_text = f"Read the following context and answer the question using only information within the context.\
Start your answer with 'Answer':\n\n{context}\n\nQuestion: {query}"
with top n = num_docs, i.e. every document in the collection was passed as context