In [1]:
import chromadb
import json
from tqdm import tqdm
import random
import torch
from transformers import pipeline, GPT2Tokenizer, T5Tokenizer
from torch.cuda.amp import autocast
import os
import csv
import pandas as pd

In [2]:
# Models under evaluation, keyed by publisher. Every entry gives the Hugging Face
# checkpoint ('path') and the transformers pipeline task ('pipeline') used to run it.
#
# Only the Intel checkpoint is enabled. Candidates that are currently switched off:
#   'openai-community'        gpt2-medium                              text-generation
#   'huggingface-distilbert'  distilbert-base-uncased-distilled-squad  question-answering
#   'Google-t5'               google-t5/t5-base                        text-generation
#   'FacebookAI'              roberta-base                             (no pipeline recorded)
models_to_compare = {
    'Intel': dict(path='Intel/dynamic_tinybert', pipeline='question-answering'),
}

In [3]:
# Credentials must not be hardcoded in a notebook: anything written here is visible
# to everyone who can read the file or its history. A token that was ever committed
# in this cell should be treated as leaked and revoked.
# Export HUGGINGFACE_TOKEN in the shell that launches Jupyter instead.
if not os.environ.get("HUGGINGFACE_TOKEN"):
    print("HUGGINGFACE_TOKEN is not set; export it before launching Jupyter if a gated or private model is needed.")

In [4]:
# Initialize Chroma client
# chromadb.Client() is called with no arguments, so no storage path is configured here.
chroma_client = chromadb.Client()
# Reuse the "documents" collection if it already exists, otherwise create it with this metadata.
# NOTE(review): "hnsw:M" follows Chroma's "hnsw:"-prefixed index settings, but 'ef' has no
# prefix -- presumably "hnsw:construction_ef" / "hnsw:search_ef" was intended; confirm that
# the bare 'ef' key has any effect on the index.
collection = chroma_client.get_or_create_collection(name="documents",metadata={'ef':2048, "hnsw:M":4096})

# Function to add documents to the collection
def add_documents_to_chromadb(documents, collection, batch_size=1000):
    """Add ``documents`` to ``collection``, using each document's position as its id.

    Documents are sent in batches rather than one ``collection.add`` call per
    document, which removes the per-call overhead from the evaluation loop.

    Args:
        documents: Iterable of document strings. Ids are ``"0"``, ``"1"``, ... in
            iteration order, exactly as before.
        collection: Object exposing ``add(ids=..., documents=...)``.
        batch_size: Maximum number of documents per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    documents = list(documents)
    # An empty input produces an empty range, so add() is never called with empty lists.
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        collection.add(
            ids=[str(doc_id) for doc_id in range(start, start + len(batch))],
            documents=batch,
        )
        

In [5]:
# Root folder of the pre-chunked corpus; it holds one sub-folder per chunk size.
chunked_documents_dir = '/home/ubuntu/Desktop/capstone/chunked_data'

# Chunk sizes to evaluate: successive doublings 128, 256, 512, 1024, 2048.
# Start the range later (e.g. at 512 or 1024) to re-run only the larger sizes.
chunk_sizes = [128 * 2 ** step for step in range(5)]

In [6]:
def retrieve_documents(query, collection, num_docs, top_k=2):
    """Return the documents in ``collection`` that best match ``query``.

    The number of results used to be forced to 2 by overwriting ``num_docs``
    inside the function. That constant is now the ``top_k`` default, so existing
    callers still get the top 2, while the limit can be changed per call.

    Args:
        query: Question text to search with.
        collection: Object exposing ``query(query_texts=..., n_results=...)``.
        num_docs: Number of documents currently stored in the collection; the
            request is capped at this value so it never asks for more than exist.
        top_k: Maximum number of documents to retrieve.

    Returns:
        A flat list of the retrieved document strings.
    """
    # Never request more than the pool holds, and never fewer than one result.
    n_results = max(1, min(top_k, num_docs))
    results = collection.query(query_texts=[query], n_results=n_results)
    # 'documents' holds one list per query text; flatten it into a single list.
    return [doc for sublist in (results['documents'] or []) for doc in sublist]

In [7]:
def reset_and_add_documents(correct_doc, all_docs, num_random_docs, collection, seed):
    """Rebuild ``collection`` from ``correct_doc`` plus randomly sampled documents.

    The collection is emptied, the global ``random`` generator is re-seeded with
    ``seed``, ``num_random_docs`` documents are sampled from ``all_docs``, and the
    result is inserted with ``correct_doc`` first.
    """
    # Start from an empty collection.
    clear_collection(collection)

    # Re-seeding on every call makes the draw depend only on seed, pool and sample size.
    random.seed(seed)
    sampled_docs = random.sample(all_docs, num_random_docs)

    batch = [correct_doc]
    batch.extend(sampled_docs)
    add_documents_to_chromadb(batch, collection)

In [8]:
def clear_collection(collection):
    """Delete every record currently stored in ``collection``."""
    stored_ids = collection.get().get('ids', [])
    # Nothing stored: skip the delete call entirely.
    if len(stored_ids) == 0:
        return
    collection.delete(ids=stored_ids)
    

In [9]:
def generate_answer_with_qa_pipeline(pipeline, query, retrieved_docs, max_length=512):
    """Answer ``query`` from ``retrieved_docs`` with an extractive QA pipeline.

    The previous version tokenized with ``truncation=True`` and then tested whether
    the result was longer than ``max_length``. That was never true, so the fallback
    branch always ran and passed the whole "question: ... context: ..." prompt as
    the context. The model could then extract its "answer" from the question text.
    The retrieved documents alone are now used as the context.

    Args:
        pipeline: Callable accepting ``question=``, ``context=`` and
            ``max_seq_len=`` and returning a mapping with an ``'answer'`` key.
        query: Question text.
        retrieved_docs: List of document strings; joined with blank lines.
        max_length: Forwarded as ``max_seq_len``, the per-window token limit for
            question plus context. It must not exceed what the model supports.

    Returns:
        The extracted answer with surrounding whitespace removed, or ``""`` when
        there is no context to search.
    """
    context = "\n\n".join(retrieved_docs)
    # With no context there is nothing to extract an answer from, so skip the call.
    if not context.strip():
        return ""

    # Long contexts are left to the pipeline's own windowing rather than being cut
    # off here, which could discard the passage containing the answer.
    response = pipeline(question=query, context=context, max_seq_len=max_length)
    return response['answer'].strip()

# Function to generate answer with context using text-generation pipeline
def generate_answer_with_text_generation_pipeline(pipeline, query, retrieved_docs, max_length=512, max_new_tokens=100):
    """Answer ``query`` from ``retrieved_docs`` with a text-generation pipeline.

    Fixes over the previous version:
      * the pipeline now receives the prompt string; before, it was handed the
        tokenizer's encoded output and the prompt that had been built was unused;
      * the context is trimmed so the prompt fits in about ``max_length`` tokens;
      * the answer is the generated continuation itself, not a slice located with
        ``find('Answer:')``, which returned garbage when the marker was missing.

    Args:
        pipeline: Callable taking a prompt string plus ``max_new_tokens`` and
            ``return_full_text`` keywords, and exposing ``.tokenizer`` with
            ``encode`` / ``decode``.
        query: Question text.
        retrieved_docs: List of document strings; joined with blank lines.
        max_length: Approximate token budget for the whole prompt.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated continuation with surrounding whitespace removed.
    """
    tokenizer = pipeline.tokenizer
    context = "\n\n".join(retrieved_docs)

    def build_prompt(context_text):
        return f"Question: {query}\nContext: {context_text}\nAnswer:"

    # Tokens used by everything except the context (labels plus the question).
    scaffold_tokens = len(tokenizer.encode(build_prompt(""), add_special_tokens=False))
    context_budget = max(max_length - scaffold_tokens, 0)

    context_ids = tokenizer.encode(context, add_special_tokens=False)
    if len(context_ids) > context_budget:
        # Keep the leading part of the context; the question and the "Answer:" cue stay intact.
        context = tokenizer.decode(context_ids[:context_budget], skip_special_tokens=True)

    prompt = build_prompt(context)

    # Ask only for the continuation so the prompt does not have to be stripped afterwards.
    response = pipeline(prompt, max_new_tokens=max_new_tokens, return_full_text=False)
    generated = response[0]['generated_text']

    # Fallback for a pipeline that still echoes the prompt.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]

    return generated.strip()

''

In [10]:
# Load every chunk at every chunk size into one list.
all_documents = []
for size in chunk_sizes:
    size_dir = os.path.join(chunked_documents_dir, str(size))
    # os.listdir returns names in arbitrary order. Sorting fixes the order of this
    # list, so a seeded random.sample() over it picks the same documents on every
    # machine and every run.
    for filename in sorted(os.listdir(size_dir)):
        if filename.endswith('.txt'):
            with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
                all_documents.append(file.read())


In [None]:
seed = 498

# Run on the first GPU when one is visible; otherwise fall back to CPU (-1)
# instead of failing on a hard-coded device=0.
device = 0 if torch.cuda.is_available() else -1

# Process each chunk size separately for each model
for model_key, model_info in models_to_compare.items():
    model_name = model_info['path']
    model_save = model_name.split('/')[-1]
    pipeline_type = model_info['pipeline']

    # Both task types load the same way; only the answer function differs.
    nlp_pipeline = pipeline(pipeline_type, model=model_name, tokenizer=model_name, device=device)
    if pipeline_type == 'question-answering':
        generate_answer = generate_answer_with_qa_pipeline
    else:
        # Set the pad token on the tokenizer the pipeline actually uses. Setting it
        # on a separately loaded tokenizer has no effect on generation.
        if nlp_pipeline.tokenizer.pad_token is None:
            nlp_pipeline.tokenizer.pad_token = nlp_pipeline.tokenizer.eos_token
        generate_answer = generate_answer_with_text_generation_pipeline

    print(f"Pipeline for {model_name} loaded.")

    for size in chunk_sizes:
        size_dir = os.path.join(chunked_documents_dir, str(size))
        output_json = f'/home/ubuntu/Desktop/capstone/2-{model_save}_q_a_{size}.json'
        # Load existing JSON if it exists
        if os.path.exists(output_json):
            with open(output_json, 'r', encoding='utf-8') as file:
                output_data = json.load(file)
        else:
            output_data = []

        # Rows already present in the output file. They are skipped below so that
        # resuming an interrupted run does not append duplicate entries.
        already_answered = {
            (item.get('Filename'), item.get('Generated Question'))
            for item in output_data
            if isinstance(item, dict)
        }

        # Load existing CSV into DataFrame
        input_csv = f'/home/ubuntu/Desktop/capstone/cleaned_questions/generated_questions_{size}.csv'
        if os.path.exists(input_csv):
            df = pd.read_csv(input_csv)
        else:
            df = pd.DataFrame(columns=['Filename', 'Chunk Size', 'Generated Question', 'Generated Answer'])

        # Generate answers for all entries
        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f'Processing {model_name} for chunk size {size}', miniters=10):
            filename = row['Filename']
            # Rows without a usable .txt filename cannot be matched to a chunk.
            if not (isinstance(filename, str) and filename.endswith('.txt')):
                continue

            try:
                question = row['Generated Question']
                if (filename, question) in already_answered:
                    continue

                # Read inside the try block so one missing chunk file is reported
                # and skipped like any other per-row failure.
                with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
                    chunk = file.read()

                entry = {
                    'Filename': filename,
                    'Chunk Size': size,
                    'Generated Question': question,
                    'Answers': {}
                }

                # for num_docs in [1, 3, 5, 10]:
                for num_docs in [3, 5, 10]:
                    # The correct chunk takes one slot; the rest are random documents.
                    num_random_docs = max(num_docs - 1, 0)

                    # Reset and add documents to ChromaDB
                    reset_and_add_documents(chunk, all_documents, num_random_docs, collection, seed)

                    # Retrieve documents and generate the answer
                    retrieved_docs = retrieve_documents(question, collection, num_docs)
                    if not retrieved_docs:
                        print(f"No documents retrieved for {filename} with chunk size {size} using model {model_name} and {num_docs} docs")
                        continue

                    answer = generate_answer(nlp_pipeline, question, retrieved_docs)
                    entry['Answers'][f'{num_docs} Docs'] = answer

                output_data.append(entry)
                already_answered.add((filename, question))

                # Checkpoint after every row. Writing to a temporary file and then
                # renaming it means an interruption mid-write cannot leave a
                # truncated JSON file that would fail to load on resume.
                tmp_json = output_json + '.tmp'
                with open(tmp_json, 'w', encoding='utf-8') as file:
                    json.dump(output_data, file, indent=4)
                os.replace(tmp_json, output_json)

            except Exception as e:
                # Best effort: report the failing row and carry on with the next one.
                print(f"Error generating answer for {filename} with model {model_name}: {e}")

        print(f"Updated answers saved to {output_json} for model {model_name}.")

Pipeline for Intel/dynamic_tinybert loaded.


Processing Intel/dynamic_tinybert for chunk size 128:   0%| | 0/3043 [00:00<?, ?You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Intel/dynamic_tinybert for chunk size 128:  52%|▌| 1593/3043 [34:26<4

Prompt used in the first runs:
input_text = f"Read the following context and answer the question using only information within the context.\
Start your answer with 'Answer':\n\n{context}\n\nQuestion: {query}"
with top-n retrieval set to n = num_docs, i.e. all documents in the collection were passed as context.