In [2]:
import os
import csv
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
f = open("secrets.txt", 'r')
secrets = f.readlines()
hf_token = 0
for line in secrets:
    token = line.find("HUGGINGFACE_TOKEN")
    if token == 0:
        hf_token = str(line.split("HUGGINGFACE_TOKEN=")[1])

In [4]:
chunked_documents_dir = '/home/ubuntu/Desktop/capstone/chunked_data/'
cleaned_questions_dir = '/home/ubuntu/Desktop/capstone/cleaned_questions/'
chunk_sizes = [128,256,512,1024,2048]

output_csv = 'generated_q_a.csv'

In [5]:
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2b-it', token="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", device_map='auto',\
                                            torch_dtype=torch.bfloat16)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.23it/s]


In [6]:
def generate_answer(text, question, max_new_tokens=100):
    # Prepare the input text
    input_text = (f"Read the following passage and the question using only information \
                    within the passage:\n\n{text}\n\nQuestion:{question}\n\nAnswer:",text, question)
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to('cuda')
    
    # Generate the question
    with torch.no_grad():
        # Could potentially add 'with autocast():' for better performance
        output_ids = model.generate(
            input_ids=inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens, 
            num_return_sequences=1, 
            no_repeat_ngram_size=2, 
            top_p=0.95, 
            top_k=25,
            temperature=1,
        )
    
    # Decode the output
    answer_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Extract the question and answer parts
    answer_start = answer_text.find("Answer:") + len("Answer:")
    answer = answer_text[answer_start:].strip()
    
    return answer

In [7]:
# Process each chunk size separately
for size in chunk_sizes:
    size_dir = os.path.join(chunked_documents_dir, str(size))
    output_csv = f'/home/ubuntu/Desktop/capstone/complete_q_a_{size}.csv'
    # Load existing CSV into DataFrame
    input_csv = f'/home/ubuntu/Desktop/capstone/cleaned_questions/generated_questions_{size}.csv'
    if os.path.exists(input_csv):
        df = pd.read_csv(input_csv)
    else:
        df = pd.DataFrame(columns=['Filename', 'Chunk Size', 'Generated Question', 'Generated Answer'])
    
    # Filter the DataFrame to get filenames without answers
    incomplete_entries = df[df['Generated Answer'].isnull() | (df['Generated Answer'].str.strip() == '')]
    
    # Generate answers for the incomplete entries
    for index, row in incomplete_entries.iterrows():
        filename = row['Filename']
        if isinstance(filename, str) and filename.endswith('.txt'):
            with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
                chunk = file.read()
            
            try:
                question = row['Generated Question']
                # Generate the answer
                answer = generate_answer(chunk, question)
                df.at[index, 'Generated Answer'] = answer
                print(f"Generated answer for {filename} with chunk size {size}")
                
                # Save the updated DataFrame back to the CSV file after each answer is generated
                df.to_csv(output_csv, index=False)
                
                # Clear GPU cache
                torch.cuda.empty_cache()
                
            except Exception as e:
                print(f"Error generating answer for {filename}: {e}")

    print(f"Updated answers saved to {output_csv}.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Generated answer for aa23-213a_chunk_10.txt with chunk size 128
Generated answer for aa23-347a_chunk_17.txt with chunk size 128
Generated answer for aa23-158a_chunk_33.txt with chunk size 128
Generated answer for aa23-278a_chunk_6.txt with chunk size 128
Generated answer for aa23-278a_chunk_66.txt with chunk size 128
Generated answer for aa20-301a_chunk_23.txt with chunk size 128
Generated answer for ar23-209a_chunk_3.txt with chunk size 128
Generated answer for icsa-24-074-11_chunk_1.txt with chunk size 128
Generated answer for ar23-243a_chunk_17.txt with chunk size 128
Generated answer for ar23-243a_chunk_44.txt with chunk size 128
Generated answer for aa23-263a_chunk_10.txt with chunk size 128
Generated answer for aa23-144a_chunk_36.txt with chunk size 128
Generated answer for aa23-074a_chunk_35.txt with chunk size 128
Generated answer for aa22-320a_chunk_9.txt with chunk size 128
Generated answer for aa20-245a_chunk_22.txt with chunk size 128
Generated answer for aa21-062a_chunk_6.

AttributeError: 'float' object has no attribute 'endswith'