In [2]:
import os
import csv
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [17]:
f = open("secrets.txt", 'r')
secrets = f.readlines()
hf_token = 0
for line in secrets:
    token = line.find("HUGGINGFACE_TOKEN")
    if token == 0:
        hf_token = str(line.split("HUGGINGFACE_TOKEN=")[1])


In [3]:
chunked_documents_dir = '/home/ubuntu/Desktop/capstone/chunked_data/'
chunk_sizes = [128,256,512,1024,2048]

output_csv = 'generated_questions.csv'

In [4]:
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2b-it', token=hf_token, device_map='auto',\
                                            torch_dtype=torch.bfloat16)



`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.20it/s]


In [5]:
def generate_question(text, max_new_tokens=100):
    # Prompt to generate question
    input_text = (f"Read the following passage and generate a relevant question that can be found \
                    within the content:\n\n{text}\n\nQuestion:",text)
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to('cuda')
    
    # Generate question
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens, 
            num_return_sequences=1, 
            no_repeat_ngram_size=2, 
            top_p=0.95, 
            top_k=25,
            temperature=1,
        )
    
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Extract the question and answer parts (if provided)
    question_start = generated_text.find("Question:") + len("Question:")
    question = generated_text[question_start:].strip()
    
    return question

In [5]:
# test generation and output formatting
# input = "Summary This joint advisory is the result of a collaborative research effort by the cybersecurity authorities of five nations: Australia,[1] Canada,[2] New Zealand,[3][4] the United Kingdom,[5] and the United States.[6] It highlights technical approaches to uncovering malicious activity and includes mitigation steps according to best practices. The purpose of this report is to enhance incident response among partners and network administrators along with serving as a playbook for incident investigation. Key Takeaways When addressing potential incidents and applying best practice incident response procedures: First, collect and remove for further analysis: Relevant artifacts, Logs, and Data. Next, implement mitigation steps that avoid tipping off the adversary that their presence in the network has been discovered. Finally, consider soliciting incident response support from a third-party IT security organization to: Provide"
# question = generate_question_and_answer(input)

# print(question)

In [6]:
for size in chunk_sizes:
    size_dir = os.path.join(chunked_documents_dir, str(size))
    
    # Create a CSV file for the current chunk size
    output_csv = f'generated_questions_{size}.csv'
    
    # Open the CSV file in write mode
    # Updating the csv after each generated question so less information is stored in mem
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Chunk Size', 'Generated Question'])
        
        # Process each chunk one by one
        for filename in os.listdir(size_dir):
            if filename.endswith('.txt'):
                with open(os.path.join(size_dir, filename), 'r', encoding='utf-8') as file:
                    chunk = file.read()
                
                try:
                    question = generate_question_and_answer(chunk)
                    writer.writerow([filename, size, question])
                    # commented out this line for brevity on output
                    #print(f"Generated question and answer for {filename} with chunk size {size}")
                    
                    # Clear GPU cache
                    torch.cuda.empty_cache()
                    
                except Exception as e:
                    print(f"Error generating question and answer for {filename}: {e}")

    print(f"Generated questions and answers saved to {output_csv}.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Generated question and answer for aa23-213a_chunk_10.txt with chunk size 128
Generated question and answer for aa23-347a_chunk_17.txt with chunk size 128
Generated question and answer for aa23-158a_chunk_33.txt with chunk size 128
Generated question and answer for aa23-278a_chunk_6.txt with chunk size 128
Generated question and answer for aa23-278a_chunk_66.txt with chunk size 128
Generated question and answer for aa20-301a_chunk_23.txt with chunk size 128
Generated question and answer for aa24-109a_chunk_24.txt with chunk size 128
Generated question and answer for ar23-209a_chunk_3.txt with chunk size 128
Generated question and answer for icsa-24-074-11_chunk_1.txt with chunk size 128
Generated question and answer for ar23-243a_chunk_17.txt with chunk size 128
Generated question and answer for ar23-243a_chunk_44.txt with chunk size 128
Generated question and answer for aa23-263a_chunk_10.txt with chunk size 128
Generated question and answer for aa23-144a_chunk_36.txt with chunk size 1

KeyboardInterrupt: 