In [4]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Initialize the T5 model and tokenizer (fine-tuned for QA generation)
model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_question_answer(text):
    """
    Function to generate Question-Answer pairs from a given text using the T5 model.
    """
    # Custom prompt for generating a question and its answer
    prompt = f"""Based on the following text, generate a single-line question and its answer. 
    The question should be specific and answerable from the given text.
    Example: 
    Text: The Eiffel Tower, located in Paris, France, was completed in 1889. It stands at a height of 324 meters and was the tallest man-made structure in the world for 41 years until the Chrysler Building in New York City was built in 1930.
Q: In what year was the Eiffel Tower completed?
A: The Eiffel Tower was completed in 1889.
    Now, generate a question and answer for the following text:
    Text: {text}"""
    
    # Preprocess the text with the custom prompt
    input_text = f"{prompt} </s>"
    
    # Tokenize the input and generate questions
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    
    # Decode the generated output (both question and answer in one output)
    qa_pair = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return qa_pair

def process_file(file_path, file_type="txt"):
    """
    Function to process either a text file or CSV file and generate QA pairs.
    """
    if file_type == "txt":
        with open(file_path, 'r') as file:
            text = file.read()
            qa_pairs = []
            sentences = text.split('\n')
            for sentence in sentences:
                print(sentence)
                # sentence is not start with #, URL, or empty
                if sentence and not sentence.startswith('#') and not sentence.startswith('URL') and not sentence.startswith('\n'):
                    if len(sentence) > 20:
                        qa_pair = generate_question_answer(sentence.strip())
                        #qa_pair = None
                        qa_pairs.append(qa_pair)
            return qa_pairs
    
    elif file_type == "csv":
        df = pd.read_csv(file_path)
        qa_pairs = []
        for _, row in df.iterrows():
            text = row['text_column_name']  # Change 'text_column_name' to your column name
            qa_pair = generate_question_answer(text)
            print(qa_pair)
            qa_pairs.append(qa_pair)
        return qa_pairs

# Example usage:
file_path = "../raw_data/MusicAndCulture/bananasplitfest.txt"
file_type = "txt"  # or 'csv'

qa_pairs = process_file(file_path, file_type)

# Output the generated QA pairs
for idx, qa in enumerate(qa_pairs):
    print(f"QA Pair {idx+1}: {qa}")




URL: https://bananasplitfest.com/


Token indices sequence length is longer than the specified maximum sequence length for this model (671 > 512). Running this sequence through the model will result in indexing errors



The Robindale Great American Banana Split Celebration It's the Sweetest Thing to Come from Latrobe! August 23-24, 2025 Don't wait to plan! IT WILL BE HERE BEFORE YOU KNOW IT! We thank The City of Latrobe and the Latrobe Public Works Department for their financial contributions and annual support across on street closings, donations of roll off boxes, and so much more! Vendors Sign up to be a featured part of the fun! We love supporting local vendors. Come be part of one of the biggest events of the year! Sponsors The Great American Banana Split Celebration couldn’t have created this event without the help of our wonderful sponsors. Check out some of the amazing talent that will be performing at the festival! Featured Entertainer Lorem ipsum dolor sit amet, consectetur adipiscing elit. Featured Entertainer​ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Featured Entertainer​ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Activities We have activities for the whole f

KeyboardInterrupt: 