In [None]:
# Import the Hugging Face GPT-2 model and tokenizer classes (Google Drive is mounted in the next cell)

from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer


In [None]:
# Mount Google Drive at /content/drive so the zipped model output stored
# under MyDrive can be read from the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Unzip the model output for usage

!unzip "/content/drive/MyDrive/gator_sched/model_output.zip" -d "/content/model_output"


Archive:  /content/drive/MyDrive/gator_sched/model_output.zip
   creating: /content/model_output/model_output/
   creating: /content/model_output/model_output/custom_q_and_a/
  inflating: /content/model_output/model_output/custom_q_and_a/training_args.bin  
  inflating: /content/model_output/model_output/custom_q_and_a/special_tokens_map.json  
   creating: /content/model_output/model_output/custom_q_and_a/checkpoint-20000/
  inflating: /content/model_output/model_output/custom_q_and_a/checkpoint-20000/training_args.bin  
  inflating: /content/model_output/model_output/custom_q_and_a/checkpoint-20000/config.json  
  inflating: /content/model_output/model_output/custom_q_and_a/checkpoint-20000/trainer_state.json  
  inflating: /content/model_output/model_output/custom_q_and_a/checkpoint-20000/generation_config.json  
  inflating: /content/model_output/model_output/custom_q_and_a/checkpoint-20000/model.safetensors  
  inflating: /content/model_output/model_output/custom_q_and_a/checkpoin

In [None]:
# Load model and tokenizer functions

def load_model(model_path):
    """Load a GPT-2 language-modeling model.

    Args:
        model_path: Location passed straight to
            ``GPT2LMHeadModel.from_pretrained``.

    Returns:
        Whatever ``GPT2LMHeadModel.from_pretrained`` returns for that path.
    """
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a GPT-2 tokenizer.

    Args:
        tokenizer_path: Location passed straight to
            ``GPT2Tokenizer.from_pretrained``.

    Returns:
        Whatever ``GPT2Tokenizer.from_pretrained`` returns for that path.
    """
    return GPT2Tokenizer.from_pretrained(tokenizer_path)



# Generate text based on model predictions

def generate_text(model, tokenizer, sequence, max_length):
    """Sample a continuation of ``sequence`` and return it as decoded text.

    Args:
        model: Object exposing ``generate(...)`` and ``config.eos_token_id``.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        sequence: Prompt text to encode and feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    prompt_ids = tokenizer.encode(sequence, return_tensors='pt')

    # Sampling configuration: top-k / nucleus sampling, with the model's
    # end-of-sequence id reused as the padding id.
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated = model.generate(prompt_ids, **sampling_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)



In [None]:
# def generate_text(model_path, sequence, max_length):

#     model = load_model(model_path)
#     tokenizer = load_tokenizer(model_path)
#     ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
#     final_outputs = model.generate(
#         ids,
#         do_sample=True,
#         max_length=max_length,
#         pad_token_id=model.config.eos_token_id,
#         top_k=50,
#         top_p=0.95,
#     )
#     print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# Read the questions from a text file, one question per line

def read_questions(file_path):
    """Read a questions file and return its lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line of the file. Line endings are kept
        as read, so callers are expected to strip each entry themselves.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so non-ASCII questions read the same on every machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

In [None]:
# def write_answers(model, tokenizer, questions, output_file_path, max_length=18):
#     with open(output_file_path, 'w') as file:
#         for question in questions:
#             answer = generate_text(model, tokenizer, question.strip(), max_length)
#             file.write(f"Q: {question.strip()} \n A: {answer}\n\n")


In [None]:
def _extract_answer(generated_text, question):
    """Pull the answer portion out of the text generated for ``question``.

    The answer is taken to start after the first "Answer:" marker that follows
    the question and to stop at the next "Question:" marker, if there is one.
    """
    question_pos = generated_text.find(question)
    # str.find returns -1 on a miss; fall back to scanning from the start
    # rather than doing arithmetic on -1.
    search_from = question_pos + len(question) if question_pos != -1 else 0

    marker = "Answer:"
    marker_pos = generated_text.find(marker, search_from)
    if marker_pos != -1:
        answer_start = marker_pos + len(marker)
    elif question_pos != -1:
        # No marker: treat whatever follows the echoed question as the answer.
        answer_start = search_from
    else:
        # Neither the question nor the marker can be located, so there is no
        # reliable way to split the text; return it whole.
        return generated_text.strip()

    answer_end = generated_text.find("Question:", answer_start)
    if answer_end == -1:
        answer_end = len(generated_text)
    return generated_text[answer_start:answer_end].strip()


def write_answers(model, tokenizer, questions, output_file_path, additional_length=15):
    """Generate an answer for each question and write Q/A pairs to a file.

    Args:
        model: Passed through to ``generate_text``.
        tokenizer: Passed through to ``generate_text``; its ``encode`` is also
            used to measure the question length in tokens.
        questions: Iterable of question strings. Surrounding whitespace is
            stripped and entries that are blank after stripping are skipped.
        output_file_path: File to (over)write with the results.
        additional_length: Token budget added on top of the question's own
            token count to form the ``max_length`` given to ``generate_text``.

    Each pair is written as ``"Q: <question> \\nA: <answer>\\n\\n"``.
    """
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for raw_question in questions:
            question = raw_question.strip()
            if not question:
                # Blank lines would otherwise be sent to the model as an
                # empty prompt.
                continue

            question_length = len(tokenizer.encode(question))
            max_length = question_length + additional_length
            generated_text = generate_text(model, tokenizer, question, max_length)

            answer = _extract_answer(generated_text, question)

            # Write question and answer to the file
            file.write(f"Q: {question} \nA: {answer}\n\n")

In [None]:
# def write_answers(model, tokenizer, questions, output_file_path, additional_length=30):
#     with open(output_file_path, 'w') as file:
#         for question in questions:
#             question_length = len(tokenizer.encode(question.strip()))
#             max_length = question_length + additional_length
#             generated_text = generate_text(model, tokenizer, question.strip(), max_length)

#             # Extract the answer from the generated text
#             answer_start = generated_text.find("Answer:") + len("Answer:")
#             answer_end = generated_text.find("Question:", answer_start)
#             answer = generated_text[answer_start:answer_end].strip() if answer_end != -1 else generated_text[answer_start:].strip()

#             file.write(f"Q: {question.strip()} \nA: {answer}\n\n")

In [None]:
# Load model and tokenizer
# Both are loaded from the same directory. "model_output" appears twice in the
# path because the archive's own top-level model_output/ folder was extracted
# into /content/model_output.
model_path = "/content/model_output/model_output/custom_q_and_a"
model = load_model(model_path)
tokenizer = load_tokenizer(model_path)

In [None]:
# Read questions and generate answers
# NOTE(review): nothing visible in this notebook creates /content/questions.txt;
# presumably it is uploaded to the Colab session beforehand — confirm.
questions_file_path = "/content/questions.txt" 
output_file_path = "/content/answers.txt"
questions = read_questions(questions_file_path) # load the questions, one per line
write_answers(model, tokenizer, questions, output_file_path) # generate an answer per question and save the Q/A pairs in answers.txt

In [None]:
# # model2_path = "/content/drive/MyDrive/ColabNotebooks/models/chat_models/custom_q_and_a"
# sequence2 = "Give me ideas for some new hobbies "
# max_len = 50
# generate_text(model_path, sequence2, max_len)