In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U transformers bitsandbytes accelerate

Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes, accelerate, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed accelerate-0.27.2 bitsandbytes-0.42.0 transformers-4.38.1


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", load_in_4bit=True)

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



<pad> Wie alt sind Sie?</s>


In [None]:
def read_context_from_txt(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        context = file.read().replace('\n', ' ')
    return context

#making every line into chunk
def read_context_from_txt_line(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        contexts = [line.strip() for line in file.readlines()]
    return contexts

#divide context into chunks
def divide_context_into_chunks(context, chunk_size):
    chunks = []
    for i in range(0, len(context), chunk_size):
        chunks.append(context[i:i+chunk_size])
    return chunks

# Example .txt file path
txt_file_path = "/content/drive/MyDrive/NLP Data/twentyfive_great_things.txt"

# Read context from .txt file
context = read_context_from_txt(txt_file_path)
chunk_size = 5000  # Adjust the chunk size as per your requirement
context_chunks = divide_context_into_chunks(context, chunk_size)
line_context = read_context_from_txt_line(txt_file_path)

print(line_context)
print(context)
print(context_chunks[0])

In [3]:
import itertools
import json


def paragraphs_streaming_minLength(fileobj, min_length=2000):
    """Yield paragraphs from a file with a minimum length using a state machine approach."""
    IN_PARAGRAPH, BETWEEN_PARAGRAPHS = range(2)
    state = BETWEEN_PARAGRAPHS
    paragraph = []
    buffer = ""  # Initialize a buffer to accumulate paragraphs to meet the minimum length requirement

    for line in fileobj:
        if line.strip() == "":
            if state == IN_PARAGRAPH:
                # Add the current paragraph to the buffer and check its length
                buffer += ''.join(paragraph)
                paragraph = []  # Reset current paragraph
                if len(buffer) >= min_length:
                    # If the buffer meets the minimum length, yield it and reset the buffer
                    yield buffer
                    buffer = ""
            state = BETWEEN_PARAGRAPHS
        else:
            paragraph.append(line)
            state = IN_PARAGRAPH

    # After the loop, add any remaining content to the buffer
    if paragraph:
        buffer += ''.join(paragraph)

    # If the final buffer meets the minimum length requirement, yield it
    if len(buffer) >= min_length:
        yield buffer
    elif buffer:  # If there's content in the buffer but it doesn't meet the minimum length, still yield it
        yield buffer

# Example usage
paragraph_lists = []
with open('../Scraped data/MCDS Handbook 23-24 AY.txt', 'r') as f:
    for para in paragraphs_streaming_minLength(f):
        paragraph_lists.append(para)

with open("/home/ubuntu/wenjinf/test.json",'w') as f:
    json.dump(paragraph_lists,f)
print(paragraph_lists)

['Language Technologies Institute / School of Computer Science \nGraduate Student Handbook \nAcademic Year 2023-2024  \nMaster of Computational Data Science Program \nLast revision date: July 20, 2023 \nThe information contained in this graduate handbook template focuses on the \nresources and locations available at the Carnegie Mellon Pittsburgh Campus. \n1\nTable of Contents \n1 Welcome .................................................................................................................. 6 \n1.1 The MCDS Degree ..................................................................................................... 6 \n1.2 Vision .......................................................................................................................... 7 \n1.3 Mission ....................................................................................................................... 7 \n1.4 MCDS Contact Information.............................................................

In [None]:
def generate_question(context):

    input_text = "Extract 10 question from this passage, as well as their corresponding answer. The answer must be contained entirely within the above text. Context: " + context
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to('cuda')

    output = model.generate(input_ids, num_beams=5, num_beam_groups=5, max_new_tokens=200, diversity_penalty=1.0)
    question = tokenizer.decode(output[0])
    #print(tokenizer.decode(outputs[0]))
    del input_ids

    return question

def generate_answers(question, context):

    input_text = "Question: " + question + "Context: " + context
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to('cuda')

    output = model.generate(input_ids, num_beams=7, num_beam_groups=7, max_new_tokens=300, diversity_penalty=1.0)
    answer = tokenizer.decode(output[0])
    del input_ids

    return answer

In [None]:
#question = generate_question(context)
#answer = generate_answers(question, context)

# 25 great things

output_file_path = "questions_and_answers.txt"
with open(output_file_path, "w", encoding="utf-8") as output_file:
    for context in line_context:
        question = generate_question(context)
        answer = generate_answers(question, context)
        output_file.write("Question: " + question + "\n")
        output_file.write("Answer: " + answer + "\n")
        output_file.write("\n")

In [None]:
# buggy text
txt_file_path = "/content/drive/MyDrive/NLP Data/buggy.txt"

# Read context from .txt file
context = read_context_from_txt(txt_file_path)
chunk_size = 250  # Adjust the chunk size as per your requirement
context_chunks = divide_context_into_chunks(context, chunk_size)


output_file_path = "buggy.txt"
with open(output_file_path, "w", encoding="utf-8") as output_file:
  for i in range(len(context_chunks)):
    question = generate_question(context_chunks[i])
    answer = generate_answers(question, context_chunks[i])
    output_file.write("Question: " + question + "\n")
    output_file.write("Answer: " + answer + "\n")
    output_file.write("\n")



In [None]:
#commencement.txt
txt_file_path = "/content/drive/MyDrive/NLP Data/commencement.txt"

# Read context from .txt file
context = read_context_from_txt(txt_file_path)
chunk_size = 260  # Adjust the chunk size as per your requirement
context_chunks = divide_context_into_chunks(context, chunk_size)


output_file_path = "commencement.txt"
with open(output_file_path, "w", encoding="utf-8") as output_file:
  for i in range(len(context_chunks)):
    question = generate_question(context_chunks[i])
    answer = generate_answers(question, context_chunks[i])
    output_file.write("Question: " + question + "\n")
    output_file.write("Answer: " + answer + "\n")
    output_file.write("\n")

In [None]:


#history.txt
txt_file_path = "/content/drive/MyDrive/NLP Data/history_2.txt"

# Read context from .txt file
context = read_context_from_txt(txt_file_path)
chunk_size = 500  # Adjust the chunk size as per your requirement
context_chunks = divide_context_into_chunks(context, chunk_size)


output_file_path = "history.txt"
with open(output_file_path, "w", encoding="utf-8") as output_file:
  for i in range(len(context_chunks)):
    question = generate_question(context_chunks[i])
    answer = generate_answers(question, context_chunks[i])
    output_file.write("Question: " + question + "\n")
    output_file.write("Answer: " + answer + "\n")
    output_file.write("\n")