In [1]:
# Import Libraries

import hashlib
import json
import os
import pickle

import torch
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers.utils import is_flash_attn_2_available

In [2]:
# Read the course FAQ file and flatten it into a single list of documents

# JSON is UTF-8 by specification; pin the encoding so the non-ASCII
# punctuation in the FAQ text decodes identically on every platform
# instead of depending on the locale's default encoding.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    doc_file = json.load(f_in)

documents = []

for course in doc_file:
    course_name = course['course']

    for doc in course['documents']:
        # Tag each record (in place) with the course it belongs to
        doc['course'] = course_name
        documents.append(doc)

documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
# Generate ids using the contents of the documents

def generate_document_id(doc):
    """Return a short, deterministic id derived from a document's content.

    The id is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>".
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

In [4]:
# Stamp every document (in place) with its content-derived id
for doc in documents:
    doc.update(id=generate_document_id(doc))

documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [5]:
# Persist the id-augmented documents as JSON (1-space indent)

with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=1))

#### **Text generation**

In [6]:

# Prompt sent to the model for each FAQ record. The {section}, {question} and
# {text} placeholders are filled from a document dict via str.format (see
# generate_questions); .strip() removes the leading and trailing newline that
# the triple-quoted layout introduces.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [7]:
# Put PyTorch's global RNG in a known state before any sampling-based generation
RANDOM_SEED = 0
torch.random.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fce2e250810>

In [8]:
# Initialize model

# Single source of truth for the checkpoint so model and tokenizer cannot drift apart
MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"

# FlashAttention-2 is an optional dependency: use it when it is available and
# fall back to the eager attention implementation otherwise, instead of
# failing at load time on machines without flash-attn.
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "eager"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    attn_implementation=attn_implementation,
    device_map="cuda",
    torch_dtype="auto",
    # NOTE(review): trust_remote_code=True allows modeling code shipped with
    # the checkpoint to run locally; keep it only for sources you trust.
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Wrap the loaded model and tokenizer in a text-generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [10]:
def generate_questions(doc, max_new_tokens=200, return_full_text=True):
    """Generate student-style questions for one FAQ record.

    Args:
        doc: Mapping providing at least the ``section``, ``question`` and
            ``text`` keys referenced by ``prompt_template``. Extra keys are
            ignored by ``str.format``.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 200, the value that used to be hard-coded; raise it
            if completions get cut off before the JSON list is closed.
        return_full_text: Forwarded to the pipeline. With the default
            ``True`` the returned string is the prompt followed by the
            completion (the behaviour this function has always had). Pass
            ``False`` to receive only the completion, which is easier to
            parse as JSON.

    Returns:
        The ``generated_text`` string of the first returned sequence.
    """
    # Fill the prompt placeholders from the document fields
    prompt = prompt_template.format(**doc)

    # Sample a single sequence; the sampling settings are unchanged
    response = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        return_full_text=return_full_text,
    )

    return response[0]['generated_text']

In [11]:
# Function to save results to a file

def save_results(results, filename='results.json'):
    """Write ``results`` to ``filename`` as JSON without risking the old file.

    The data is first serialized to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``. If the write is interrupted or serialization
    fails, the previous checkpoint stays intact, so a later run can still
    resume from it.

    Args:
        results: JSON-serializable object to store.
        filename: Destination path. Defaults to ``'results.json'``.

    Raises:
        TypeError: If ``results`` contains values ``json`` cannot serialize.
        OSError: If the file cannot be written or replaced.
    """
    tmp_filename = f'{filename}.tmp'
    try:
        with open(tmp_filename, 'w') as f_out:
            json.dump(results, f_out, indent=2)
        # Swap the finished file into place in a single step
        os.replace(tmp_filename, filename)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial temp file and re-raise
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

In [12]:
# Resume from earlier progress when a results file exists, otherwise start empty
try:
    with open('results.json', 'r') as f_in:
        results = json.load(f_in)
except FileNotFoundError:
    results = {}

# Generate questions only for documents that have no stored result yet,
# checkpointing to disk after each newly processed document
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions = generate_questions(doc)
        results[doc_id] = questions
        save_results(results)

# Save the final results
save_results(results)

  0%|          | 0/948 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [13]:
# Inspect the raw text stored for one document id; each entry is the string
# returned by generate_questions for that document.
results['06014eec']

'You emulate a student who\'s taking our course.\nFormulate 5 questions this student might ask based on a FAQ record. The record\nshould contain the answer to the questions, and the questions should be complete and not too short.\nIf possible, use as fewer words as possible from the record. \n\nThe record:\n\nsection: Module 5: pyspark\nquestion: PicklingError: Could not serialize object: IndexError: tuple index out of range\nanswer: This version combination worked for me:\nPySpark = 3.3.2\nPandas = 1.5.3\n\nIf it still has an error,\n\nProvide the output in parsable JSON without using code blocks:\n\n["question1", "question2", ..., "question5"]\n\n\n# Answer\n\n[\n  "I\'m getting a PicklingError related to a tuple index out of range when working with PySpark. Can you suggest which versions of PySpark and Pandas might resolve this issue?",\n  "During my PySpark module work, I encountered a PicklingError with an IndexError regarding a tuple. What version combinations of PySpark and Pand