In [55]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM
from vllm import LLM, SamplingParams
from datasets import load_dataset, concatenate_datasets, Dataset
from openai import OpenAI

### Installing Vllm

- Clone vllm repo
- Checkout PR #15008
- Build from source
    ```bash
    VLLM_USE_PRECOMPILED=1 pip install --editable .
    ```

In [6]:
endpoint = f"http://localhost:8000/v1"
openai_api_key = "EMPTY"
openai_api_base = endpoint

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
teacher_model = client.models.list().data[0].id

In [7]:
tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3_3-Nemotron-Super-49B-v1")

### Chunk the Quality aritcles using text splitter

In [104]:
from datasets import Dataset, concatenate_datasets
from instructlab.sdg.utils.chunking import chunk_document

In [108]:
# ds = load_dataset("zitongyang/entigraph-quality-corpus", split='train').remove_columns(['entity', 'entigraph']).rename_columns({'raw': 'document', 'uid': 'document_outline'})
# ds = ds.add_column('domain', ["Articles"]*ds.num_rows)
# chunked_ds = []
# for e in ds:
#     document_chunks = chunk_document(e['document'], server_ctx_size=4096, chunk_word_count=400)
#     for chunk in document_chunks:
#         chunked_ds.append(e.copy())
#         chunked_ds[-1]['document'] = chunk
# chunked_ds = Dataset.from_list(chunked_ds)
# chunked_ds.to_json(f"quality_documents_chunked_at_400_words.jsonl", orient='records', lines=True)


# ds = ds.shuffle(seed=124).select(range(5))
# chunked_ds = []
# for e in ds:
#     document_chunks = chunk_document(e['document'], server_ctx_size=4096, chunk_word_count=100)
#     for chunk in document_chunks:
#         chunked_ds.append(e.copy())
#         chunked_ds[-1]['document'] = chunk
# chunked_ds = Dataset.from_list(chunked_ds)
# chunked_ds.to_json(f"quality_documents_chunked_at_100_words.jsonl", orient='records', lines=True)

Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 85.30ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 600.04ba/s]


150664

### Construct ICL examples

In [9]:
def get_response_from_nemotron(msg):
    response = client.chat.completions.create(
        model=teacher_model,
        messages=msg,
        temperature=0.6,
        top_p=0.95,
        max_tokens=4096,
    )
    return response.choices[0].message.content

In [None]:
ds = load_dataset("zitongyang/entigraph-quality-corpus", split='train').remove_columns(['entity', 'entigraph']).rename_columns({'raw': 'document', 'uid': 'document_outline'})

In [None]:
# chunk_idx = 50
prompt = f"Give me a short 3-4 line document and 3 detailed GRE-style thematic questions and answers based and grounded on the document."
prompt = f"Give me a short 8-9 line passage that has 2 paragraphs. Then, provide 3 detailed open-ended questions and their answers. Each question should require synthesizing or connecting ideas, entities, or concepts that appear in different parts of the passage"
msg = [{"role": "system", "content": f"detailed thinking on"}, {'role': 'user', 'content': prompt}]
generated_qa_pairs = get_response_from_nemotron(msg)
msg = msg + [{"role": "assistant", "content": generated_qa_pairs}] + [{"role": "user", "content": "Verify the generated question and answer pairs are correct, complete and grounded in the document. Fix all the issues in the answer if any."}]
verified_qa_pairs = get_response_from_nemotron(msg)
msg = msg + [{"role": "assistant", "content": verified_qa_pairs}] + [{"role": "user", "content": "Can you parse the generated question and answer pairs into a json format?"}]
json_verified_qa_pairs = get_response_from_nemotron(msg)
print(generated_qa_pairs)
print("-"*3)
print(verified_qa_pairs)
print("-"*3)
print(json_verified_qa_pairs)

<think>
Okay, let's tackle this user's request. They want a short 8-9 line passage divided into two paragraphs, followed by three open-ended questions that require synthesizing ideas from different parts of the passage.

First, I need to create the passage. It should be concise, two paragraphs, around 8-9 lines total. The topic should allow for connections between paragraphs. Maybe something about environment and technology? That's a common theme with clear links.

Paragraph one could introduce a problem, like pollution, and paragraph two a solution, like eco-friendly tech. That way, the questions can connect the problem to the solution. Let me draft that.

Now, the questions. Each needs to require synthesis. So, not just recalling facts but linking ideas. For example, how does the technology address the specific problem mentioned? Or what values are implied by the solution? Also, maybe a question about potential consequences that combine elements from both paragraphs.

Wait, the user said "detailed open-ended questions" with answers. The answers should explain the thought process and the synthesis. Need to make sure the questions aren't answerable with a single sentence but require combining information.

Let me check the passage again for clarity and that the connections are there but not too obvious. Then, craft the questions to ensure they span both paragraphs. Also, ensure the answers demonstrate how the synthesis works, showing how different parts of the passage are connected.

Okay, let's put it all together. Make sure the passage is exactly two paragraphs with the right line count. Then, three questions that each link different elements. Review the answers to confirm they require combining ideas from both parts.
</think>

**Passage**  
The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.  

Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy.

---

**Questions & Answers**  

1. **Question**: How does the technological solution described in the second paragraph address the economic *and* environmental challenges highlighted in the first?  
   **Answer**: The AI-powered buoys tackle environmental degradation by absorbing spills and collecting plastics, which could revive marine life and attract tourists/fishermen, thereby rejuvenating the local economy. The public data-sharing aspect also holds corporations accountable, potentially reducing future pollution and creating a sustainable cycle.  

2. **Question**: What implicit values or priorities do the community’s actions (clean-up initiatives) and the technologists’ project reflect, and how do these align or contrast?  
   **Answer**: Both reflect a prioritization of environmental stewardship and community engagement. The clean-ups emphasize collective responsibility, while the buoy project highlights innovation and systemic accountability (targeting corporations). They align in seeking solutions but contrast in scale and approach—manual vs. technological, local vs. systemic.  

3. **Question**: Imagine the buoy project succeeds. What unintended consequences might arise from its impact, considering both paragraphs’ themes?  
   **Answer**: Success could lead to "rebound effects," where reduced pollution encourages increased industrial activity near Willow Creek. Additionally, reliance on the technology might diminish community clean-up efforts, risking a loss of collective ownership. Economically, a tourism boom could strain the town’s infrastructure or alter its character, highlighting the need for balanced growth strategies.
---
<think>
Okay, let me start by reviewing the passage and the question-answer pairs provided. The user wants me to verify if the Q&A pairs are correct, complete, and grounded in the document, then fix any issues.

First, I'll read through the passage again to make sure I understand the context. The first paragraph talks about pollution in Willow Creek affecting the economy and environment. The second introduces an AI buoy system developed by technologists to combat pollution.

Looking at the first question: It asks how the buoy system addresses both economic and environmental challenges. The answer mentions absorbing spills, collecting plastics, reviving marine life, and rejuvenating the economy. It also talks about holding corporations accountable. Checking the passage, the second paragraph does say the buoys absorb oil spills and collect microplastics, which would help the environment. The economic part is implied through revitalizing the ecosystem and economy. The public data pressuring corporations is in the passage. So the answer seems correct here.

Second question: About the implicit values of community vs. technologists' actions. The answer says both prioritize environmental stewardship and community engagement. The clean-ups show collective responsibility, while the buoy project uses innovation and targets corporations. The passage mentions weekly clean-ups by residents (community) and the technologists' project. The alignment and contrast in approach (manual vs. tech, local vs. systemic) are valid. The answer seems grounded.

Third question: Unintended consequences if the buoy project succeeds. The answer mentions rebound effects (more industry), diminished community efforts, and strain on infrastructure. The passage doesn't explicitly mention these, but they're plausible inferences. However, the passage doesn't state that the project reduces pollution enough to invite more industry, so maybe that's an assumption. Also, the passage says the project "has sparked hope," implying it's not yet succeeded, so the answer's scenarios are hypothetical but reasonable. However, the answer should strictly base on the document. The passage doesn't mention infrastructure or character changes, so those parts might be unfounded.

Now, checking for completeness and accuracy. The first answer is good. The second answer is correct. The third answer has some points not in the document. For example, "rebound effects" and "strain on infrastructure" aren't mentioned. The document only talks about the current decline in tourism and fishing. So the answer introduces elements not present. The part about community efforts diminishing is possible, as reliance on tech might reduce manual clean-ups, which is a fair point but not explicitly stated.

Issues to fix: In Q3's answer, remove unfounded assumptions. Instead of talking about increased industry or infrastructure strain, focus on what's in the passage. Maybe mention that if the buoys work, people might rely less on clean-ups, or the economic boost could have unspecified effects, but stay within the document's themes.

Also, check if all questions require synthesizing ideas from both paragraphs. Q1 does. Q2 compares community and technologist actions, which are in different paragraphs. Q3 speculates based on both paragraphs' themes.

So, the main issue is in Q3's answer adding external assumptions. Need to revise that answer to stick closer to the passage's content.
</think>

**Verified & Revised Question-Answer Pairs**  

---

### **Passage**  
The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.  

Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy.

---

### **Revised Questions & Answers**  

1. **Question**: How does the technological solution described in the second paragraph address the economic *and* environmental challenges highlighted in the first?  
   **Answer (Revised for Clarity)**: The buoys directly mitigate environmental harm by absorbing spills and collecting plastics, which could restore marine life. Economically, this restoration might revive tourism and fishing, reversing their decline. Public data-sharing also pressures corporations, potentially reducing pollution and creating a sustainable economic cycle aligned with environmental health.  

   *Issue Fixed*: Added explicit link to "restoring marine life" (from the passage) to clarify the economic revival mechanism.  

2. **Question**: What implicit values or priorities do the community’s actions (clean-up initiatives) and the technologists’ project reflect, and how do these align or contrast?  
   **Answer (No Changes)**: Both reflect a prioritization of environmental stewardship and community engagement. The clean-ups emphasize collective responsibility, while the buoy project highlights innovation and systemic accountability (targeting corporations). They align in seeking solutions but contrast in scale and approach—manual vs. technological, local vs. systemic.  

   *Verification*: Correct and fully grounded in the passage.  

3. **Question**: Imagine the buoy project succeeds. What unintended consequences might arise from its impact, considering both paragraphs’ themes?  
   **Answer (Revised for Grounding)**: Success could lead to reduced community participation in clean-up efforts if residents perceive the technology as a sufficient solution. Economically, revitalization might exacerbate existing resource strains (e.g., overwhelming local infrastructure) or create dependency on the buoy system, undermining long-term sustainability if the technology falters.  

   *Issues Fixed*:  
   - Removed speculative "rebound effects" (not implied in the passage).  
   - Focused on consequences *explicitly tied to the passage*: community engagement ("overwhelmed efforts" implies capacity limits) and economic dependency ("costly" project suggests vulnerability if it fails).  

---

### **Summary of Changes**  
1. **Q1**: Clarified the causal chain from environmental fixes to economic revival.  
2. **Q2**: No changes (sound).  
3. **Q3**:  
   - Removed unfounded assumptions (e.g., "increased industry," "tourism straining character").  
   - Anchored consequences in passage details: community capacity, economic dependency, and the project’s cost.  

**All answers are now tightly grounded in the passage**.
---
<think>
Okay, let's tackle this. The user wants the question and answer pairs parsed into JSON format. First, I need to recall the previous conversation. The user had a passage with two paragraphs, then three questions with answers that were verified and revised. Now, they want that in JSON.

Looking at the previous answer, the Revised Questions & Answers section has three entries. Each has a Question and an Answer. The structure is straightforward: each pair is a question and its corresponding answer. 

I should start by creating a JSON array where each element represents a question-answer pair. Each element would have "question" and "answer" keys. The text for each should be taken directly from the revised versions. 

Wait, the user mentioned "parse the generated question and answer pairs". So I need to make sure I'm using the final revised answers, not the original ones. In the verified response, Q1 and Q3 were revised, Q2 remained the same. So I need to copy the revised answers accurately.

Checking the content:

1. Q1's revised answer adds a link to restoring marine life.
2. Q2's answer is unchanged.
3. Q3's revised answer focuses on community participation and economic dependency.

Now, formatting in JSON. Each entry in the array will be an object with "question" and "answer". The text should be enclosed in double quotes. Also, need to ensure any special characters are handled, but in this case, there aren't any. 

Let me structure it step by step. Start with the outer array. Then each object inside. Make sure commas are correctly placed between objects and keys. 

Wait, the user might also want the passage included in the JSON? The instruction says "parse the generated question and answer pairs", so probably just the Q&A, not the passage. But to be safe, maybe check. The original passage is separate, but the user's current request is about the pairs. So focus on the three Q&A pairs.

Double-checking the revised answers for accuracy. For Q3, the revised answer mentions "reduced community participation" and "economically, revitalization might exacerbate existing resource strains" and "dependency on the buoy system". That's correctly reflected.

Now, putting it all together in JSON format. Make sure the JSON is valid—proper brackets, commas, quotes. Use a JSON validator if possible, but since I'm generating manually, careful placement is key.

Final check: three entries, each with question and answer from the revised section. Yes. That should meet the user's request.
</think>

```json
{
  "passage": {
    "paragraphs": [
      "The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.",
      "Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy."
    ]
  },
  "questions_and_answers": [
    {
      "question": "How does the technological solution described in the second paragraph address the economic *and* environmental challenges highlighted in the first?",
      "answer": "The buoys directly mitigate environmental harm by absorbing spills and collecting plastics, which could restore marine life. Economically, this restoration might revive tourism and fishing, reversing their decline. Public data-sharing also pressures corporations, potentially reducing pollution and creating a sustainable economic cycle aligned with environmental health."
    },
    {
      "question": "What implicit values or priorities do the community’s actions (clean-up initiatives) and the technologists’ project reflect, and how do these align or contrast?",
      "answer": "Both reflect a prioritization of environmental stewardship and community engagement. The clean-ups emphasize collective responsibility, while the buoy project highlights innovation and systemic accountability (targeting corporations). They align in seeking solutions but contrast in scale and approach—manual vs. technological, local vs. systemic."
    },
    {
      "question": "Imagine the buoy project succeeds. What unintended consequences might arise from its impact, considering both paragraphs’ themes?",
      "answer": "Success could lead to reduced community participation in clean-up efforts if residents perceive the technology as a sufficient solution. Economically, revitalization might exacerbate existing resource strains (e.g., overwhelming local infrastructure) or create dependency on the buoy system, undermining long-term sustainability if the technology falters."
    }
  ]
}
```

<think>
Okay, the user wants a short 3-4 line document and three detailed GRE-style thematic questions with answers based on it. Let me start by crafting the document. It needs to be concise but rich enough for questions. Maybe a philosophical or historical topic since those lend themselves to thematic analysis.

I'll go with a quote from Nietzsche about morality. That's dense and can lead to questions on interpretation, implications, and comparisons. Now, the document should be 3-4 lines. Let me check the length. The Nietzsche passage I have in mind is four sentences. Perfect.

Next, the questions. GRE thematic questions often ask for interpretation, analysis of assumptions, and application to another context. First question: Identify the main thesis and explain. Second: Analyze the assumption underlying the claim. Third: Apply the idea to a different field, like science.

For the answers, they need to be detailed but not too long. Make sure each answer addresses all parts of the question. Check that the questions are indeed GRE-style—focusing on themes, structure, and application. Ensure the answers ground their points in the document. Avoid jargon, keep it clear. Let me review each part to ensure alignment with the user's request. Document length, question types, answer structure all seem to fit. Okay, that should work.
</think>

**Document (4 lines)**  
Nietzsche argued that traditional morality is a "slave morality" created by the weak to subjugate the strong, claiming, "The noble type of man experiences himself as the goal and measure of things; he does not require approval or justification from outsiders." In contrast, slave morality, rooted in resentment, posits equality and humility as virtues to constrain natural superiority. Nietzsche saw this as a perversion of life’s true will to power. Only the Übermensch could transcend such moral dichotomies.

---

**Thematic Questions & Answers**

### **1. Interpretation of Thesis**  
**Question:** Identify the central thesis of the passage and explain how the contrast between "noble" and "slave" morality supports Nietzsche’s broader philosophical claim.  
**Answer:**  
The thesis is that traditional morality is a subversive construct of the weak ("slave morality") to undermine natural hierarchy, contrasting with the self-affirming "noble morality" of the strong. The passage supports this by framing noble morality as intrinsic (e.g., "the goal and measure of things") and slave morality as reactive ("rooted in resentment"). This dichotomy underscores Nietzsche’s claim that conventional ethics are not objective but a political tool to dilute the "will to power" of exceptional individuals.

### **2. Assumption Analysis**  
**Question:** What unexamined assumption underlies Nietzsche’s critique of slave morality as a "perversion of life"? Analyze the implications if this assumption is challenged.  
**Answer:**  
The assumption is that "life’s true will to power" is a universal, instinctual drive that should not be morally constrained. If challenged (e.g., if cooperation, not domination, is life’s primary drive), Nietzsche’s critique collapses, as slave morality’s emphasis on equality could be seen as adaptive rather than perverse. This would invert his hierarchy, legitimizing collective ethics over individual supremacy.

### **3. Application to Another Context**  
**Question:** Apply Nietzsche’s framework to scientific progress. How might a "noble" vs. "slave" dynamic manifest in the pursuit of knowledge?  
**Answer:**  
A "noble" scientific ethos might prioritize unencumbered innovation (e.g., unfunded maverick researchers pursuing risky, visionary projects). Conversely, "slave morality" could manifest in bureaucratic, consensus-driven science (e.g., funding prioritizing collaborative, "socially responsible" projects over disruptive ideas). Nietzsche would argue the latter stifles genius, while defenders might see it as ensuring ethical accountability—a parallel to his moral critique.
---
<think>
Okay, let me try to work through this. The user wants me to verify if the question and answer pairs generated earlier are correct, complete, and grounded in the document provided. First, I need to look at each question and answer carefully.

Starting with the first question: Interpretation of Thesis. The question asks to identify the central thesis and explain how the contrast between noble and slave morality supports Nietzsche’s claim. The answer given states the thesis correctly as traditional morality being a slave morality created by the weak. It also explains the contrast by saying noble morality is intrinsic and slave is reactive. The document does mention "noble type of man" experiencing himself as the goal and slave morality rooted in resentment. So the answer seems accurate here. The support from the document is there, so this pair is correct.

Next, the second question: Assumption Analysis. The question asks for an unexamined assumption in Nietzsche’s critique. The answer points out the assumption that the will to power is a universal drive that shouldn’t be constrained. The document mentions "life’s true will to power" as being perverted by slave morality. If this assumption is challenged, the critique would collapse. The implications analysis seems valid. The answer correctly identifies the assumption and discusses the consequences if it's challenged. This pair checks out too.

Third question: Application to Another Context. Applying Nietzsche’s framework to science. The answer differentiates between a noble ethos (unencumbered innovation) and slave (bureaucratic consensus). The document doesn’t mention science, but the framework of noble vs. slave morality is applied conceptually. Nietzsche’s ideas are being extended appropriately here. The answer stays true to his philosophy even though the context is new. So this is correct as well.

Now, checking completeness. Each question has a clear answer that addresses all parts. The answers reference the document specifically, like quoting "goal and measure of things" and "rooted in resentment." They don’t introduce information not in the document. The assumptions are correctly identified based on the text. 

Grounded in the document: All key points in the answers are backed by the document. For example, the Übermensch is mentioned in the document and used in the first answer to support transcending moral dichotomies. The will to power is directly from the document. 

Are there any inaccuracies? Let me re-read the document. The document says slave morality posits equality and humility to constrain natural superiority. The answers reflect that. The second answer’s assumption about will to power being universal is in the document as "life’s true will to power." The third question's application doesn't contradict the original ideas. 

I think all pairs are correct. The user might have been concerned if the answers added external info or misinterpreted the text, but they seem to align closely. Maybe the third question's answer could have a stronger link, but since it's an application, it's acceptable as long as the framework is applied correctly, which it is.
</think>

**Verification of Question & Answer Pairs**

All three question-answer pairs are **correct, complete, and grounded in the document**. Below is a detailed breakdown:

---

### **1. Interpretation of Thesis**  
- **Correctness**: ✅ The central thesis is accurately identified, and the contrast between "noble" and "slave" morality is explained using direct evidence from the document (e.g., "goal and measure of things," "rooted in resentment").  
- **Completeness**: ✅ Fully addresses both parts of the question (thesis + supporting contrast).  
- **Grounding**: ✅ Quotes and concepts (e.g., "intrinsic" vs. "reactive") align with the passage.  

---

### **2. Assumption Analysis**  
- **Correctness**: ✅ The assumption about "will to power" is correctly inferred from the phrase "perversion of life’s true will to power."  
- **Completeness**: ✅ Clearly states the assumption and analyzes implications if challenged.  
- **Grounding**: ✅ Directly tied to the document’s language about "life’s true will to power" and its contrast with slave morality.  

---

### **3. Application to Another Context**  
- **Correctness**: ✅ Faithfully applies Nietzsche’s framework to science without distorting his ideas.  
- **Completeness**: ✅ Distinguishes "noble" (unfettered innovation) and "slave" (bureaucratic) dynamics, engaging with potential counterarguments.  
- **Grounding**: ✅ While the *context* (science) is new, the moral dichotomy ("unencumbered pursuit" vs. "constrained collective ethics") is rooted in the document’s themes.  

---

### **Overall Strengths**  
1. **Document Alignment**: Answers cite specific phrases (e.g., "goal and measure of things," "rooted in resentment") to support claims.  
2. **Conceptual Accuracy**: No misinterpretation of Nietzsche’s terms (e.g., "slave morality" as reactive, "Übermensch" as transcendent).  
3. **Depth of Analysis**: Questions 2 and 3 require critical thinking (assumption-testing, application) beyond mere recall.  

### **Minor Suggestions for Enhancement (Optional)**  
- For **Question 3**, adding a brief phrase from the document to the answer (e.g., "mirroring how slave morality ‘constrains natural superiority’") could strengthen explicit grounding.  
- For **Question 2**, noting that the assumption also relies on Nietzsche’s value judgment that "domination" is more "natural" than cooperation could enrich the analysis.  

---

**Final Conclusion**: The pairs are well-designed, rigorous, and tightly anchored to the document. They effectively test interpretation, critical analysis, and thematic application—all hallmarks of GRE-style questions.  

Let me know if you’d like revisions or additional pairs!
---
<think>
Okay, the user wants the question and answer pairs parsed into JSON format. Let me start by recalling the previous interaction. They had a document and three GRE-style questions with answers. Then they asked for verification, which I did, and now they want JSON.

First, I need to structure each question and answer properly. JSON requires key-value pairs. Each question should have a "Question" key, "Answer", and maybe "Type" to categorize them. The original questions were labeled as 1, 2, 3, so I can use those as IDs.

Looking at the first question: Interpretation of Thesis. The answer has bullet points for Correctness, Completeness, Grounding in the verification, but the actual answer is the paragraph before that. Wait, no. Wait, in the previous response, the answer for Question 1 was the paragraph starting with "The thesis is...", and then the verification had correctness etc. But the user now wants the original Q&A pairs in JSON, not the verification details.

So I need to go back to the initial Q&A provided. Let me check:

Original Q1:
**1. Interpretation of Thesis**  
**Question:** Identify the central thesis...  
**Answer:** The thesis is... 

Yes. So each entry in JSON should have the question text and the answer text. The verification part was separate, but the user didn't ask for that in JSON, just the pairs.

So for each of the three questions, structure as:

{
  "id": 1,
  "type": "Interpretation of Thesis",
  "question": "Identify the central thesis...",
  "answer": "The thesis is..."
},

And similarly for the others. Also, include the document in JSON if needed, but the user specified "question and answer pairs", so maybe just the Q&A.

Wait, the user said "parse the generated question and answer pairs into a json format". So the document is separate, but the pairs are the three questions with their answers.

So the JSON should be an array of objects, each containing id, type, question, answer.

Checking the original answer:

The three questions are:

1. Interpretation of Thesis
2. Assumption Analysis
3. Application to Another Context

Each has a Question and Answer.

So the JSON would look like:

{
  "questions": [
    {
      "id": 1,
      "type": "Interpretation of Thesis",
      "question": "Identify the central thesis of the passage and explain how the contrast between 'noble' and 'slave' morality supports Nietzsche’s broader philosophical claim.",
      "answer": "The thesis is that traditional morality is a subversive construct of the weak... This dichotomy underscores Nietzsche’s claim..."
    },
    {
      "id": 2,
      "type": "Assumption Analysis",
      "question": "What unexamined assumption underlies Nietzsche’s critique of slave morality as a 'perversion of life'? Analyze the implications if this assumption is challenged.",
      "answer": "The assumption is that 'life’s true will to power' is a universal, instinctual drive... This would invert his hierarchy..."
    },
    {
      "id": 3,
      "type": "Application to Another Context",
      "question": "Apply Nietzsche’s framework to scientific progress. How might a 'noble' vs. 'slave' dynamic manifest in the pursuit of knowledge?",
      "answer": "A 'noble' scientific ethos might prioritize unencumbered innovation... while defenders might see it as ensuring ethical accountability—a parallel to his moral critique."
    }
  ]
}

I need to make sure the quotes are properly escaped in JSON. Also, check if the user wants the document included. The instruction says "question and answer pairs", so probably not. But maybe confirm. The original document was separate, so likely just the Q&A.

Yes, the user's request was specifically about the generated pairs, so the above structure should be correct.

Now, formatting the JSON correctly with escaped quotes and proper syntax. Also, ensuring that each key is properly named. The types are the question types as labeled. The IDs are 1,2,3. Questions and answers are copied exactly from the original.

Double-checking for any markdown formatting in the answers. The original answers had line breaks but in JSON, line breaks can be preserved with \n or just as continuous text. Since the user didn't specify, keeping as continuous is safer.

So the final JSON should have the questions array with three entries as above.
</think>

Here’s the parsed JSON format for the question-answer pairs, including the document for context:

```json
{
  "document": [
    "Nietzsche argued that traditional morality is a \"slave morality\" created by the weak to subjugate the strong, claiming, \"The noble type of man experiences himself as the goal and measure of things; he does not require approval or justification from outsiders.\"",
    "In contrast, slave morality, rooted in resentment, posits equality and humility as virtues to constrain natural superiority.",
    "Nietzsche saw this as a perversion of life’s true will to power.",
    "Only the Übermensch could transcend such moral dichotomies."
  ],
  "questions": [
    {
      "id": 1,
      "type": "Interpretation of Thesis",
      "question": "Identify the central thesis of the passage and explain how the contrast between \"noble\" and \"slave\" morality supports Nietzsche’s broader philosophical claim.",
      "answer": "The thesis is that traditional morality is a subversive construct of the weak (\"slave morality\") to undermine natural hierarchy, contrasting with the self-affirming \"noble morality\" of the strong. The passage supports this by framing noble morality as intrinsic (e.g., \"the goal and measure of things\") and slave morality as reactive (\"rooted in resentment\"). This dichotomy underscores Nietzsche’s claim that conventional ethics are not objective but a political tool to dilute the \"will to power\" of exceptional individuals."
    },
    {
      "id": 2,
      "type": "Assumption Analysis",
      "question": "What unexamined assumption underlies Nietzsche’s critique of slave morality as a \"perversion of life\"? Analyze the implications if this assumption is challenged.",
      "answer": "The assumption is that \"life’s true will to power\" is a universal, instinctual drive that should not be morally constrained. If challenged (e.g., if cooperation, not domination, is life’s primary drive), Nietzsche’s critique collapses, as slave morality’s emphasis on equality could be seen as adaptive rather than perverse. This would invert his hierarchy, legitimizing collective ethics over individual supremacy."
    },
    {
      "id": 3,
      "type": "Application to Another Context",
      "question": "Apply Nietzsche’s framework to scientific progress. How might a \"noble\" vs. \"slave\" dynamic manifest in the pursuit of knowledge?",
      "answer": "A \"noble\" scientific ethos might prioritize unencumbered innovation (e.g., unfunded maverick researchers pursuing risky, visionary projects). Conversely, \"slave morality\" could manifest in bureaucratic, consensus-driven science (e.g., funding prioritizing collaborative, \"socially responsible\" projects over disruptive ideas). Nietzsche would argue the latter stifles genius, while defenders might see it as ensuring ethical accountability—a parallel to his moral critique."
    }
  ]
}
```

### **Key Features of the JSON Structure**
1. **Document**: Included as an array of sentences for clarity.
2. **Questions**: Array with `id`, `type`, `question`, and `answer` fields.
3. **Formatting**: Quotes are properly escaped (`\"`), and line breaks are omitted for compactness (add `\n` if line breaks are preferred).

### **Example Use Case (GRE Study Tool)**
```json
// Extracting a single question for display
{
  "question": questions[0].question,
  "answer":   questions[0].answer
}
```

Let me know if you’d like further refinements (e.g., adding metadata like "difficulty" or "keywords")!


In [17]:
icls = [{
  "icl_document": (
    "Nietzsche argued that traditional morality is a \"slave morality\" created by the weak to subjugate the strong, claiming, \"The noble type of man experiences himself as the goal and measure of things; he does not require approval or justification from outsiders.\"",
    "In contrast, slave morality, rooted in resentment, posits equality and humility as virtues to constrain natural superiority.",
    "Nietzsche saw this as a perversion of life’s true will to power.",
    "Only the Übermensch could transcend such moral dichotomies."
  ),
  "icl_query_1": "Identify the central thesis of the passage and explain how the contrast between \"noble\" and \"slave\" morality supports Nietzsche’s broader philosophical claim.",
  "icl_response_1": "The thesis is that traditional morality is a subversive construct of the weak (\"slave morality\") to undermine natural hierarchy, contrasting with the self-affirming \"noble morality\" of the strong. The passage supports this by framing noble morality as intrinsic (e.g., \"the goal and measure of things\") and slave morality as reactive (\"rooted in resentment\"). This dichotomy underscores Nietzsche’s claim that conventional ethics are not objective but a political tool to dilute the \"will to power\" of exceptional individuals.",
  
  "icl_query_2": "What unexamined assumption underlies Nietzsche’s critique of slave morality as a \"perversion of life\"? Analyze the implications if this assumption is challenged.",
  "icl_response_2": "The assumption is that \"life’s true will to power\" is a universal, instinctual drive that should not be morally constrained. If challenged (e.g., if cooperation, not domination, is life’s primary drive), Nietzsche’s critique collapses, as slave morality’s emphasis on equality could be seen as adaptive rather than perverse. This would invert his hierarchy, legitimizing collective ethics over individual supremacy.",

  "icl_query_3": "Apply Nietzsche’s framework to scientific progress. How might a \"noble\" vs. \"slave\" dynamic manifest in the pursuit of knowledge?",
  "icl_response_3": "A \"noble\" scientific ethos might prioritize unencumbered innovation (e.g., unfunded maverick researchers pursuing risky, visionary projects). Conversely, \"slave morality\" could manifest in bureaucratic, consensus-driven science (e.g., funding prioritizing collaborative, \"socially responsible\" projects over disruptive ideas). Nietzsche would argue the latter stifles genius, while defenders might see it as ensuring ethical accountability—a parallel to his moral critique."
},
{
    "icl_document": (
      "The coastal town of Willow Creek, once renowned for its pristine beaches, now struggles with rampant pollution. Plastic debris and oil spills have devastated marine life, prompting a decline in tourism and fishing industries. Residents have organized weekly clean-up initiatives, but the scale of the problem overwhelms their efforts.",
      "Technologists at the local university have developed an AI-powered buoy system to combat this. The buoys, equipped with solar panels and filtration technology, can identify and absorb oil spills while collecting microplastics. Data from the buoys is shared publicly, raising awareness and pressuring corporations to adopt sustainable practices. Though costly, the project has sparked hope for revitalizing the ecosystem and economy."
    ),
    "icl_query_1": "How does the technological solution address the economic *and* environmental challenges highlighted in the document?",
    "icl_response_1": "The buoys directly mitigate environmental harm by absorbing spills and collecting plastics, which could restore marine life. Economically, this restoration might revive tourism and fishing, reversing their decline. Public data-sharing also pressures corporations, potentially reducing pollution and creating a sustainable economic cycle aligned with environmental health.",
    
    "icl_query_2": "What implicit values or priorities do the community’s actions (clean-up initiatives) and the technologists’ project reflect, and how do these align or contrast?",
    "icl_response_2": "Both reflect a prioritization of environmental stewardship and community engagement. The clean-ups emphasize collective responsibility, while the buoy project highlights innovation and systemic accountability (targeting corporations). They align in seeking solutions but contrast in scale and approach—manual vs. technological, local vs. systemic.",
    
    "icl_query_3": "Imagine the buoy project succeeds. What unintended consequences might arise from its impact, considering document's themes?",
    "icl_response_3": "Success could lead to reduced community participation in clean-up efforts if residents perceive the technology as a sufficient solution. Economically, revitalization might exacerbate existing resource strains (e.g., overwhelming local infrastructure) or create dependency on the buoy system, undermining long-term sustainability if the technology falters."

}
]


In [24]:
ds = load_dataset("zitongyang/entigraph-quality-corpus", split='train').remove_columns(['entity', 'entigraph']).rename_columns({'raw': 'document', 'uid': 'document_outline'})
ds = [ds.map(lambda x: icl) for icl in icls]
ds = concatenate_datasets(ds)
ds = ds.add_column('domain', ['Article']*len(ds))

In [26]:
ds.to_json("seed_data.jsonl", orient='records', lines=True)

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 10.89ba/s]


14739375

### Start Teacher Model
```shell
export HUGGINGFACE_HUB_CACHE="/new_data/hf_cache"   
export HF_DATASETS_CACHE="/dev/shm/hf"
export HF_HOME="/new_data/hf_cache"
export HF_MODEL_CACHE="/new_data/hf_cache"

port=8000
for i in {0..7}; do
    CUDA_VISIBLE_DEVICES=$i python -m vllm.entrypoints.openai.api_server \
        --model microsoft/phi-4 \
        --dtype float16 \
        --tensor-parallel-size 1 \
        --port $port \
        --trust-remote-code > run_phi4_teacher_model_$i.log 2>&1 &
    port=$((port + 1))
done
```

### Run SDG in parallel
```shell
# Get dataset size and save into variable
dataset_size=$(wc -l /new_data/knowledge_rh/quality/base_datasets/quality_seed_example_chunked.jsonl | awk '{print $1}')
number_of_processes=8
port=8000
for i in {0..7}; do
    dataset_start_index=$((i * dataset_size / number_of_processes))
    dataset_end_index=$((dataset_start_index + dataset_size / number_of_processes))
    python scripts/generate.py --ds_path  /new_data/knowledge_rh/quality/base_datasets/quality_seed_example_chunked.jsonl \
        --bs 2 --num_workers 30 \
        --save_path /new_data/knowledge_rh/quality/knowledge1.25_extractive_phi4/gen.jsonl \
        --flow /home/lab/abhi/SDG-Research-Upstream/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5_phi4.yaml \
        --endpoint http://localhost:$port/v1 \
        --checkpoint_dir /new_data/knowledge_rh/quality/knowledge1.25_extractive_phi4/data_checkpoints \
        --save_freq 1000 \
        --dataset_start_index $dataset_start_index \
        --dataset_end_index $dataset_end_index > run_sdg_$i.log 2>&1 &
    echo "Starting process $i with dataset from $dataset_start_index to $dataset_end_index on port $port"
    port=$((port + 1))
done
```

In [146]:
from instructlab.sdg.flow import Flow
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.sdg import SDG
from instructlab.sdg.prompts import PromptRegistry
from instructlab.sdg.blocks import BlockRegistry, Block
from transformers import AutoTokenizer
import re
from typing import List

tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3_3-Nemotron-Super-49B-v1")

@PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
def nemotron_chat_template():
    return """{{- bos_token }}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
{%- for message in messages %}
  {%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
    {%- set content = message['content'].split('</think>')[-1].lstrip() %}
  {%- else %}
    {%- set content = message['content'] %}
  {%- endif %}
  {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
  {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}"""


@BlockRegistry.register("PostProcessThinkingBlock")
class PostProcessThinkingBlock(Block):
    def __init__(self, block_name: str, column_name: str) -> None:
        super().__init__(block_name=block_name)  
        self.column_name = column_name
    
    
    def generate(self, samples: Dataset):
        def post_process_thinking(x):
            if '</think>' in x[self.column_name]:
                x[self.column_name] = x[self.column_name].split('</think>')[-1].lstrip()
            return x
        samples = samples.map(post_process_thinking)
        return samples

@BlockRegistry.register("RegexParserBlock")
class RegexParserBlock(Block):
    def __init__(self, block_name: str, column_name: str, parsing_pattern: str, parser_cleanup_tags: List[str], output_cols: List[str]) -> None:
        super().__init__(block_name=block_name)
        self.column_name = column_name
        self.parsing_pattern = parsing_pattern
        self.parser_cleanup_tags = parser_cleanup_tags
        self.output_cols = output_cols

    def generate(self, samples: Dataset):
        new_data = []
        for sample in samples:
            parsed_outputs = self._parse(sample[self.column_name])
            max_length = max(len(value) for value in parsed_outputs.values())
            for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
                new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
        return Dataset.from_list(new_data)

    def _parse(self, generated_string):      
        pattern = re.compile(self.parsing_pattern, re.DOTALL)
        all_matches = pattern.findall(generated_string)
        matches = {column_name: [] for column_name in self.output_cols}
        if all_matches and isinstance(all_matches[0], tuple):
            for match in all_matches:
                for column_name, value in zip(self.output_cols, match):
                    value = value.strip()
                    for clean_tag in self.parser_cleanup_tags:
                        value = value.replace(clean_tag, "")
                    matches[column_name].append(value)
        else:
            matches[self.output_cols[0]] = (
                [match.strip() for match in all_matches] if all_matches else []
            )
        return matches

knowledge_agentic_pipeline = "synth_knowledge1.5_nemotron_super_49b.yaml"
flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)
sdg = SDG(
    [Pipeline(flow_cfg)],
    num_workers=1,
    batch_size=1,
    save_freq=1000,
)
ds_subset = ds.select(range(1))

In [None]:
generated_data = sdg.generate(ds_subset, checkpoint_dir="Tmp")

In [None]:
print("### Question")
print(generated_data['question'][1])
print("### Response")
print(generated_data['response'][1])

### Question
Analyze the economic paradox underlying the dental industry’s shift: Why did patients prioritize discretionary cosmetic treatments over medically necessary care?
### Response
The paradox stems from dentistry’s reliance on out-of-pocket spending (60% of costs). Cosmetic procedures (e.g., bleaching) were marketed as immediate, desirable investments in appearance, while necessary care (e.g., gum surgery) was perceived as inconvenient or costly with delayed benefits. Dentists capitalized on this by aggressively selling "wants" to offset the resistance to "needs," ensuring profitability in a market where insurance coverage was limited.

In [None]:
print("### Faithfulness Evaluation")
print(generated_data['explanation'][2])
print("### Judgment")
print(generated_data['judgment'][2])

### Faithfulness Evaluation
The context directly supports the information. Section 1 states that intra-oral cameras and computer simulations were used to "exaggerate minor flaws, convincing patients to undergo costly treatments." This explicitly aligns with the claim that these tools were used to visually exaggerate flaws and persuade patients of the "necessity" of aesthetic treatments. The language "exaggerated" and "convincing" in the context directly corroborate the statement.
### Judgment
YES