In [1]:
import os
import json
import re
from pathlib import Path
import markdown
import pypdf
from tqdm import tqdm
from google import genai


In [2]:

GOOGLE_API_KEY = "AIzaSyB9gjlPn8XxGQ39uVPpDKLUbnt1NdbzyvE"

client = genai.Client(api_key=GOOGLE_API_KEY)

In [3]:
def extract_text_from_pdf(pdf_path):
    """Extract text from PDF files"""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = pypdf.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text

def extract_text_from_markdown(md_path):
    """Extract text from Markdown files"""
    try:
        with open(md_path, 'r', encoding='utf-8') as file:
            md_content = file.read()
            html = markdown.markdown(md_content)
            text = re.sub(r'<[^>]+>', ' ', html)
            return text
    except Exception as e:
        print(f"Error extracting text from {md_path}: {e}")
    return ""

In [6]:
def split_into_chunks(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks"""
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunk = text[i:i + chunk_size]
        if len(chunk) > 200:  # Only keep chunks with meaningful content
            chunks.append(chunk)
    return chunks


In [22]:
def generate_qa_pairs_with_gemini(chunk, num_pairs=3):
    """Generate QA pairs using Gemini API"""
    prompt = f"""
    Based on the following text, generate {num_pairs} relevant question-answer pairs.
    For each pair, the question should be answerable from the text, and the answer should be comprehensive.
    Return the result in the following JSON format:
    [
      {{
        "question": "Question 1",
        "answer": "Answer 1"
      }},
      ...
    ]
    
    Text:
    {chunk}
    """
    

    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt]
        )
        # Extract JSON from response
        response_text = response.text
        # Find JSON content (between first [ and last ])
        json_match = re.search(r'\[\s*{.*}\s*\]', response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
            qa_pairs = json.loads(json_str)
            # Add context to each QA pair
            for pair in qa_pairs:
                pair["context"] = chunk
            return qa_pairs
        else:
            print(f"Failed to parse JSON from response: {response_text}")
            return []
    except Exception as e:
        print(f"Error generating QA pairs with Gemini: {e}")
        return []

    

In [33]:
def process_files(dataset_dir):
    """Process all PDF and Markdown files in the given directory"""
    all_qa_pairs = []
    
    # Process PDF files
    pdf_dir = dataset_dir
    if os.path.exists(pdf_dir):
        pdf_files = ["q3_dataset/2501.12948v1.pdf"]
        print("pdf", pdf_files)
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            text = extract_text_from_pdf(pdf_file)
            
            chunks = split_into_chunks(text)
            for chunk in tqdm(chunks, desc=f"Generating QA pairs for {pdf_file}", leave=False):
                qa_pairs = generate_qa_pairs_with_gemini(chunk)
                all_qa_pairs.extend(qa_pairs)
                break
    
    # Process Markdown files
    md_dir = dataset_dir
    if os.path.exists(md_dir):
        md_files = ["q3_dataset/dataset.md", "q3_dataset/deepseekv3-cost-explained.md", "q3_dataset/design-notes-3fs.md", "q3_dataset/open-source-week.md"]
        for md_file in tqdm(md_files, desc="Processing Markdown"):
            text = extract_text_from_markdown(md_file)
            chunks = split_into_chunks(text)
            for chunk in tqdm(chunks, desc=f"Generating QA pairs for {md_file}", leave=False):
                qa_pairs = generate_qa_pairs_with_gemini(chunk)
                all_qa_pairs.extend(qa_pairs)
                
            
    
    return all_qa_pairs


In [34]:
qs = process_files("qa_dataset")

pdf ['q3_dataset/2501.12948v1.pdf']


Processing PDFs: 100%|██████████| 1/1 [00:03<00:00,  3.94s/it]
Processing Markdown:  25%|██▌       | 1/4 [00:25<01:15, 25.08s/it]

Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}


Processing Markdown:  50%|█████     | 2/4 [00:40<00:39, 19.54s/it]

Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}


Processing Markdown:  75%|███████▌  | 3/4 [01:38<00:37, 37.12s/it]

Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}




Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}


Processing Markdown: 100%|██████████| 4/4 [01:43<00:00, 25.96s/it]

Error generating QA pairs with Gemini: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}





In [None]:
qs[1]

{'question': 'What are some of the benefits and drawbacks of using reinforcement learning (RL) to train DeepSeek-R1-Zero?',
 'answer': 'DeepSeek-R1-Zero, trained via RL, naturally demonstrates remarkable reasoning capabilities and emerges with numerous powerful and intriguing reasoning behaviors. However, it also encounters challenges such as poor readability and language mixing.',
 'context': 'DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via\nReinforcement Learning\nDeepSeek-AI\nresearch@deepseek.com\nAbstract\nWe introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.\nDeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super-\nvised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.\nThrough RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing\nreasoning behaviors. However, it encounters challenges such as poor readability, and language\nmixing. To ad

In [38]:
import csv

# Define the CSV file path
csv_file_path = "qa_dataset.csv"

# Define the field names
fieldnames = ['question', 'answer', 'context']

# Write the data to the CSV file
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for qa_pair in qs:
        writer.writerow(qa_pair)

print(f"CSV file created at {csv_file_path}")

CSV file created at qa_dataset.csv


In [None]:
def create_dataset(qa_pairs, output_path):
    """Create and save the dataset in a format suitable for fine-tuning"""
    # Split into train and validation sets
    import random
    random.shuffle(qa_pairs)
    split_idx = int(0.9 * len(qa_pairs))
    train_data = qa_pairs[:split_idx]
    val_data = qa_pairs[split_idx:]
    
    # Format data for Llama fine-tuning
    formatted_train = format_for_llama(train_data)
    formatted_val = format_for_llama(val_data)
    
    # Save datasets
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "train.json"), "w", encoding="utf-8") as f:
        json.dump(formatted_train, f, indent=2)
    
    with open(os.path.join(output_path, "val.json"), "w", encoding="utf-8") as f:
        json.dump(formatted_val, f, indent=2)
        
    print(f"Created dataset with {len(formatted_train)} training samples and {len(formatted_val)} validation samples")

In [27]:
def create_dataset(qa_pairs, output_path):
    """Create and save the dataset in a format suitable for fine-tuning"""
    # Split into train and validation sets
    import random
    random.shuffle(qa_pairs)
    split_idx = int(0.9 * len(qa_pairs))
    train_data = qa_pairs[:split_idx]
    val_data = qa_pairs[split_idx:]
    
    # Format data for Llama fine-tuning
    formatted_train = format_for_llama(train_data)
    formatted_val = format_for_llama(val_data)
    
    # Save datasets
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "train.json"), "w", encoding="utf-8") as f:
        json.dump(formatted_train, f, indent=2)
    
    with open(os.path.join(output_path, "val.json"), "w", encoding="utf-8") as f:
        json.dump(formatted_val, f, indent=2)
        
    print(f"Created dataset with {len(formatted_train)} training samples and {len(formatted_val)} validation samples")

In [28]:
def format_for_llama(qa_pairs):
    """Format QA pairs for Llama fine-tuning"""
    formatted_data = []
    
    for pair in qa_pairs:
        formatted_data.append({
            "text": f"### Question: {pair['question']}\n\n### Context: {pair['context']}\n\n### Answer: {pair['answer']}"
        })
    
    return formatted_data

In [29]:
create_dataset(qs, "qa_dataset")

Created dataset with 2 training samples and 1 validation samples


In [39]:
df = pd.read_csv("qa_dataset.csv")

NameError: name 'pd' is not defined