In [1]:
APIKEY = "AIzaSyA-3QJTurI2rPh9YShsompTKnfqMvsGnhk"

In [10]:
import json
import requests
from typing import Optional, Dict, Any, Union

class GeminiClient:
    """
    Simple Gemini API client for text-to-text with JSON schema output.
    Uses the free tier of Google's Gemini API.
    """
    
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://generativelanguage.googleapis.com/v1beta"
        self.model_name = "gemini-2.0-flash"  # Free tier model
    
    def generate(
        self, 
        prompt: str, 
        json_schema: Optional[Dict[str, Any]] = None,
        temperature: float = 0.1,
        max_tokens: int = 2048
    ) -> Union[str, Dict[str, Any]]:
        """
        Generate response from text prompt with optional JSON schema.
        
        Args:
            prompt (str): The text prompt
            json_schema (Dict, optional): JSON schema for structured output
            temperature (float): Sampling temperature (0.0 to 1.0)
            max_tokens (int): Maximum tokens to generate
            
        Returns:
            Union[str, Dict]: Generated text or parsed JSON object
        """
        url = f"{self.base_url}/models/{self.model_name}:generateContent"
        
        payload = {
            "contents": [{
                "parts": [{"text": prompt}]
            }],
            "generationConfig": {
                "temperature": temperature,
                "maxOutputTokens": max_tokens,
            }
        }
        
        # Add JSON schema if provided
        if json_schema:
            payload["generationConfig"]["response_mime_type"] = "application/json"
            payload["generationConfig"]["response_schema"] = json_schema
        
        headers = {"Content-Type": "application/json"}
        
        try:
            response = requests.post(
                f"{url}?key={self.api_key}",
                headers=headers,
                data=json.dumps(payload),
                timeout=30
            )
            response.raise_for_status()
            
            # Extract content
            # print("response : ", response)
            result = response.json()
            # print("result : ", result)
            content = result["candidates"][0]["content"]["parts"][0]["text"]

            # print("content : ", content)
            
            # Parse JSON if schema was provided
            if json_schema:
                return json.loads(content)
            
            return content
            
        except requests.exceptions.RequestException as e:
            raise Exception(f"API request failed: {str(e)}")
        except (KeyError, IndexError, json.JSONDecodeError) as e:
            raise Exception(f"Failed to parse response: {str(e)}")

# Example usage
def main():
    # Initialize client
    API_KEY = APIKEY
    client = GeminiClient(API_KEY)
    
    # Example 1: Simple text generation
    print("=== Simple Text ===")
    response = client.generate("Write a short poem about coding")
    print(response)

main()

=== Simple Text ===
The screen glows bright, a digital dawn,
With lines of code, a world is born.
Each bracket, comma, a careful plea,
To shape the logic, wild and free.

From syntax errors, a frustrated sigh,
But triumph blooms when programs fly.
A tapestry woven, thread by thread,
A coder's passion, deeply bred.



In [11]:
import json
import time
import logging
from typing import Dict, Any, List
from pathlib import Path
import re
import json
import time
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SeatingQuestionGenerator:
    """
    Generate seating arrangement reasoning questions using Gemini API
    """
    
    def __init__(self, gemini_client):
        self.client = gemini_client
        self.temp_questions = []  # Store questions temporarily for validation
    
    def create_generation_prompt(self, num_questions: int = 2) -> str:
        """
        Create a comprehensive prompt for generating seating arrangement questions.
        
        Args:
            num_questions (int): Number of questions to generate (default: 2)
            
        Returns:
            str: Formatted prompt for question generation
        """

        prompt = f"""You are an elite question setter for competitive exams specializing in truth-teller and liar logical reasoning problems. Your task is to generate exactly {num_questions} high-quality, diverse logic puzzles involving various character types and scenarios.

        NOTE : EACH INDIVIDUAL QUESTIONS AND THERE ANSWERS THAT YOU GENERATE SHOULD BE WITHIN 100 WORDS. FRAME YOUR STORY, QUESTIONS IN SUCH A MANNER THAT SATISFIES THIS CONSTRAINT.
        ALSO GENERATE DIFFICULT QUESTIONS THAT ARE NOT EASY TO ANSWER.
        **QUESTION VARIETY REQUIREMENTS:**
        Generate questions across these diverse categories:
        
        1. **CLASSIC KNIGHTS & KNAVES**:
           - Knights (always tell truth), Knaves (always lie)
           - Island settings with tribal inhabitants
           - "Exactly one/two are Knaves" constraints
        
        2. **THREE-TYPE SCENARIOS**:
           - Knight, Knave, Spy (can say anything)
           - Agent/detective themes
           - Mixed character identification puzzles
        
        3. **TIME-BASED TRUTHERS**:
           - Daymen (truth by day, lie by night)
           - Nightmen (lie by day, truth by night)
           - Unknown time scenarios requiring deduction
        
        4. **ALTERNATING CHARACTERS**:
           - Alternators (truth, lie, truth pattern)
           - Mixed with Knights/Knaves
           - Sequential statement analysis
        
        5. **WEEKLY PATTERN CHARACTERS**:
           - Lion/Unicorn types with specific day patterns
           - Complex weekly truth/lie schedules
           - "Yesterday I was lying" temporal puzzles
        
        5. **SINGLE TRUTH-TELLER SCENARIOS**:
           - Only one truth-teller among group
           - Ancient warriors, suspects, witnesses
           - Process of elimination logic
        
        6. **MASK/ROLE-BASED PUZZLES**:
           - Color-coded masks (Red=truth, Black=lie, White=random)
           - Festival, masquerade, or ceremonial settings
           - Visual role identification
        
        7. **SPECIAL CHARACTER TYPES**:
           - Parrots (repeat last heard statement)
           - Echoes (copy previous person's answer)
           - Locked room mysteries
        
        8. **CLASSIC DOOR PUZZLES**:
           - Two doors (freedom/death)
           - Guards with opposite natures
           - Single question strategy puzzles
        
        9. **PARADOXICAL SCENARIOS**:
           - Self-referential statements
           - Wizard tests and apprentice trials
           - Logic paradox resolution
        
        11. **THEMED CHARACTER TYPES**:
            - Gun (always truthful), Missile (always lies), Robot (can do either)
            - Weapon/technology themed puzzles
            - Military or sci-fi scenarios
        
        12. **WEEKLY SCHEDULE PUZZLES**:
            - Characters with complex weekly truth/lie patterns
            - Lion/Unicorn classic problems
            - Day-of-week deduction challenges
        
        **CHARACTER ARCHETYPES TO USE:**
        - **Fantasy**: Knights, Knaves, Wizards, Apprentices, Warriors
        - **Modern**: Agents, Detectives, Suspects, Witnesses
        - **Sci-Fi**: Aliens, Alternators, Space inhabitants
        - **Thematic**: Masked figures, Guards, Prisoners
        - **Named**: Simple letters (A,B,C) or thematic names (Thalon, Merek, Vorn)
        
        **ADVANCED CHARACTER TYPES:**
        - **Spy**: Can tell truth or lie at will
        - **Alternator**: Follows truth-lie-truth pattern
        - **Daymen/Nightmen**: Time-dependent truth/lie behavior
        - **Parrot**: Repeats last heard statement
        - **Echo**: Copies previous person's response
        - **Random**: Can be truthful or lie unpredictably
        - **Gun**: Always tells the truth (themed variant of Knight)
        - **Missile**: Always lies (themed variant of Knave)  
        - **Robot**: Can tell truth or lie at will (themed variant of Spy)
        - **Lion/Unicorn Types**: Complex weekly schedules (e.g., lies Mon-Wed, truth Thu-Sun)
        - **Weekly Pattern Characters**: Follow specific day-based truth/lie schedules
        
        **STATEMENT PATTERNS TO INCORPORATE:**
        - Direct accusations: "X is a liar/knight/gun/missile"
        - Conditional logic: "If I am a knight, then..."
        - Self-referential: "I am lying", "I will lie next time", "I am the robot"
        - Group dynamics: "We are of the same/different type"
        - Temporal: "It is day/night", "Yesterday I was lying"
        - Meta-statements: About masks, roles, or identity
        - Sequential: Responses that reference previous statements
        - Weekly patterns: Day-specific truth/lie behavior analysis
        
        **CONSTRAINT PATTERNS:**
        - Exact counts: "Exactly one is a knave"
        - Minimum/Maximum: "At least two tell the truth"
        - Exclusion: "Only one truth-teller exists"
        - Unknown variables: Time of day, actual roles
        - Sequential dependencies: Echo/parrot behaviors
        
        **MANDATORY QUESTION STRUCTURE:**
        Each question MUST:
        1. Include thematic background/setting (≤50 words)
        2. Present 2-4 characters with their statements
        3. Clearly state any constraints or special rules
        4. End with specific question (Who is X? What time is it? etc.)
        5. Have exactly 4 multiple choice options (A, B, C, D)
        6. Keep total length under 100 words. This is strict. You need to manipulate to generate questions and answers under 
        7. Have one logically derivable correct answer
        
        **CRITICAL JSON SCHEMA - EXACT FORMAT REQUIRED:**
        {{
            "questions": [
                {{
                    "question": "Complete question with background, character statements, and query (≤150 words)",
                    "choices": [
                        "A) Option 1",
                        "B) Option 2", 
                        "C) Option 3",
                        "D) Option 4"
                    ],
                    "answer": "A|B|C|D",
                    "explanation": "Step-by-step logical analysis testing assumptions, identifying contradictions, and reaching conclusion"
                }}
            ]
        }}
        
        **THEMATIC BACKGROUNDS TO USE:**
        - Island of Zorban with two tribes
        - Secret agent headquarters  
        - Planet Gloop with alternating aliens
        - Ancient warrior councils
        - Masquerade festivals
        - Locked room mysteries
        - Prison scenarios with unknown outcomes
        - Wizard towers and apprentice tests
        - Two-door freedom/death scenarios
        - Time-dependent truth islands
        - Military/weapon-themed scenarios (Gun, Missile, Robot)
        - Fantasy realms with Lion/Unicorn weekly schedules
        - Corporate/technology settings with Robot characters
        
        **EXPLANATION REQUIREMENTS:**
        - Test each possible assumption systematically
        - Show contradiction analysis clearly
        - Explain character behavior patterns
        - Demonstrate logical consistency
        - Address why incorrect options fail
        - Use assumption-contradiction-conclusion format
        
        **SAMPLE QUESTION TYPES:**
        - "Who is the Knave/Spy/Truth-teller/Gun/Missile/Robot?"
        - "What time is it (Day or Night)?"
        - "What day of the week is it?"
        - "Which mask represents the truth-teller?"
        - "Who will be pardoned/executed?"
        - "What question should you ask?"
        - "Who is the Alternator/Parrot/Echo?"
        - "Yesterday I was lying - what does this tell us?"
        
        **OUTPUT REQUIREMENTS:**
        - Generate EXACTLY {num_questions} unique questions with each limited to 100 words.
        - Output ONLY valid JSON with no extra text, comments, or markdown
        - Vary themes, character types, and complexity levels
        - Ensure each puzzle has unique logical structure
        - Make answer choices plausible and distinct
        - Keep questions under 100 words total
        - Include diverse character archetypes and scenarios
        
        Generate {num_questions} questions following the exact JSON schema above."""
        
        return prompt
    
    def generate_questions(self, num_questions: int = 2) -> Dict[str, Any]:
        """
        Generate seating arrangement questions using Gemini API.
        
        Args:
            num_questions (int): Number of questions to generate
            
        Returns:
            Dict[str, Any]: Generated questions in specified format
        """
        
        prompt = self.create_generation_prompt(num_questions)
        
        try:
            response = self.client.generate(
                prompt=prompt,
                temperature=0.8,  # Higher temperature for more variety
                max_tokens=4000   # Sufficient for 2 questions
            )
            
            return response
            
        except Exception as e:
            raise Exception(f"Failed to generate questions: {str(e)}")
    
    def validate_json(self, data: Any) -> bool:
        """
        Validate if the data conforms to the expected JSON schema.
        
        Args:
            data: Data to validate
            
        Returns:
            bool: True if valid, False otherwise
        """
        try:
            if not isinstance(data, dict):
                return False
                
            if "questions" not in data:
                return False
                
            questions = data["questions"]
            if not isinstance(questions, list):
                return False
                
            for question in questions:
                if not isinstance(question, dict):
                    return False
                    
                required_fields = ["question", "choices", "answer", "explanation"]
                for field in required_fields:
                    if field not in question:
                        return False
                
                # Validate choices format
                choices = question["choices"]
                if not isinstance(choices, list) or len(choices) != 4:
                    return False
                    
                for i, choice in enumerate(choices):
                    expected_prefix = f"{chr(65 + i)})"  # A), B), C), D)
                    if not choice.startswith(expected_prefix):
                        return False
                
                # Validate answer format
                answer = question["answer"]
                if answer not in ["A", "B", "C", "D"]:
                    return False
                    
            return True
            
        except Exception as e:
            logger.error(f"Validation error: {str(e)}")
            return False
    
    def append_to_jsonl(self, questions_data: Dict[str, Any], jsonl_file: str):
        """
        Append validated questions to JSONL file.
        
        Args:
            questions_data: Validated questions data
            jsonl_file: Path to JSONL file
        """
        try:
            with open(jsonl_file, 'a', encoding='utf-8') as f:
                for question in questions_data["questions"]:
                    json_line = json.dumps(question, ensure_ascii=False)
                    f.write(json_line + '\n')
                    
        except Exception as e:
            logger.error(f"Error appending to JSONL file: {str(e)}")
            raise
    

    def generate_batch_questions(
        self,
        total_calls: int = 1000,
        questions_per_call: int = 2,
        output_file: str = "seating_questions.jsonl",
        delay_between_calls: float = 1.0
    ):
        """
        Generate questions in batches and create JSONL file.
    
        Args:
            total_calls (int): Total number of API calls to make
            questions_per_call (int): Number of questions per call
            output_file (str): Output JSONL filename
            delay_between_calls (float): Delay between API calls in seconds
        """
        # Clear the output file if it exists
        Path(output_file).unlink(missing_ok=True)
    
        successful_calls = 0
        failed_calls = 0
        total_questions_generated = 0
    
        logger.info(f"Starting generation of {total_calls} calls with {questions_per_call} questions each")
        logger.info(f"Output file: {output_file}")
    
        for call_num in range(1, total_calls + 1):
            try:
                # 1️⃣ Generate raw LLM output
                raw = self.generate_questions(questions_per_call)
                # print(raw)
    
                # 2️⃣ Strip Markdown fences to get pure JSON
                m = re.search(r"```json(.*?)```", raw, re.DOTALL | re.IGNORECASE)
                payload = m.group(1).strip() if m else raw.strip()
    
                # 3️⃣ Parse JSON
                try:
                    questions_data = json.loads(payload)
                except json.JSONDecodeError as e:
                    logger.error(f"Call {call_num}: JSON decode error after trim - {e}")
                    failed_calls += 1
                    continue
    
                # 4️⃣ Validate structure
                if not self.validate_json(questions_data):
                    logger.error(f"Call {call_num}: Invalid question format")
                    failed_calls += 1
                    continue
    
                # 5️⃣ Enrich with metadata
                for i, question in enumerate(questions_data["questions"]):
                    question["id"] = f"call_{call_num:04d}_q_{i+1:02d}"
                    question["call_number"] = call_num
                    question["generated_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
    
                # 6️⃣ Append to JSONL file
                self.append_to_jsonl(questions_data, output_file)
    
                successful_calls += 1
                total_questions_generated += len(questions_data["questions"])
    
                # Progress update every 50 calls
                if call_num % 50 == 0:
                    pct = successful_calls / call_num * 100
                    logger.info(f"Progress: {call_num}/{total_calls} calls (Success rate: {pct:.1f}%)")
    
                # Delay to avoid rate limits
                if delay_between_calls > 0:
                    time.sleep(delay_between_calls)
    
            except Exception as e:
                logger.error(f"Call {call_num}: Unexpected error - {e}")
                failed_calls += 1
                continue
    
        # Final summary
        success_rate = successful_calls / total_calls * 100
        logger.info("Generation completed!")
        logger.info(f"Total calls attempted: {total_calls}")
        logger.info(f"  Successful calls:   {successful_calls}")
        logger.info(f"  Failed calls:       {failed_calls}")
        logger.info(f"  Success rate:       {success_rate:.1f}%")
        logger.info(f"  Total questions:    {total_questions_generated}")
        logger.info(f"Questions saved to:   {output_file}")
    
        return {
            "total_calls": total_calls,
            "successful_calls": successful_calls,
            "failed_calls": failed_calls,
            "total_questions": total_questions_generated,
            "output_file": output_file
        }
        
    def verify_jsonl_file(self, jsonl_file: str) -> Dict[str, Any]:
        """
        Verify the generated JSONL file and provide statistics.
        
        Args:
            jsonl_file (str): Path to JSONL file
            
        Returns:
            Dict[str, Any]: Verification results
        """

In [None]:
# Example usage
def main():
    """Main function to run the question generation process"""
    
    # Initialize your Gemini client here
    API_KEY = APIKEY  # Replace with actual API key
    gemini_client = GeminiClient(API_KEY)  # Uncomment and use your client
    generator = SeatingQuestionGenerator(gemini_client)
    
    # Generate questions with 1000 calls, 2 questions per call
    results = generator.generate_batch_questions(
        total_calls=250,  # Set to 1000 for full run
        questions_per_call=2,
        output_file="grpo_truth_and_liar.jsonl",
        delay_between_calls=0.5  # 0.5 second delay between calls
    )
    
    # Verify the generated file
    verification_results = generator.verify_jsonl_file("seating_questions.jsonl")
    
    print("\nGeneration Summary:")
    print(f"Total questions generated: {results['total_questions']}")
    print(f"Success rate: {results['successful_calls']}/{results['total_calls']}")
    print(f"Output file: {results['output_file']}")

if __name__ == "__main__":
    main()

2025-07-20 03:07:57,003 - INFO - Starting generation of 250 calls with 2 questions each
2025-07-20 03:07:57,005 - INFO - Output file: grpo_truth_and_liar.jsonl
2025-07-20 03:08:20,074 - ERROR - Call 7: JSON decode error after trim - Expecting ',' delimiter: line 23 column 180 (char 1716)
2025-07-20 03:10:41,201 - INFO - Progress: 50/250 calls (Success rate: 98.0%)
2025-07-20 03:11:37,074 - ERROR - Call 67: JSON decode error after trim - Expecting property name enclosed in double quotes: line 24 column 14 (char 1551)
2025-07-20 03:13:23,544 - INFO - Progress: 100/250 calls (Success rate: 98.0%)
2025-07-20 03:16:10,212 - INFO - Progress: 150/250 calls (Success rate: 98.7%)


In [14]:
import json
import time
import logging
from typing import Dict, Any, List
from pathlib import Path
import re
import json
import time
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SeatingQuestionGenerator:
    """
    Generate seating arrangement reasoning questions using Gemini API
    """
    
    def __init__(self, gemini_client):
        self.client = gemini_client
        self.temp_questions = []  # Store questions temporarily for validation
    
    def create_generation_prompt(self, num_questions: int = 2) -> str:
        """
        Create a comprehensive prompt for generating seating arrangement questions.
        
        Args:
            num_questions (int): Number of questions to generate (default: 2)
            
        Returns:
            str: Formatted prompt for question generation
        """
        
        prompt = f"""You are an elite question setter for competitive exams specializing in seating arrangement logical reasoning problems. Your task is to generate exactly {num_questions} high-quality, diverse seating puzzles involving various arrangements and constraint scenarios.
        
        **NOTE: EACH INDIVIDUAL QUESTION AND ANSWER SHOULD BE WITHIN 100 WORDS. FRAME YOUR STORY AND QUESTIONS TO SATISFY THIS CONSTRAINT.**
        **GENERATE DIFFICULT QUESTIONS THAT ARE NOT EASY TO ANSWER.**
        
        **QUESTION VARIETY REQUIREMENTS:**
        Generate questions across these diverse categories:
        
        1. **CIRCULAR ARRANGEMENTS**:
           - Friends from different countries sitting in a circle
           - People facing center or outward (mixed orientations)
           - Opposite seating constraints
           - Adjacent seating requirements
        
        2. **LINEAR ARRANGEMENTS**:
           - Single row seating (left to right)
           - People facing same direction
           - Position-based constraints (2nd from left, 3rd from right)
           - End position specifications
        
        3. **PARALLEL ROW ARRANGEMENTS**:
           - Two parallel rows facing each other
           - Front row faces one direction, back row faces opposite
           - Behind/in front of relationships
           - Cross-row constraints
        
        4. **PROFESSION-BASED SEATING**:
           - Doctors, engineers, lawyers, teachers mixed
           - Profession-location constraints
           - Adjacent profession requirements
           - Opposite profession rules
        
        5. **BEVERAGE/FOOD PREFERENCE SEATING**:
           - Tea, coffee, juice, milk drinkers
           - Beverage-position relationships
           - Adjacent beverage constraints
           - Opposite drink preferences
        
        6. **NATIONALITY/COUNTRY SEATING**:
           - People from different countries
           - Cultural/geographical adjacency rules
           - Nationality-position constraints
           - Cross-cultural seating patterns
        
        7. **COMPLEX MIXED CONSTRAINTS**:
           - Multiple variable types (nationality + profession + beverage)
           - Direction facing + position constraints
           - Temporal seating (who sits where when)
           - Multi-level constraint satisfaction
        
        **CONSTRAINT PATTERNS TO INCORPORATE:**
        - **Positional**: "2nd to the left of", "3rd from right end", "at extreme positions"
        - **Relational**: "adjacent to", "opposite to", "between X and Y"
        - **Directional**: "faces center", "faces outward", "faces north/south"
        - **Conditional**: "If A sits here, then B must sit there"
        - **Exclusion**: "not adjacent to", "not opposite to", "not at ends"
        - **Distance**: "exactly two seats away", "three positions apart"
        - **Group**: "all engineers together", "no two doctors adjacent"
        
        **CHARACTER ARCHETYPES TO USE:**
        - **Names**: A, B, C, D, E, F, G, H or P, Q, R, S, T, U, V, W
        - **Nationalities**: American, Chinese, Japanese, Italian, Brazilian, Indian, German, French
        - **Professions**: Doctor, Engineer, Lawyer, Teacher, Manager, Accountant, Artist, Scientist
        - **Beverages**: Tea, Coffee, Juice, Milk, Water, Soda, Wine, Beer
        - **Colors**: Red, Blue, Green, Yellow, Black, White, Purple, Orange (for clothes/preferences)
        
        **ARRANGEMENT TYPES:**
        1. **Circular (5-8 people)**: All face center, mixed facing, or all face clockwise
        2. **Linear (4-8 people)**: Single row, numbered positions
        3. **Parallel Rows**: Two rows of 3-6 each, facing each other
        4. **L-shaped**: Corner arrangements with position constraints
        5. **Square Table**: 4 sides with multiple people per side
        
        **MANDATORY QUESTION STRUCTURE:**
        Each question MUST:
        1. Include brief setup/context (≤30 words)
        2. Present 3-6 constraint statements clearly
        3. State the seating arrangement type (circular/linear/parallel)
        4. End with specific question (Who sits where? What does X drink?)
        5. Have exactly 4 multiple choice options (A, B, C, D)
        6. Keep total length under 100 words - THIS IS STRICT
        7. Have one logically derivable correct answer
        
        **CRITICAL JSON SCHEMA - EXACT FORMAT REQUIRED:**
        {{
            "questions": [
                {{
                    "question": "Complete question with background, character statements, and query (≤150 words)",
                    "choices": [
                        "A) Option 1",
                        "B) Option 2", 
                        "C) Option 3",
                        "D) Option 4"
                    ],
                    "answer": "A|B|C|D",
                    "explanation": "Step-by-step logical analysis testing assumptions, identifying contradictions, and reaching conclusion"
                }}
            ]
        }}
        
        **SAMPLE QUESTION PATTERNS:**
        - "Who drinks coffee/tea/juice?"
        - "Who sits at the extreme right/left?"
        - "What is X's profession?"
        - "Who sits opposite to Y?"
        - "Who is adjacent to the doctor?"
        - "What does the person in position 3 drink?"
        - "Which nationality sits between A and B?"
        
        **CONSTRAINT COMPLEXITY LEVELS:**
        - **Basic**: 3-4 simple constraints (adjacent, opposite, position)
        - **Intermediate**: 5-6 constraints with mixed variables
        - **Advanced**: 6+ constraints with conditional logic and multiple deductions
        
        **EXPLANATION REQUIREMENTS:**
        - Show step-by-step constraint application
        - Demonstrate elimination process
        - Explain how final answer satisfies all constraints
        - Keep under 50 words
        - Use format: "Constraint X gives us Y. Constraint Z eliminates options. Final answer satisfies all conditions."
        
        **SAMPLE SETUPS:**
        - "Five friends from different countries sit in a circle..."
        - "Eight people sit around a circular table, four face center..."
        - "Six people sit in two parallel rows of three each..."
        - "Seven professionals sit in a straight line..."
        - "Four couples arrange themselves in a square formation..."
        
        **OUTPUT REQUIREMENTS:**
        - Generate EXACTLY {num_questions} unique questions, each ≤100 words
        - Output ONLY valid JSON with no extra text, comments, or markdown
        - Vary arrangement types, constraint complexity, and themes
        - Ensure each puzzle has unique logical structure
        - Make answer choices plausible and distinct
        - Include diverse nationalities, professions, and preferences
        - Test constraint satisfaction thoroughly
        
        Generate {num_questions} seating arrangement questions following the exact JSON schema above."""
        
        return prompt
    
    def generate_questions(self, num_questions: int = 2) -> Dict[str, Any]:
        """
        Generate seating arrangement questions using Gemini API.
        
        Args:
            num_questions (int): Number of questions to generate
            
        Returns:
            Dict[str, Any]: Generated questions in specified format
        """
        
        prompt = self.create_generation_prompt(num_questions)
        
        try:
            response = self.client.generate(
                prompt=prompt,
                temperature=0.8,  # Higher temperature for more variety
                max_tokens=4000   # Sufficient for 2 questions
            )
            
            return response
            
        except Exception as e:
            raise Exception(f"Failed to generate questions: {str(e)}")
    
    def validate_json(self, data: Any) -> bool:
        """
        Validate if the data conforms to the expected JSON schema.
        
        Args:
            data: Data to validate
            
        Returns:
            bool: True if valid, False otherwise
        """
        try:
            if not isinstance(data, dict):
                return False
                
            if "questions" not in data:
                return False
                
            questions = data["questions"]
            if not isinstance(questions, list):
                return False
                
            for question in questions:
                if not isinstance(question, dict):
                    return False
                    
                required_fields = ["question", "choices", "answer", "explanation"]
                for field in required_fields:
                    if field not in question:
                        return False
                
                # Validate choices format
                choices = question["choices"]
                if not isinstance(choices, list) or len(choices) != 4:
                    return False
                    
                for i, choice in enumerate(choices):
                    expected_prefix = f"{chr(65 + i)})"  # A), B), C), D)
                    if not choice.startswith(expected_prefix):
                        return False
                
                # Validate answer format
                answer = question["answer"]
                if answer not in ["A", "B", "C", "D"]:
                    return False
                    
            return True
            
        except Exception as e:
            logger.error(f"Validation error: {str(e)}")
            return False
    
    def append_to_jsonl(self, questions_data: Dict[str, Any], jsonl_file: str):
        """
        Append validated questions to JSONL file.
        
        Args:
            questions_data: Validated questions data
            jsonl_file: Path to JSONL file
        """
        try:
            with open(jsonl_file, 'a', encoding='utf-8') as f:
                for question in questions_data["questions"]:
                    json_line = json.dumps(question, ensure_ascii=False)
                    f.write(json_line + '\n')
                    
        except Exception as e:
            logger.error(f"Error appending to JSONL file: {str(e)}")
            raise
    

    def generate_batch_questions(
        self,
        total_calls: int = 1000,
        questions_per_call: int = 2,
        output_file: str = "seating_questions.jsonl",
        delay_between_calls: float = 1.0
    ):
        """
        Generate questions in batches and create JSONL file.
    
        Args:
            total_calls (int): Total number of API calls to make
            questions_per_call (int): Number of questions per call
            output_file (str): Output JSONL filename
            delay_between_calls (float): Delay between API calls in seconds
        """
        # Clear the output file if it exists
        Path(output_file).unlink(missing_ok=True)
    
        successful_calls = 0
        failed_calls = 0
        total_questions_generated = 0
    
        logger.info(f"Starting generation of {total_calls} calls with {questions_per_call} questions each")
        logger.info(f"Output file: {output_file}")
    
        for call_num in range(1, total_calls + 1):
            try:
                # 1️⃣ Generate raw LLM output
                raw = self.generate_questions(questions_per_call)
                # print(raw)
    
                # 2️⃣ Strip Markdown fences to get pure JSON
                m = re.search(r"```json(.*?)```", raw, re.DOTALL | re.IGNORECASE)
                payload = m.group(1).strip() if m else raw.strip()
    
                # 3️⃣ Parse JSON
                try:
                    questions_data = json.loads(payload)
                except json.JSONDecodeError as e:
                    logger.error(f"Call {call_num}: JSON decode error after trim - {e}")
                    failed_calls += 1
                    continue
    
                # 4️⃣ Validate structure
                if not self.validate_json(questions_data):
                    logger.error(f"Call {call_num}: Invalid question format")
                    failed_calls += 1
                    continue
    
                # 5️⃣ Enrich with metadata
                for i, question in enumerate(questions_data["questions"]):
                    question["id"] = f"call_{call_num:04d}_q_{i+1:02d}"
                    question["call_number"] = call_num
                    question["generated_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
    
                # 6️⃣ Append to JSONL file
                self.append_to_jsonl(questions_data, output_file)
    
                successful_calls += 1
                total_questions_generated += len(questions_data["questions"])
    
                # Progress update every 50 calls
                if call_num % 50 == 0:
                    pct = successful_calls / call_num * 100
                    logger.info(f"Progress: {call_num}/{total_calls} calls (Success rate: {pct:.1f}%)")
    
                # Delay to avoid rate limits
                if delay_between_calls > 0:
                    time.sleep(delay_between_calls)
    
            except Exception as e:
                logger.error(f"Call {call_num}: Unexpected error - {e}")
                failed_calls += 1
                continue
    
        # Final summary
        success_rate = successful_calls / total_calls * 100
        logger.info("Generation completed!")
        logger.info(f"Total calls attempted: {total_calls}")
        logger.info(f"  Successful calls:   {successful_calls}")
        logger.info(f"  Failed calls:       {failed_calls}")
        logger.info(f"  Success rate:       {success_rate:.1f}%")
        logger.info(f"  Total questions:    {total_questions_generated}")
        logger.info(f"Questions saved to:   {output_file}")
    
        return {
            "total_calls": total_calls,
            "successful_calls": successful_calls,
            "failed_calls": failed_calls,
            "total_questions": total_questions_generated,
            "output_file": output_file
        }
        
    def verify_jsonl_file(self, jsonl_file: str) -> Dict[str, Any]:
        """
        Verify the generated JSONL file and provide statistics.
        
        Args:
            jsonl_file (str): Path to JSONL file
            
        Returns:
            Dict[str, Any]: Verification results
        """

In [15]:
# Example usage
def main():
    """Main function to run the question generation process"""
    
    # Initialize your Gemini client here
    API_KEY = APIKEY  # Replace with actual API key
    gemini_client = GeminiClient(API_KEY)  # Uncomment and use your client
    generator = SeatingQuestionGenerator(gemini_client)
    
    # Generate questions with 1000 calls, 2 questions per call
    results = generator.generate_batch_questions(
        total_calls=250,  # Set to 1000 for full run
        questions_per_call=2,
        output_file="grpo_seating_arrangement.jsonl",
        delay_between_calls=0.5  # 0.5 second delay between calls
    )
    
    # Verify the generated file
    verification_results = generator.verify_jsonl_file("seating_questions.jsonl")
    
    print("\nGeneration Summary:")
    print(f"Total questions generated: {results['total_questions']}")
    print(f"Success rate: {results['successful_calls']}/{results['total_calls']}")
    print(f"Output file: {results['output_file']}")

if __name__ == "__main__":
    main()

2025-07-20 03:41:19,750 - INFO - Starting generation of 250 calls with 2 questions each
2025-07-20 03:41:19,750 - INFO - Output file: grpo_seating_arrangement.jsonl
2025-07-20 03:44:04,853 - INFO - Progress: 50/250 calls (Success rate: 100.0%)
2025-07-20 03:45:54,673 - ERROR - Call 82: JSON decode error after trim - Expecting property name enclosed in double quotes: line 24 column 9 (char 1687)
2025-07-20 03:46:55,183 - INFO - Progress: 100/250 calls (Success rate: 99.0%)
2025-07-20 03:49:40,205 - INFO - Progress: 150/250 calls (Success rate: 99.3%)
2025-07-20 03:52:30,938 - INFO - Progress: 200/250 calls (Success rate: 99.5%)
2025-07-20 03:55:15,875 - INFO - Progress: 250/250 calls (Success rate: 99.6%)
2025-07-20 03:55:16,376 - INFO - Generation completed!
2025-07-20 03:55:16,377 - INFO - Total calls attempted: 250
2025-07-20 03:55:16,377 - INFO -   Successful calls:   249
2025-07-20 03:55:16,377 - INFO -   Failed calls:       1
2025-07-20 03:55:16,377 - INFO -   Success rate:       


Generation Summary:
Total questions generated: 498
Success rate: 249/250
Output file: grpo_seating_arrangement.jsonl


In [16]:
import json
import time
import logging
from typing import Dict, Any, List
from pathlib import Path
import re
import json
import time
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SeatingQuestionGenerator:
    """
    Generate seating arrangement reasoning questions using Gemini API
    """
    
    def __init__(self, gemini_client):
        self.client = gemini_client
        self.temp_questions = []  # Store questions temporarily for validation
    
    def create_generation_prompt(self, num_questions: int = 2) -> str:
        """
        Create a comprehensive prompt for generating seating arrangement questions.
        
        Args:
            num_questions (int): Number of questions to generate (default: 2)
            
        Returns:
            str: Formatted prompt for question generation
        """

        prompt = f"""You are an elite question setter for competitive exams specializing in blood relation logical reasoning problems. Your task is to generate exactly {num_questions} high-quality, diverse family relationship puzzles involving complex kinship chains and genealogical deductions.
        
        **NOTE: EACH INDIVIDUAL QUESTION AND ANSWER SHOULD BE WITHIN 100 WORDS. FRAME YOUR STORY AND QUESTIONS TO SATISFY THIS CONSTRAINT.**
        **GENERATE DIFFICULT QUESTIONS THAT ARE NOT EASY TO ANSWER.**
        
        **QUESTION VARIETY REQUIREMENTS:**
        Generate questions across these diverse categories:
        
        1. **POINTING/INTRODUCTION SCENARIOS**:
           - "Aryan pointed to a man and said..."
           - "Looking at a photograph, X mentioned..."
           - "At a family gathering, Y introduced Z as..."
           - Direct relationship identification through descriptions
        
        2. **MULTI-GENERATIONAL CHAINS**:
           - Great-grandparent to great-grandchild relationships
           - 3-4 generation family trees
           - Ancestor-descendant complex chains
           - Cross-generational marriage connections
        
        3. **SIBLING-BASED RELATIONSHIPS**:
           - Only brother/sister specifications
           - Multiple siblings with unique relationships
           - Sibling's children and spouse connections
           - "Father's only sister" or "mother's only brother" patterns
        
        4. **MARRIAGE-LINKED RELATIONSHIPS**:
           - In-law relationships through marriage
           - Spouse's family member connections
           - "Married to" chain relationships
           - Sister-in-law, brother-in-law, daughter-in-law patterns
        
        5. **AUNT/UNCLE RELATIONSHIPS**:
           - Maternal vs paternal aunts/uncles
           - Aunt's/uncle's children (cousins)
           - Complex cousin relationships
           - "Only sister/brother" specifications
        
        6. **GRANDPARENT-CENTRIC PUZZLES**:
           - Grandfather's/grandmother's children and their relationships
           - "Paternal grandfather's daughter-in-law" patterns
           - Multiple grandchildren relationships
           - Cross-generational spouse connections
        
        7. **COMPLEX MIXED FAMILY SCENARIOS**:
           - Multiple marriage connections in same family
           - Overlapping relationships through different paths
           - "Who is married to whose mother's..." type chains
           - Multi-variable relationship deduction
        
        **RELATIONSHIP TERMINOLOGY TO USE:**
        - **Direct Relations**: Father, Mother, Son, Daughter, Brother, Sister
        - **Extended Family**: Grandfather, Grandmother, Uncle, Aunt, Nephew, Niece, Cousin
        - **Marriage Relations**: Husband, Wife, Father-in-law, Mother-in-law, Son-in-law, Daughter-in-law, Brother-in-law, Sister-in-law
        - **Generational**: Great-grandfather, Great-grandmother, Great-grandchild
        - **Specifications**: Only brother, Only sister, Elder/Younger, Maternal/Paternal
        
        **COMPLEX CHAIN PATTERNS:**
        - **Single Path**: "Father's sister's son" = paternal cousin
        - **Double Path**: "Mother's only sister, who is married to..."
        - **Cross-Generation**: "Great-grandfather's daughter's granddaughter"
        - **Marriage Bridge**: "Whose father is married to X's mother"
        - **Specification Chain**: "Only brother of paternal grandfather's daughter-in-law"
        - **Multi-Variable**: "Eldest son of father's only brother's wife"
        
        **MANDATORY QUESTION STRUCTURE:**
        Each question MUST:
        1. Include scenario setup (pointing, photograph, gathering) (≤20 words)
        2. Present complex relationship chain with 2-4 connecting links
        3. Use specific family terminology (only sister, paternal, maternal, etc.)
        4. End with "How is X related to Y?" or similar relationship query
        5. Have exactly 4 multiple choice options (A, B, C, D)
        6. Keep total length under 100 words - THIS IS STRICT
        7. Have one logically derivable correct answer
        
        **CRITICAL JSON SCHEMA - EXACT FORMAT REQUIRED:**
        {{
            "questions": [
                {{
                    "question": "Complete question with background, character statements, and query (≤150 words)",
                    "choices": [
                        "A) Option 1",
                        "B) Option 2", 
                        "C) Option 3",
                        "D) Option 4"
                    ],
                    "answer": "A|B|C|D",
                    "explanation": "Step-by-step logical analysis testing assumptions, identifying contradictions, and reaching conclusion"
                }}
            ]
        }}
        
        **SAMPLE RELATIONSHIP CHAINS:**
        - "Son of mother's only sister" → maternal cousin
        - "Father's only sister's husband's son" → step-cousin/cousin-in-law
        - "Paternal grandfather's daughter-in-law's brother" → father's brother-in-law
        - "Wife of mother's maternal grandmother's grandson" → aunt/mother
        - "Eldest son of father's only brother" → paternal cousin
        - "Great-grandfather's daughter's only granddaughter" → second cousin/relative
        
        **CHARACTER NAMES TO USE:**
        - **Modern**: Aryan, Priya, Kunal, Ravi, Tanvi, Ananya, Rohan, Meera, Vikram, Shreya
        - **Traditional**: Ram, Sita, Krishna, Radha, Arjun, Kiran, Deepak, Sunita
        - **Simple**: A, B, C, X, Y, Z (for complex scenarios)
        
        **RELATIONSHIP ANSWER OPTIONS:**
        - **Direct**: Brother, Sister, Son, Daughter, Father, Mother
        - **Extended**: Uncle, Aunt, Nephew, Niece, Cousin, Grandfather, Grandmother
        - **In-laws**: Brother-in-law, Sister-in-law, Father-in-law, Mother-in-law
        - **Complex**: Second cousin, Step-brother, Maternal uncle, Paternal aunt
        - **Generational**: Great-uncle, Great-aunt, Grand-nephew, Grand-niece
        
        **EXPLANATION REQUIREMENTS:**
        - Break down each relationship link step-by-step
        - Show intermediate relationships clearly
        - Explain final relationship derivation
        - Keep under 50 words
        - Use format: "X's mother's sister = maternal aunt. Her son = cousin. Therefore..."
        
        **SAMPLE QUESTION PATTERNS:**
        - "How is the man related to Aryan?"
        - "What is X's relationship to Y?"
        - "How is the woman connected to Z?"
        - "What relation does A bear to B?"
        - "How is the eldest son's wife related to Kunal?"
        
        **COMPLEXITY LEVELS:**
        - **Basic**: 2-3 relationship links (father's sister = aunt)
        - **Intermediate**: 3-4 links with marriage connections
        - **Advanced**: 4+ links with cross-generational and multiple marriage paths
        
        **SCENARIO SETUPS:**
        - "In a family gathering, X pointed to Y and said..."
        - "Looking at a family photograph, A mentioned about B..."
        - "At a wedding, X introduced Y as..."
        - "During a reunion, A explained B's relationship as..."
        - "While discussing family, X described Y as..."
        
        **OUTPUT REQUIREMENTS:**
        - Generate EXACTLY {num_questions} unique questions, each ≤100 words
        - Output ONLY valid JSON with no extra text, comments, or markdown
        - Vary relationship complexity, chain length, and family scenarios
        - Ensure each puzzle has unique logical structure and relationship path
        - Make answer choices plausible and represent different relationship types
        - Include diverse names and family structures
        - Test relationship deduction thoroughly with step-by-step logic
        
        Generate {num_questions} blood relation questions following the exact JSON schema above."""
        return prompt
    
    def generate_questions(self, num_questions: int = 2) -> Dict[str, Any]:
        """
        Generate seating arrangement questions using Gemini API.
        
        Args:
            num_questions (int): Number of questions to generate
            
        Returns:
            Dict[str, Any]: Generated questions in specified format
        """
        
        prompt = self.create_generation_prompt(num_questions)
        
        try:
            response = self.client.generate(
                prompt=prompt,
                temperature=0.8,  # Higher temperature for more variety
                max_tokens=4000   # Sufficient for 2 questions
            )
            
            return response
            
        except Exception as e:
            raise Exception(f"Failed to generate questions: {str(e)}")
    
    def validate_json(self, data: Any) -> bool:
        """
        Validate if the data conforms to the expected JSON schema.
        
        Args:
            data: Data to validate
            
        Returns:
            bool: True if valid, False otherwise
        """
        try:
            if not isinstance(data, dict):
                return False
                
            if "questions" not in data:
                return False
                
            questions = data["questions"]
            if not isinstance(questions, list):
                return False
                
            for question in questions:
                if not isinstance(question, dict):
                    return False
                    
                required_fields = ["question", "choices", "answer", "explanation"]
                for field in required_fields:
                    if field not in question:
                        return False
                
                # Validate choices format
                choices = question["choices"]
                if not isinstance(choices, list) or len(choices) != 4:
                    return False
                    
                for i, choice in enumerate(choices):
                    expected_prefix = f"{chr(65 + i)})"  # A), B), C), D)
                    if not choice.startswith(expected_prefix):
                        return False
                
                # Validate answer format
                answer = question["answer"]
                if answer not in ["A", "B", "C", "D"]:
                    return False
                    
            return True
            
        except Exception as e:
            logger.error(f"Validation error: {str(e)}")
            return False
    
    def append_to_jsonl(self, questions_data: Dict[str, Any], jsonl_file: str):
        """
        Append validated questions to JSONL file.
        
        Args:
            questions_data: Validated questions data
            jsonl_file: Path to JSONL file
        """
        try:
            with open(jsonl_file, 'a', encoding='utf-8') as f:
                for question in questions_data["questions"]:
                    json_line = json.dumps(question, ensure_ascii=False)
                    f.write(json_line + '\n')
                    
        except Exception as e:
            logger.error(f"Error appending to JSONL file: {str(e)}")
            raise
    

    def generate_batch_questions(
        self,
        total_calls: int = 1000,
        questions_per_call: int = 2,
        output_file: str = "seating_questions.jsonl",
        delay_between_calls: float = 1.0
    ):
        """
        Generate questions in batches and create JSONL file.
    
        Args:
            total_calls (int): Total number of API calls to make
            questions_per_call (int): Number of questions per call
            output_file (str): Output JSONL filename
            delay_between_calls (float): Delay between API calls in seconds
        """
        # Clear the output file if it exists
        Path(output_file).unlink(missing_ok=True)
    
        successful_calls = 0
        failed_calls = 0
        total_questions_generated = 0
    
        logger.info(f"Starting generation of {total_calls} calls with {questions_per_call} questions each")
        logger.info(f"Output file: {output_file}")
    
        for call_num in range(1, total_calls + 1):
            try:
                # 1️⃣ Generate raw LLM output
                raw = self.generate_questions(questions_per_call)
                # print(raw)
    
                # 2️⃣ Strip Markdown fences to get pure JSON
                m = re.search(r"```json(.*?)```", raw, re.DOTALL | re.IGNORECASE)
                payload = m.group(1).strip() if m else raw.strip()
    
                # 3️⃣ Parse JSON
                try:
                    questions_data = json.loads(payload)
                except json.JSONDecodeError as e:
                    logger.error(f"Call {call_num}: JSON decode error after trim - {e}")
                    failed_calls += 1
                    continue
    
                # 4️⃣ Validate structure
                if not self.validate_json(questions_data):
                    logger.error(f"Call {call_num}: Invalid question format")
                    failed_calls += 1
                    continue
    
                # 5️⃣ Enrich with metadata
                for i, question in enumerate(questions_data["questions"]):
                    question["id"] = f"call_{call_num:04d}_q_{i+1:02d}"
                    question["call_number"] = call_num
                    question["generated_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
    
                # 6️⃣ Append to JSONL file
                self.append_to_jsonl(questions_data, output_file)
    
                successful_calls += 1
                total_questions_generated += len(questions_data["questions"])
    
                # Progress update every 50 calls
                if call_num % 50 == 0:
                    pct = successful_calls / call_num * 100
                    logger.info(f"Progress: {call_num}/{total_calls} calls (Success rate: {pct:.1f}%)")
    
                # Delay to avoid rate limits
                if delay_between_calls > 0:
                    time.sleep(delay_between_calls)
    
            except Exception as e:
                logger.error(f"Call {call_num}: Unexpected error - {e}")
                failed_calls += 1
                continue
    
        # Final summary
        success_rate = successful_calls / total_calls * 100
        logger.info("Generation completed!")
        logger.info(f"Total calls attempted: {total_calls}")
        logger.info(f"  Successful calls:   {successful_calls}")
        logger.info(f"  Failed calls:       {failed_calls}")
        logger.info(f"  Success rate:       {success_rate:.1f}%")
        logger.info(f"  Total questions:    {total_questions_generated}")
        logger.info(f"Questions saved to:   {output_file}")
    
        return {
            "total_calls": total_calls,
            "successful_calls": successful_calls,
            "failed_calls": failed_calls,
            "total_questions": total_questions_generated,
            "output_file": output_file
        }
        
    def verify_jsonl_file(self, jsonl_file: str) -> Dict[str, Any]:
        """
        Verify the generated JSONL file and provide statistics.
        
        Args:
            jsonl_file (str): Path to JSONL file
            
        Returns:
            Dict[str, Any]: Verification results
        """

In [17]:
# Example usage
def main():
    """Main function to run the question generation process"""
    
    # Initialize your Gemini client here
    API_KEY = APIKEY  # Replace with actual API key
    gemini_client = GeminiClient(API_KEY)  # Uncomment and use your client
    generator = SeatingQuestionGenerator(gemini_client)
    
    # Generate questions with 1000 calls, 2 questions per call
    results = generator.generate_batch_questions(
        total_calls=250,  # Set to 1000 for full run
        questions_per_call=2,
        output_file="grpo_blood_relations.jsonl",
        delay_between_calls=0.5  # 0.5 second delay between calls
    )
    
    # Verify the generated file
    verification_results = generator.verify_jsonl_file("seating_questions.jsonl")
    
    print("\nGeneration Summary:")
    print(f"Total questions generated: {results['total_questions']}")
    print(f"Success rate: {results['successful_calls']}/{results['total_calls']}")
    print(f"Output file: {results['output_file']}")

if __name__ == "__main__":
    main()

2025-07-20 03:55:16,392 - INFO - Starting generation of 250 calls with 2 questions each
2025-07-20 03:55:16,392 - INFO - Output file: grpo_blood_relations.jsonl
2025-07-20 03:57:10,955 - ERROR - Call 46: Invalid question format
2025-07-20 03:57:20,384 - INFO - Progress: 50/250 calls (Success rate: 98.0%)
2025-07-20 03:58:05,100 - ERROR - Call 68: Invalid question format
2025-07-20 03:58:22,143 - ERROR - Call 75: JSON decode error after trim - Expecting value: line 21 column 7 (char 896)
2025-07-20 03:59:22,887 - INFO - Progress: 100/250 calls (Success rate: 97.0%)
2025-07-20 04:00:29,700 - ERROR - Call 127: Invalid question format
2025-07-20 04:00:52,430 - ERROR - Call 136: Invalid question format
2025-07-20 04:01:26,363 - INFO - Progress: 150/250 calls (Success rate: 96.7%)
2025-07-20 04:01:33,667 - ERROR - Call 153: Invalid question format
2025-07-20 04:01:52,675 - ERROR - Call 161: Invalid question format
2025-07-20 04:02:07,501 - ERROR - Call 167: Invalid question format
2025-07-20


Generation Summary:
Total questions generated: 480
Success rate: 240/250
Output file: grpo_blood_relations.jsonl


In [18]:
# data = 
with open("/jupyter-tutorial/Dataset_generation/grpo_seating_arrangement.jsonl", "r", encoding="utf-8") as f:
    for lines in f:
        print(lines)
        print(type(lines))
        break

SyntaxError: invalid syntax (3857487401.py, line 1)