In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from vllm import LLM, SamplingParams

In [None]:


# # Much faster than transformers + bitsandbytes
# llm = LLM(
#     model="../Qwen2-VL-72B-Instruct",
#     tensor_parallel_size=1,          # Use multiple GPUs if available
#     dtype="float16",                 # Skip quantization for speed
#     gpu_memory_utilization=0.9,      # Maximize GPU usage
#     enforce_eager=False,             # Enable CUDA graphs
#     # quantization="awq",              # Use AWQ instead of bitsandbytes
#     max_num_seqs=8                   # Batch processing
# )

# # Optimized sampling
# sampling_params = SamplingParams(
#     temperature=0.7,
#     max_tokens=512,
#     top_p=0.9
# )

# # Generate (much faster)
# outputs = llm.generate(["Your prompt here"], sampling_params)


In [None]:
#!/usr/bin/env python3
"""
Simplified Seating Question Generator
Generates seating arrangement questions and saves them as JSON
"""

import json
import os
import random
import re
from collections import defaultdict
from datetime import datetime
from typing import Dict, List, Optional

from vllm import LLM, SamplingParams

class SeatingQuestionGenerator:
    """Grow a dataset of multiple-choice seating-arrangement questions.

    The generator few-shot prompts ``model`` with questions already in the
    dataset, parses the free-text completion back into dictionaries and
    persists the dataset as JSON inside ``output_dir`` after every change.

    ``model`` only needs a ``generate(prompts, sampling_params)`` method whose
    results expose ``.outputs[0].text``.
    """

    # Markers that separate one generated question from the next.
    _BLOCK_SPLIT_RE = re.compile(
        r'(?:Question \d+:|New Question \d+:|Generated Question \d+:|\n\n(?=Question:))'
    )
    # A choice line looks like "A) ...".
    _CHOICE_RE = re.compile(r'^[A-D]\)')
    # Leading option letter of an answer such as "B", "B)", "(B)" or "B. Bob".
    _ANSWER_LETTER_RE = re.compile(r'^\(?([A-D])\b')

    def __init__(self, model, sampling_params, output_dir="generated_questions",
                 model_name="Qwen2-VL-72B-Instruct"):
        """Store the collaborators and make sure ``output_dir`` exists.

        Args:
            model: Object with a ``generate(prompts, sampling_params)`` method.
            sampling_params: Passed through unchanged to ``model.generate``.
            output_dir: Directory that receives the JSON dataset files.
            model_name: Label recorded in the saved metadata.
        """
        self.model = model
        self.sampling_params = sampling_params
        self.output_dir = output_dir
        self.model_name = model_name
        self.dataset = []

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

    def initialize_dataset(self, initial_samples: List[Dict]):
        """Replace the dataset with a shallow copy of the seed questions and save it."""
        self.dataset = list(initial_samples)
        self.save_dataset()
        print(f"Initialized dataset with {len(initial_samples)} questions")

    def _create_generation_prompt(self, sample_questions: List[Dict], num_to_generate: int = 5) -> str:
        """Build the few-shot prompt.

        Args:
            sample_questions: Example questions; each needs the keys
                ``question``, ``choices``, ``answer`` and ``explanation``.
            num_to_generate: Number of new questions requested from the model.

        Returns:
            The complete prompt text.
        """
        parts = ["""Generate seating arrangement logic questions following these examples. Each question should have:
- A complex seating scenario (circular, linear, or parallel rows)
- 4 multiple choice options (A, B, C, D)
- Clear answer and explanation
- Different people/professions/nationalities/beverages

Examples:

"""]

        for i, q in enumerate(sample_questions, 1):
            parts.append(f"Example {i}:\n")
            parts.append(f"Question: {q['question']}\n")
            parts.extend(f"{choice}\n" for choice in q['choices'])
            parts.append(f"Answer: {q['answer']}\n")
            parts.append(f"Explanation: {q['explanation']}\n\n")

        parts.append(
            f"Now generate {num_to_generate} new seating arrangement questions in the same format. Make them varied and challenging:\n\n"
        )
        return "".join(parts)

    def _parse_generated_questions(self, outputs) -> List[Dict]:
        """Turn model outputs into question dictionaries.

        Only the first completion of every output is read. Text blocks that
        do not form a complete question are dropped silently.
        """
        questions = []

        for output in outputs:
            text = output.outputs[0].text

            for block in self._BLOCK_SPLIT_RE.split(text):
                if not block.strip():
                    continue

                question_dict = self._extract_question_from_block(block)
                if question_dict:
                    questions.append(question_dict)

        return questions

    @classmethod
    def _normalize_answer(cls, raw: str) -> str:
        """Reduce an answer such as "b) Bob" to its option letter ("B").

        Text that does not start with an option letter is returned stripped
        and upper-cased, so nothing that was accepted before is rejected now.
        """
        cleaned = raw.strip().upper()
        match = cls._ANSWER_LETTER_RE.match(cleaned)
        return match.group(1) if match else cleaned

    def _extract_question_from_block(self, block: str) -> Optional[Dict]:
        """Parse one text block into a question dictionary.

        Returns:
            A dict with ``question``, ``choices``, ``answer``, ``explanation``
            and ``generated_at``, or ``None`` when the block lacks a question,
            exactly four choices, an answer or an explanation.
        """
        try:
            lines = [line.strip() for line in block.strip().split('\n') if line.strip()]
        except AttributeError as e:
            # Best effort: a non-string block is reported and skipped.
            print(f"Error parsing question block: {e}")
            return None

        question_text = ""
        choices = []
        answer = ""
        explanation = ""

        # Text before any explicit label is treated as the question itself.
        current_section = "question"

        for line in lines:
            lowered = line.lower()

            if lowered.startswith('question:'):
                current_section = "question"
                question_text = line[len('question:'):].strip()
            elif self._CHOICE_RE.match(line):
                current_section = "choices"
                choices.append(line)
            elif lowered.startswith('answer:'):
                current_section = "answer"
                answer = self._normalize_answer(line[len('answer:'):])
            elif lowered.startswith('explanation:'):
                current_section = "explanation"
                explanation = line[len('explanation:'):].strip()
            elif current_section == "question":
                # Continuation of a multi-line question.
                question_text = f"{question_text} {line}" if question_text else line
            elif current_section == "explanation":
                # Continuation of a multi-line explanation.
                explanation = f"{explanation} {line}" if explanation else line
            # Unlabelled lines after the choices or the answer are ignored.

        if question_text and len(choices) == 4 and answer and explanation:
            return {
                'question': question_text,
                'choices': choices,
                'answer': answer,
                'explanation': explanation,
                'generated_at': datetime.now().isoformat()
            }

        return None

    def generate_batch(self, num_questions: int = 5) -> List[Dict]:
        """Ask the model for new questions and add the parsed ones to the dataset.

        Up to three random existing questions are used as few-shot examples.
        A failing model call is reported and treated as an empty batch.

        Args:
            num_questions: Upper bound on the questions kept from this call.

        Returns:
            The questions that were added (possibly an empty list).
        """
        if num_questions < 1:
            return []

        # Select random sample questions for context
        sample_questions = random.sample(self.dataset, min(3, len(self.dataset)))
        prompt = self._create_generation_prompt(sample_questions, num_questions)

        new_questions = []
        try:
            outputs = self.model.generate([prompt], self.sampling_params)
            new_questions = self._parse_generated_questions(outputs)[:num_questions]
        except Exception as e:
            # Boundary with the inference backend: report and carry on.
            print(f"Generation failed: {e}")

        if new_questions:
            self.dataset.extend(new_questions)
            self.save_dataset()

        return new_questions

    def save_dataset(self):
        """Write the dataset plus metadata to both JSON files in ``output_dir``."""
        dataset_info = {
            'metadata': {
                'total_questions': len(self.dataset),
                'generated_at': datetime.now().isoformat(),
                'model_name': self.model_name
            },
            'questions': self.dataset
        }
        # Serialize once; both files receive identical content.
        payload = json.dumps(dataset_info, indent=2, ensure_ascii=False)

        filename = os.path.join(self.output_dir, "seating_questions.json")
        latest_filename = os.path.join(self.output_dir, "1000_latest_dataset.json")
        for path in (filename, latest_filename):
            with open(path, 'w', encoding='utf-8') as f:
                f.write(payload)

        print(f"Dataset saved to {filename}")

    def generate_dataset(self, target_size: int, batch_size: int = 5,
                         max_consecutive_failures: int = 5):
        """Generate batches until the dataset holds ``target_size`` questions.

        Args:
            target_size: Desired total number of questions.
            batch_size: Questions requested per model call (at least 1).
            max_consecutive_failures: Number of empty batches in a row after
                which generation gives up instead of looping forever.

        Returns:
            The dataset list, which may be shorter than ``target_size`` when
            generation was abandoned.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        print(f"Generating dataset to reach {target_size} questions...")

        failures = 0
        while len(self.dataset) < target_size:
            remaining = target_size - len(self.dataset)
            current_batch_size = min(batch_size, remaining)

            print(f"Progress: {len(self.dataset)}/{target_size}")
            new_questions = self.generate_batch(current_batch_size)

            if new_questions:
                failures = 0
                continue

            failures += 1
            if failures >= max_consecutive_failures:
                print("Warning: No new questions generated. Stopping.")
                break
            print(
                f"Warning: No new questions generated "
                f"(attempt {failures}/{max_consecutive_failures}). Retrying."
            )

        print(f"Dataset generation complete! Final size: {len(self.dataset)}")
        return self.dataset




In [None]:
def load_model(model_name="Qwen/Qwen2-VL-72B-Instruct", tensor_parallel_size=1):
    """Create the vLLM engine and the sampling parameters used for generation.

    Args:
        model_name: Model identifier or local path handed to ``LLM``.
        tensor_parallel_size: Value passed as ``tensor_parallel_size`` to ``LLM``.

    Returns:
        ``(llm, sampling_params)`` on success, or ``(None, None)`` when the
        engine could not be created; the error is printed, not raised.
    """
    print("Loading model...")

    try:
        llm = LLM(
            model=model_name,
            tensor_parallel_size=tensor_parallel_size,
            dtype="float16",
            gpu_memory_utilization=0.9,
            max_num_seqs=4,
        )

        sampling_params = SamplingParams(
            temperature=0.8,
            max_tokens=1024,
            top_p=0.9,
        )
    except Exception as e:
        # Start-up boundary: report any loading failure and let the caller
        # decide what to do with the (None, None) result.
        print(f"Error loading model: {e}")
        return None, None

    print("Model loaded successfully!")
    return llm, sampling_params


def get_initial_samples():
    """Return the three hand-written seed questions.

    Every entry is a dict with the keys ``question``, ``choices`` (four
    strings labelled "A)" to "D)"), ``answer`` and ``explanation``. A fresh
    list is built on every call.
    """
    circular_beverages = {
        "question": "Five friends from different countries sit circularly. The Italian sits opposite the tea-drinker. The Japanese sits two seats to the left of the coffee-drinker. The Brazilian drinks juice. The Chinese is adjacent to the American. Milk is drunk by someone adjacent to juice. Who drinks coffee?",
        "choices": ["A) American", "B) Chinese", "C) Japanese", "D) Brazilian"],
        "answer": "A",
        "explanation": "Brazilian (juice) has milk adjacent. Italian opposite tea. Japanese -> 2 seats left of coffee -> American must be coffee (Chinese adjacent to American, not conflicting with other constraints).",
    }

    circular_facing = {
        "question": "Eight people A, B, C, D, E, F, G, H sit around a circular table. Four face the center, and four face outward. A is third to the left of B, who faces the opposite direction of D. C (a doctor) sits adjacent to both E and F. G faces the center and is two seats to the left of H, who is not adjacent to B. If E faces outward, who is the engineer?",
        "choices": ["A) G", "B) H", "C) F", "D) D"],
        "answer": "B",
        "explanation": "H's position and facing direction (outward) are derived from clues. Since C is a doctor and professions aren't repeated, H must be the engineer.",
    }

    linear_professions = {
        "question": "Six people sit in a straight line facing north. The engineer sits at one end. The doctor is two seats away from the lawyer. The teacher sits immediately to the right of the scientist. The manager is not adjacent to the engineer. If the artist sits between the doctor and teacher, who sits at the left end?",
        "choices": ["A) Engineer", "B) Doctor", "C) Manager", "D) Scientist"],
        "answer": "A",
        "explanation": "Working through the constraints systematically: engineer must be at an end, and given the positioning requirements of other professions, the engineer sits at the left end.",
    }

    return [circular_beverages, circular_facing, linear_professions]


def main():
    """Load the model, seed the dataset and generate questions up to the target.

    Returns early, after printing a message, when the model cannot be loaded.
    """
    banner = "=" * 60
    print(banner)
    print("SEATING ARRANGEMENT QUESTION GENERATOR")
    print(banner)

    # Load model
    model, sampling_params = load_model()
    if not model:
        print("Failed to load model. Exiting.")
        return

    # Initialize generator
    generator = SeatingQuestionGenerator(model, sampling_params)

    # Load initial samples
    initial_samples = get_initial_samples()
    generator.initialize_dataset(initial_samples)

    # Generate more questions
    target_questions = 1000  # Adjust as needed
    generator.generate_dataset(target_questions, batch_size=3)

    print(banner)
    print("GENERATION COMPLETE!")
    print(f"Total questions: {len(generator.dataset)}")
    # Report the file the generator actually writes; "latest_dataset.json"
    # is never created.
    print(f"Saved to: {generator.output_dir}/1000_latest_dataset.json")


# Run the generator only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()

In [None]:
# import json
# import random
# from typing import List, Dict

# def create_generation_prompt(sample_questions: List[Dict], num_to_generate: int = 5) -> str:
#     """
#     Create a prompt for generating seating arrangement questions based on samples.
#     Returns a prompt that generates questions in proper JSON schema format.
#     """
    
#     # Format the sample questions
#     examples_text = ""
#     for i, q in enumerate(sample_questions, 1):
#         examples_text += f"""
# Example {i}:
# Question: {q['question']}
# Choices: {q.get('choices', ['A) Option 1', 'B) Option 2', 'C) Option 3', 'D) Option 4'])}
# Answer: {q['answer']}
# Explanation: {q['explanation']}
# """
    
#     prompt = f"""You are an expert at creating seating arrangement reasoning questions for competitive exams. 
# Here are {len(sample_questions)} example questions with their answers and explanations:
# {examples_text}

# Now generate {num_to_generate} NEW seating arrangement questions following these guidelines:

# QUESTION TYPES TO VARY:
# - Circular seating (5-12 people)
# - Linear seating (single row, 6-10 people)  
# - Parallel rows (2 rows, 4-8 people each)

# ELEMENTS TO INCLUDE (mix and match):
# - Nationalities (American, Brazilian, Chinese, Indian, Japanese, German, French, Italian)
# - Professions (Doctor, Engineer, Teacher, Lawyer, Manager, Scientist, Artist)
# - Beverages (Tea, Coffee, Juice, Milk, Water, Soda)
# - Facing directions (center/outward for circular, north/south for linear)
# - Colors (Red, Blue, Green, Yellow, Black, White)
# - Ages (use relative terms like "younger than", "older than")

# CONSTRAINTS TO USE:
# - Opposite positions
# - Adjacent positions  
# - Specific seat numbers/positions
# - "X seats to the left/right of Y"
# - "Between X and Y"
# - Directional facing requirements

# MANDATORY REQUIREMENTS:
# 1. Each question MUST have exactly 4 multiple choice options labeled A), B), C), D)
# 2. Questions must be solvable through logical deduction
# 3. Each question should be unique and test different reasoning patterns
# 4. Answer choices should be plausible and relevant to the question asked

# OUTPUT FORMAT - MUST be a valid JSON object with this exact structure:
# {{
#   "questions": [
#     {{
#       "question": "[Complete question text ending with a clear question]",
#       "choices": [
#         "A) [Option 1]",
#         "B) [Option 2]", 
#         "C) [Option 3]",
#         "D) [Option 4]"
#       ],
#       "answer": "[Single letter A, B, C, or D]",
#       "explanation": "[Step-by-step logical reasoning showing how to arrive at the answer]"
#     }}
#   ]
# }}

# CRITICAL: 
# - Output ONLY valid JSON - no commentary, no extra text
# - Ensure all JSON syntax is correct (proper quotes, commas, brackets)
# - Each question object must include all four fields: question, choices, answer, explanation
# - Choices must always be an array of exactly 4 strings starting with A), B), C), D)

# Generate exactly {num_to_generate} questions now:"""
    
#     return prompt

# # Initialize your sample dataset with proper structure
# initial_samples = [
    # {
    #     "question": "Five friends from different countries sit circularly. The Italian sits opposite the tea-drinker. The Japanese sits two seats to the left of the coffee-drinker. The Brazilian drinks juice. The Chinese is adjacent to the American. Milk is drunk by someone adjacent to juice. Who drinks coffee?",
    #     "choices": [
    #         "A) American",
    #         "B) Chinese", 
    #         "C) Japanese",
    #         "D) Brazilian"
    #     ],
    #     "answer": "A",
    #     "explanation": "Brazilian (juice) has milk adjacent. Italian opposite tea. Japanese -> 2 seats left of coffee -> American must be coffee (Chinese adjacent to American, not conflicting with other constraints)."
    # },
    # {
    #     "question": "Eight people A, B, C, D, E, F, G, H sit around a circular table. Four face the center, and four face outward. A is third to the left of B, who faces the opposite direction of D. C (a doctor) sits adjacent to both E and F. G faces the center and is two seats to the left of H, who is not adjacent to B. If E faces outward, who is the engineer?",
    #     "choices": [
    #         "A) G",
    #         "B) H",
    #         "C) F", 
    #         "D) D"
    #     ],
    #     "answer": "B",
    #     "explanation": "H's position and facing direction (outward) are derived from clues. Since C is a doctor and professions aren't repeated, H must be the engineer."
    # },
    # {
    #     "question": "Twelve people sit in two parallel rows of six each. Front row faces south, back row faces north. P is behind Q, who is third from the left end. R faces T, who is adjacent to S. U is two places to the right of V in the same row. If W is at an extreme end in the back row, who sits at the front row's extreme right?",
    #     "choices": [
    #         "A) S",
    #         "B) T",
    #         "C) U",
    #         "D) V"
    #     ],
    #     "answer": "A", 
    #     "explanation": "Q is third from left in front row; P is behind Q. R faces T (adjacent to S). U and V positions fix S at the front right."
    # }
# ]