In [4]:
!pip install -U transformers accelerate bitsandbytes



In [6]:
# -----------------------------
import os
import re
import json
from pathlib import Path
from typing import List, Dict, Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [7]:
# -----------------------------
# 1. File Readers (txt / pdf / docx)
# -----------------------------
def read_txt(path: str) -> str:
    """Return the whole content of a text file, decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are silently dropped
    (``errors='ignore'``) instead of raising a decoding error.
    """
    return Path(path).read_text(encoding='utf-8', errors='ignore')

def read_docx(path: str) -> str:
    """Return the text of a .docx file, one paragraph per block.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with a blank line between them.
    Only ``doc.paragraphs`` is read here.
    """
    from docx import Document

    kept = []
    for para in Document(path).paragraphs:
        if para.text.strip():
            kept.append(para.text)
    return "\n\n".join(kept)

def read_pdf_pdfplumber(path: str) -> str:
    """Extract the text of every page of a PDF with pdfplumber.

    A page for which ``extract_text()`` returns a falsy value contributes an
    empty string, so the number of joined segments equals the page count.
    Pages are separated by a blank line.
    """
    import pdfplumber

    with pdfplumber.open(path) as pdf:
        per_page = [(page.extract_text() or "") for page in pdf.pages]
    return "\n\n".join(per_page)

def read_pdf_pymupdf(path: str) -> str:
    """Extract the text of every page of a PDF with PyMuPDF.

    The document is opened as a context manager so the underlying file handle
    is released even if text extraction fails part-way through.

    Returns:
        The page texts joined with a blank line between pages.
    """
    import fitz  # PyMuPDF

    with fitz.open(path) as doc:
        return "\n\n".join(page.get_text() for page in doc)
    
def extract_text_from_path(path: str) -> str:
    """Read a document and return its text, choosing a reader by file extension.

    The path is made absolute first. ``.txt``/``.md`` go to ``read_txt``,
    ``.docx`` to ``read_docx``. For ``.pdf`` the pdfplumber reader is tried
    first and, if it raises anything, the PyMuPDF reader is used instead.

    Raises:
        ValueError: if the (lower-cased) extension is none of the above.
    """
    resolved = os.path.abspath(path)
    suffix = os.path.splitext(resolved)[1].lower()

    if suffix in ('.txt', '.md'):
        return read_txt(resolved)
    if suffix == '.docx':
        return read_docx(resolved)
    if suffix != '.pdf':
        raise ValueError(f"Unsupported file type: {suffix}")

    # PDF: best-effort fallback from pdfplumber to PyMuPDF on any failure.
    try:
        return read_pdf_pdfplumber(resolved)
    except Exception:
        return read_pdf_pymupdf(resolved)

# -----------------------------

In [8]:
# 2. Cleaning
# -----------------------------
def clean_extracted_text(text: str, remove_headfoot=True) -> str:
    """Normalise raw text extracted from a document.

    Steps, in order:
      1. Convert Windows (CRLF) and old-Mac (CR) line endings to LF.
      2. Collapse whitespace-only lines into a single blank line.
      3. If ``remove_headfoot`` is true, drop "Page N" lines and lines that
         contain only digits (typical page-number headers/footers).
      4. Re-join words hyphenated across a line break ("exam-" + "ple").
      5. Strip leading/trailing whitespace.

    Parameters:
        text: raw extracted text.
        remove_headfoot: whether to remove page-number style lines.
    Returns:
        The cleaned text.
    """
    # CRLF must be handled before the lone CR: replacing CR alone would turn
    # every "\r\n" into "\n\n" and make each line its own paragraph.
    txt = text.replace('\r\n', '\n').replace('\r', '\n')
    txt = re.sub(r'\n\s+\n', '\n\n', txt)
    if remove_headfoot:
        txt = re.sub(r'\nPage\s*\d+\s*\n', '\n', txt, flags=re.IGNORECASE)
        txt = re.sub(r'^\s*\d+\s*$', '', txt, flags=re.MULTILINE)
    txt = re.sub(r'(\w+)-\n(\w+)', r'\1\2', txt)
    return txt.strip()


In [9]:
# 3. Tokenizer abstraction
# -----------------------------
def get_tiktoken_encoder(model_name="gpt-4o-mini"):
    """Return the tiktoken encoding for ``model_name``, or ``None`` on any failure.

    Failure covers both tiktoken not being importable and
    ``encoding_for_model`` raising for the given model name.
    """
    try:
        import tiktoken
    except Exception:
        return None

    try:
        return tiktoken.encoding_for_model(model_name)
    except Exception:
        return None

def count_tokens(text: str, encoder=None) -> int:
    """Count the tokens in ``text``.

    With an encoder, the count is the length of ``encoder.encode(text)``.
    Without one, it is estimated as one token per four characters, with a
    floor of 1.
    """
    if encoder is None:
        estimate = len(text) // 4
        return estimate if estimate > 1 else 1
    return len(encoder.encode(text))

def simple_tokenize_words(text: str) -> List[str]:
    """Split ``text`` into maximal runs of non-whitespace characters."""
    return [match.group(0) for match in re.finditer(r"\S+", text)]

# -----------------------------


In [10]:
# 4. Chunking
# -----------------------------
def chunk_text_by_tokens(
    text: str,
    max_tokens: int = 900,
    overlap_tokens: int = 100,
    encoder=None
):
    """Split ``text`` into overlapping windows of at most ``max_tokens`` tokens.

    Parameters:
        text: the text to split.
        max_tokens: maximum number of tokens per chunk.
        overlap_tokens: number of tokens shared between consecutive chunks;
            must be smaller than ``max_tokens``.
        encoder: object with ``encode(str)`` and ``decode(tokens)`` methods.
            When ``None`` (for example because tiktoken is unavailable), a
            fallback is used where each token is one whitespace-delimited word
            together with its trailing whitespace. Leading whitespace of the
            text is not kept by the fallback.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``token_count``,
        ``token_start`` and ``token_end`` (end is exclusive). Empty input
        yields an empty list.
    """
    assert overlap_tokens < max_tokens, "overlap_tokens must be < max_tokens"

    if encoder is None:
        # Fallback tokenisation: keeping the trailing whitespace on each word
        # lets "".join() rebuild the original spacing and line breaks.
        tokens = re.findall(r"\S+\s*", text)
        decode = "".join
    else:
        tokens = encoder.encode(text)
        decode = encoder.decode

    n = len(tokens)

    chunks = []
    start = 0

    while start < n:
        end = min(start + max_tokens, n)

        chunks.append({
            "id": f"chunk_{len(chunks)}",
            "text": decode(tokens[start:end]),
            "token_count": end - start,
            "token_start": start,
            "token_end": end
        })

        # The last window reached the end of the text; stepping back by the
        # overlap here would loop forever on the tail.
        if end == n:
            break

        start = end - overlap_tokens

    return chunks
    
def process_file_to_chunks(path, max_tokens=900, overlap=100):
    """Read a document, clean its text and split it into token-bounded chunks.

    The text is extracted with ``extract_text_from_path``, cleaned with
    ``clean_extracted_text`` and chunked with ``chunk_text_by_tokens`` using
    the encoder returned by ``get_tiktoken_encoder("gpt-4o-mini")``.

    Each returned chunk dict additionally carries ``doc_path`` (the ``path``
    argument as given) and ``index`` (its position in the list).
    """
    cleaned = clean_extracted_text(extract_text_from_path(path))

    chunks = chunk_text_by_tokens(
        cleaned,
        max_tokens=max_tokens,
        overlap_tokens=overlap,
        encoder=get_tiktoken_encoder("gpt-4o-mini"),
    )

    for position, chunk in enumerate(chunks):
        chunk.update(doc_path=path, index=position)

    return chunks


# -----------------------------

In [11]:
import re
from typing import List, Dict

def clean_model_output(chunks: List[Dict], math_mode="plain") -> str:
    """Post-process model output chunks into one cleaned text.

    Parameters:
        chunks: list of dicts; the text is taken from the 'ideas' key, or
            from 'text' when 'ideas' is missing or empty.
        math_mode: "plain" converts common LaTeX to human-readable text;
            any other value (e.g. "latex") leaves the LaTeX untouched.
    Returns:
        str - the cleaned chunk texts joined by a blank line.
    """
    # LaTeX command name -> plain replacement. Unicode escapes are used so the
    # symbols do not depend on the encoding this file is saved in.
    symbol_map = {
        'sigma': '\u03c3',   # Greek small sigma
        'omega': '\u03c9',   # Greek small omega
        'theta': '\u03b8',   # Greek small theta
        'pi': '\u03c0',      # Greek small pi
        'cos': 'cos',
        'sin': 'sin',
        'sqrt': '\u221a',    # square-root sign
        'frac': '/',         # basic fraction replacement
    }
    # One or two backslashes (two covers double-escaped output) followed by a
    # known command name. The lookahead stops '\pi' from matching inside a
    # longer command such as '\pitchfork'.
    command_re = re.compile(
        r'\\{1,2}(' + '|'.join(symbol_map) + r')(?![A-Za-z])'
    )

    all_texts = []
    for chunk in chunks:
        text = chunk.get("ideas") or chunk.get("text") or ""

        # 1. Unescape common escaped chars
        text = text.replace(r'\"', '"')
        text = text.replace(r'\\n', '\n')
        # Only treat backslash-t as an escaped tab when no letter follows;
        # otherwise LaTeX commands like \theta, \times or \tan get destroyed.
        text = re.sub(r'\\t(?![A-Za-z])', '    ', text)

        # 2. Handle LaTeX math
        if math_mode == "plain":
            # remove \( \) and $$ $$ delimiters
            text = re.sub(r'\\\(|\\\)', '', text)
            text = re.sub(r'\$\$', '', text)

            # convert common LaTeX symbols to unicode / plain names
            text = command_re.sub(lambda m: symbol_map[m.group(1)], text)

            # Convert exponents like ^{...} to ^(...) for readability
            text = re.sub(r'\^\{([^}]*)\}', r'^(\1)', text)

            # Add spaces around operators if missing
            text = re.sub(r'(\w)([+\-*/^])', r'\1 \2', text)
            text = re.sub(r'([+\-*/^])(\w)', r'\1 \2', text)

        # 3. Normalize whitespace
        text = re.sub(r'[ \t]+', ' ', text)       # runs of spaces/tabs -> one space
        text = re.sub(r'\n\s*\n', '\n', text)     # blank lines -> single newline
        text = text.strip()

        all_texts.append(text)

    # 4. Combine all chunks with a double newline
    return "\n\n".join(all_texts)


In [12]:
# Hugging Face model id used for the local idea-extraction stage.
MODEL_ID = "Qwen/Qwen2-7B-Instruct"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Module-level tokenizer; process_text_local reads this global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Module-level quantized model; process_text_local reads this global.
# NOTE(review): device_map="auto" presumably lets accelerate pick the device
# placement — confirm a GPU is actually used on the target runtime.
model1 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto"
)




tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

2025-12-29 10:44:15.838040: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767005056.023900      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767005056.090592      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767005056.547049      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767005056.547078      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767005056.547081      55 computation_placer.cc:177] computation placer alr

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [14]:
# Prompt for the local idea-extraction model. process_text_local fills the
# single {text} placeholder via str.format.
MULTI_TASK_PROMPT_TEMPLATE = """You are an assistant that extracts core mathematical or logical ideas.

Task:
Read the text and extract distinct, atomic ideas.
Each idea must represent one clear concept, rule, or definition.

Rules:
- Output only bullet points.
- Each bullet point must start with "- ".
- Preserve symbols and equations exactly as written.
- Do not explain, expand, or add examples.
- Do not repeat ideas.
- Do not add introductions, conclusions, or filler text.

Text:
"{text}"

Bullet points:


"""
def process_text_local(text: str, max_new_tokens: int = 450):
    """Run the local model on one text chunk and return its generated bullet points.

    The chunk is inserted into ``MULTI_TASK_PROMPT_TEMPLATE``, tokenized with
    the module-level ``tokenizer`` (truncated to 1024 tokens), and passed to
    ``model1.generate`` with sampling enabled. Only the newly generated
    tokens (everything after the prompt) are decoded and returned, stripped.

    Parameters:
        text: the chunk to extract ideas from.
        max_new_tokens: upper bound on generated tokens.
    """
    # NOTE(review): truncation to 1024 tokens cuts from the end of the prompt,
    # which is where the "Bullet points:" cue sits — confirm chunks plus the
    # template stay under this limit with this tokenizer.
    encoded = tokenizer(
        MULTI_TASK_PROMPT_TEMPLATE.format(text=text),
        return_tensors="pt",
        truncation=True,
        max_length=1024
    )
    model_inputs = {name: tensor.to(model1.device) for name, tensor in encoded.items()}
    prompt_length = model_inputs["input_ids"].shape[-1]

    with torch.no_grad():
        generated = model1.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.3,
            top_p=0.9,
            do_sample=True,
            use_cache=True
        )

    # Drop the echoed prompt tokens so only the completion is decoded.
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()


In [None]:
import google.generativeai as genai

# Read the Gemini API key from the GOOGLE_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Module-level Gemini model; generate_questions_with_gemini reads this global.
model2 = genai.GenerativeModel("models/gemini-2.5-flash")
GEMINI_MULTI_TASK_PROMPT = """
You are an expert educational content generator. Your job is to create high-quality questions 
from the provided ideas or facts. Each question should match the user's selected task mode 
and difficulty level.

### Modes:
You are an intelligent educational content generator.

Your task is to generate high-quality educational outputs based strictly on the given ideas.
Use controlled creativity while remaining faithful to the topic.

---

### Supported Modes

1. **mcq** ‚Äì Multiple-choice questions (MCQs)
   - Generate 1‚Äì2 MCQs per idea (use judgment based on idea complexity).
   - Each MCQ must include:
       - question
       - 4 options (A, B, C, D)
       - correct_answer
       - brief explanation (1‚Äì2 sentences max)
   - Questions should be concise and focused (avoid unnecessary wording).
   - Output strictly in JSON:
     [
       {{
         "question": "...",
         "options": {{ "A": "...", "B": "...", "C": "...", "D": "..." }},
         "correct_answer": "A",
         "explanation": "..."
       }}
     ]

2. **qa** ‚Äì Question‚ÄìAnswer pairs
   - Generate 1‚Äì2 questions per idea if meaningful.
   - Questions must be clear and specific.
   - Answers must be direct, factual, and concise.
   - Keep question length proportional to answer length.
   - Do NOT include explanations.
   - Output JSON:
     [
       {{
         "question": "...",
         "answer": "..."
       }}
     ]

3. **true_false** ‚Äì True/False statements
   - Generate at least one statement per idea.
   - Randomly vary between True and False.
   - Statements must be short, precise, and unambiguous (avoid long compound sentences).
   - Output JSON:
     [
       {{
         "statement": "...",
         "answer": "True"
       }}
     ]

4. **fill_blank** ‚Äì Fill-in-the-blank
   - Generate 1‚Äì2 questions per idea.
   - Hide a key term, symbol, formula component, or concept.
   - The blank should target an essential element, not trivial words.
   - Output JSON:
     [
       {{
         "question": "... ____ ...",
         "answer": "..."
       }}
     ]

5. **summary** ‚Äì Conceptual summary
   - Produce a concise academic summary of the given ideas.
   - Length: a short paragraph (max 120 words).
   - No examples, no extra explanations.
   - Output JSON:
     {{
       "summary": "..."
     }}

---

### Difficulty Level (Bloom‚Äôs Taxonomy)

- **Easy:** Recall, definitions, direct facts, simple relationships.
- **Medium:** Application, comparison, moderate reasoning.
- **Hard:** Multi-step reasoning, synthesis of ideas, deeper conceptual understanding.

---

### Global Rules

- Align all outputs strictly with the given **mode** and **difficulty level**.
- Preserve all mathematical notation, symbols, and formulas **exactly as given**.
- Maintain balance:
  - Do not generate overly long questions with very short answers.
  - Prefer clarity over verbosity.
- Use creativity only to improve question quality, not to add new concepts.
- Output **valid JSON only** ‚Äî no extra text before or after.

---

Mode: {mode}
Difficulty: {difficulty_level}

Ideas:
{ideas}

"""


def generate_questions_with_gemini(ideas: str, mode: str, difficulty: str, max_output_tokens=5000):
    """Ask Gemini to turn ``ideas`` into questions (or a summary) and return the raw reply text.

    Parameters:
        ideas: the ideas text inserted into the prompt.
        mode: value for the prompt's ``{mode}`` placeholder.
        difficulty: value for the prompt's ``{difficulty_level}`` placeholder.
        max_output_tokens: generation limit passed to the API.
    Returns:
        ``response.text`` with surrounding whitespace stripped.
    """
    filled_prompt = GEMINI_MULTI_TASK_PROMPT.format(
        mode=mode,
        difficulty_level=difficulty,
        ideas=ideas,
    )
    sampling = genai.GenerationConfig(
        max_output_tokens=max_output_tokens,
        temperature=0.8,   # creativity
        top_p=0.9,
    )
    reply = model2.generate_content(filled_prompt, generation_config=sampling)
    return reply.text.strip()


In [16]:
def chunk_text_by_rules(
    text: str,
    max_chars=4000,
    max_lines=25
):
    """
    Splits text into chunks using character and line limits.

    Blank lines are dropped and every remaining line is stripped. Lines are
    accumulated until adding the next one would exceed ``max_chars``
    (newline separators are not counted) or the chunk already holds
    ``max_lines`` lines; the chunk is then closed and a new one started.

    A single line longer than ``max_chars`` is kept whole as its own chunk.
    No empty chunks are ever produced.

    Returns:
        list of str - each chunk is its lines joined with a newline.
    """
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    chunks = []

    current_chunk = []
    current_chars = 0

    for line in lines:
        line_len = len(line)
        limit_hit = (
            len(current_chunk) >= max_lines
            or current_chars + line_len > max_chars
        )

        # Flush only when something has been collected; otherwise an
        # over-long first line would emit an empty chunk.
        if current_chunk and limit_hit:
            chunks.append("\n".join(current_chunk))
            current_chunk = []
            current_chars = 0

        current_chunk.append(line)
        current_chars += line_len

    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks


In [23]:
import json
import re

def clean_gemini_output(raw_text: str) -> str:
    """
    Minimal cleanup of a Gemini reply that is expected to contain JSON:
    - Remove a surrounding markdown code fence (with or without a json tag)
    - Parse the remaining text as JSON
    - For a JSON list, keep only the items that are objects (dicts)

    The text is parsed as-is first. Escaped newlines and quotes are legitimate
    inside JSON strings, so they are only unescaped as a second attempt, for
    replies where the whole payload arrived escaped.

    Returns:
        Pretty-printed JSON (indent 2, non-ASCII kept) on success; otherwise
        the stripped original text, unchanged (no guessing).
    """
    # 0. Remove Markdown code fences
    text = raw_text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```$", "", text)

    # 1. Candidates to parse: untouched text first, then the unescaped variant.
    candidates = (
        text,
        text.replace("\\n", "\n").replace('\\"', '"'),
    )

    # 2. Try loading JSON safely
    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        break
    else:
        # If JSON is broken, return original (do NOT guess)
        return raw_text.strip()

    # 3. In a list, drop anything that is not a JSON object
    if isinstance(data, list):
        data = [item for item in data if isinstance(item, dict)]

    # 4. Dump clean JSON
    return json.dumps(data, indent=2, ensure_ascii=False)


In [19]:
def process_ideas_to_questions(
    ideas_txt: str,
    mode: str,
    difficulty_level: str,
    output_json: str = None
):
    """Generate questions from an ideas file with Gemini and save them as JSON.

    The ideas file is split with ``chunk_text_by_rules``; each chunk is sent
    to ``generate_questions_with_gemini`` and the reply is cleaned with
    ``clean_gemini_output``. Chunks whose reply is not valid JSON are skipped.
    List replies are concatenated; any other JSON value is appended as one item.

    Parameters:
        ideas_txt: path of the ideas text file to read.
        mode: generation mode passed to the prompt.
        difficulty_level: difficulty passed to the prompt.
        output_json: where to write the result. Defaults to the ideas path
            with its extension replaced by ``_<mode>_<difficulty>.json``.
    Returns:
        The path of the JSON file that was written.
    """
    # Default output file name. Built from the path without its extension so
    # it can never equal ideas_txt (which a plain ".txt" string replace would
    # return unchanged for e.g. "ideas.md", overwriting the input).
    if output_json is None:
        base, _ext = os.path.splitext(ideas_txt)
        output_json = f"{base}_{mode}_{difficulty_level}.json"

    # Read full ideas text
    with open(ideas_txt, "r", encoding="utf-8") as f:
        full_text = f.read()

    # Chunk text to control tokens
    chunks = chunk_text_by_rules(full_text)
    print(f"üîπ Total chunks: {len(chunks)}")

    all_results = []  # combined output of all chunks

    for idx, chunk in enumerate(chunks, start=1):
        print(f"‚û°Ô∏è Processing chunk {idx}/{len(chunks)}")

        raw_response = generate_questions_with_gemini(
            ideas=chunk,
            mode=mode,
            difficulty=difficulty_level
        )

        # Minimal cleaning
        clean_response = clean_gemini_output(raw_response)

        # Parse JSON safely
        try:
            parsed = json.loads(clean_response)
        except json.JSONDecodeError:
            print(f"‚ö†Ô∏è Skipping chunk {idx}: invalid JSON")
            continue

        # Combine results
        if isinstance(parsed, list):
            all_results.extend(parsed)
        else:
            all_results.append(parsed)

    # Save combined output
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print(f"‚úÖ Done! {len(all_results)} items saved to {output_json}")

    return output_json


In [20]:
def process_transcript_to_ideas(
    path: str,
    out_txt: str = "ideas.txt"
):
    """Extract ideas from a document with the local model and write them to a text file.

    The document is chunked with ``process_file_to_chunks`` (900 tokens,
    100 overlap); each chunk is passed to ``process_text_local``; the
    collected outputs are cleaned with ``clean_model_output`` in "plain"
    math mode and written to ``out_txt`` as UTF-8. Nothing is returned.
    """
    results = []
    for chunk in process_file_to_chunks(path, max_tokens=900, overlap=100):
        print(f"Processing {chunk['id']} ({chunk['token_count']} tokens)...")
        results.append({
            "id": chunk["id"],
            "ideas": process_text_local(chunk["text"]),
        })

    cleaned = clean_model_output(results, math_mode="plain")

    with open(out_txt, "w", encoding="utf-8") as out_file:
        out_file.write(cleaned)

    print(f"‚úÖ Ideas saved to {out_txt}")




In [25]:
#GUI
import gradio as gr

def run_pipeline(text_input, file_obj, mode, difficulty):
    """Gradio callback: run both pipeline stages on pasted text or an uploaded file.

    Parameters:
        text_input: text from the textbox; used only when no file is given.
            ``None`` is treated as empty.
        file_obj: uploaded file, either an object with a ``.name`` path
            attribute or a plain path; takes precedence over ``text_input``.
        mode: question mode forwarded to ``process_ideas_to_questions``.
        difficulty: difficulty forwarded to ``process_ideas_to_questions``.
    Returns:
        Pretty-printed JSON of the generated items, or a warning message
        string when there is no input or a stage produced no file.
    """
    input_path = "/kaggle/working/input.txt"
    ideas_path = "/kaggle/working/ideas.txt"

    # -----------------------------
    # Step 1: Load text
    # -----------------------------
    if file_obj:
        # Accept both file-like upload objects (.name) and plain path strings.
        upload_path = getattr(file_obj, "name", file_obj)
        text = Path(upload_path).read_text(encoding="utf-8")
    else:
        text = text_input or ""

    if not text.strip():
        return "‚ö†Ô∏è No input text provided."

    # Save raw input
    with open(input_path, "w", encoding="utf-8") as f:
        f.write(text)

    # -----------------------------
    # Step 2: Stage-1 -> Ideas / Summary
    # -----------------------------
    process_transcript_to_ideas(
        path=input_path,
        out_txt=ideas_path
    )

    # Check the file that was actually written, not a relative "ideas.txt"
    # that only coincides with it when the working directory matches.
    if not Path(ideas_path).exists():
        return "‚ö†Ô∏è Failed to generate ideas."

    # -----------------------------
    # Step 3: Stage-2 -> Questions
    # -----------------------------
    output_file = process_ideas_to_questions(
        ideas_txt=ideas_path,
        mode=mode,
        difficulty_level=difficulty
    )

    # -----------------------------
    # Step 4: Load output
    # -----------------------------
    if not Path(output_file).exists():
        return "‚ö†Ô∏è No output generated."

    with open(output_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    return json.dumps(data, indent=2, ensure_ascii=False)

# Gradio UI: the four inputs map, in order, to run_pipeline's
# (text_input, file_obj, mode, difficulty) parameters.
iface = gr.Interface(
    fn=run_pipeline,
    inputs=[
        gr.Textbox(lines=10, label="Input Text"),
        gr.File(file_types=[".txt"], label="Upload Text File"),
        gr.Dropdown(["mcq","qa","true_false","fill_blank","summary"], label="Mode"),
        gr.Dropdown(["easy","medium","hard"], label="Difficulty")
    ],
    outputs=gr.Textbox(lines=20, label="Output (JSON)"),
    title="AI Question Generator",
    description="Two-stage LLM pipeline: Ideas ‚Üí Questions"
)

# Start the web app.
iface.launch()
 
           

* Running on local URL:  http://127.0.0.1:7862
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://57486b9ba23113dda2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/fastapi/applications.py", line 1133, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py",

In [None]:
#CLI
# 7. Pipeline Runner
# -----------------------------

# if __name__ == "__main__":
#     # üß† Step 0 ‚Üí User Input
#     mode = input("Enter mode (summary / ideas / qa / mcq / true_false / fill_blank): ").strip().lower()


#     # File to process
#     input_path = r"/kaggle/input/documents/transcript_2.txt"
    
#     if mode in ["mcq", "true_false", "fill_blank","qa","summary"]:
#           # Optional: difficulty for modes that need it
#         difficulty_level = input("Enter difficulty (easy / medium / hard): ").strip().lower()
#         #process_transcript_to_ideas(input_path)
#         process_ideas_to_questions("/kaggle/working/ideas.txt", difficulty_level=difficulty_level, mode=mode)

#     elif mode == "ideas":
#         # QA-prep may just save Phi2 results or call a different function
#         process_transcript_to_ideas(input_path)
#         print("‚úÖ ideas has been generated and  saved in ideas.txt.")

#     else:
#         print(f"‚ö†Ô∏è Unknown mode: {mode}")
    
                                                    

In [None]:
import requests

# List the Gemini models available to this API key and the generation
# methods each one supports.

# Read the key from the environment so it is never saved with the notebook.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")
url = "https://generativelanguage.googleapis.com/v1beta/models"

headers = {
    "Content-Type": "application/json",
    "x-goog-api-key": API_KEY
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Surface HTTP errors (e.g. a missing or invalid key) instead of silently
# printing an empty model list.
response.raise_for_status()
data = response.json()

for model in data.get("models", []):
    print(
        model["name"],
        "‚Üí",
        model.get("supportedGenerationMethods", [])
    )


models/embedding-gecko-001 ‚Üí ['embedText', 'countTextTokens']
models/gemini-2.5-flash ‚Üí ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.5-pro ‚Üí ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-exp ‚Üí ['generateContent', 'countTokens', 'bidiGenerateContent']
models/gemini-2.0-flash ‚Üí ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-001 ‚Üí ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-exp-image-generation ‚Üí ['generateContent', 'countTokens', 'bidiGenerateContent']
models/gemini-2.0-flash-lite-001 ‚Üí ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-lite ‚Üí ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
models/gemini-2.0-flash-lite-preview-02-05 ‚Üí ['generateConten