<a href="https://colab.research.google.com/github/Pakeetharan/ai-study-guide/blob/main/Study_Guide_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📚 AI Study Guide Generator
**Turn your lecture slides and textbooks into professional exam notes and practice questions.**

### **How to use this tool:**
1. **Check Settings:** Go to top menu `Runtime` -> `Change runtime type` and ensure **T4 GPU** is selected.
2. **Initialize:** Click the **Play** button on **Step 1** below. Wait for it to say "System Ready" (~2 mins).
3. **Upload & Run:** Click the **Play** button on **Step 2**.
    * You will be asked to connect to **Google Drive** (this is to safely save your final PDF).
    * Click **"Choose Files"** to upload your PDFs. You can upload multiple files (e.g., *Week1.pdf, Week2.pdf*) at once.
4. **Get Results:** The AI will analyze each document separately and save a `Study_Guide_TIMESTAMP.pdf` into your Google Drive folder: `My Drive > AI_Study_Notes`.

---
**💡 Pro Tip:** Upload separate PDF files for each lecture topic instead of merging them. This helps the AI generate specific practice questions for every single topic.

In [None]:
# @title 🚀 Step 1: Initialize System
# @markdown Installs the AI engine, OCR tools, and PDF processors. It takes about **2 minutes**.
# @markdown You only need to run this once per session.

import os, sys, subprocess
import logging, warnings

# Memory Config
# The allocator option is exported before `torch` is imported further down in
# this cell, so it is already in the environment when CUDA memory is first used.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Silence pdfminer's logger and Python warnings to keep the cell output short.
logging.getLogger("pdfminer").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")


def _run_quietly(command, description):
    """Run an installer command with its normal output hidden.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell).
        description: Human-readable name of the step, used in the warning.

    Returns:
        True when the command exited with status 0, False otherwise.

    A failing step does not abort the cell (installation stays best-effort),
    but its exit code and the last lines of stderr are printed so a broken
    install is visible here rather than as a later ImportError.
    """
    result = subprocess.run(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
    if result.returncode != 0:
        print(f"   ‚ö†Ô∏è {description} failed (exit code {result.returncode}).")
        stderr_tail = "\n".join(result.stderr.strip().splitlines()[-5:])
        if stderr_tail:
            print(stderr_tail)
    return result.returncode == 0


print("‚è≥ Installing System Dependencies...")
_run_quietly(["apt-get", "update"], "apt-get update")
_run_quietly(
    ["apt-get", "install", "-y", "tesseract-ocr", "poppler-utils",
     "libcairo2", "libpango-1.0-0", "libgdk-pixbuf2.0-0", "libffi-dev",
     "fonts-roboto", "fonts-liberation"],
    "apt-get install",
)

pkgs = [
    "transformers", "accelerate", "bitsandbytes", "langchain-huggingface",
    "langchain-text-splitters", "langchain-community", "langchain-core",
    "pdfplumber", "pdf2image", "pytesseract", "markdown", "weasyprint",
    "tiktoken", "tqdm", "numpy"
]
# Use the running interpreter's pip so packages land in this kernel's environment.
_run_quietly([sys.executable, "-m", "pip", "install"] + pkgs, "pip install")

import torch
import pdfplumber
import pytesseract
import markdown
import re
from datetime import datetime
from tqdm import tqdm
from pdf2image import convert_from_path
from google.colab import files, drive
from weasyprint import HTML, CSS
from weasyprint.text.fonts import FontConfiguration
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate

print("‚è≥ Loading AI Model...")
model_id = "NousResearch/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit NF4 quantization with double quantization; matrix math runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True
)

# Smart Offloading Configuration
# device_map="auto" places the weights automatically within the max_memory
# budgets below (GPU 0 and CPU); "offload" is the folder given for offloaded weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    max_memory={0: "13GB", "cpu": "24GB"},
    offload_folder="offload"
)

# Generation settings are passed as direct keyword arguments, the same way
# max_new_tokens is. The temperature used to sit in `model_kwargs`, which the
# transformers docs describe as arguments for `from_pretrained`; with an
# already-loaded model that call never happens, so the value was not applied.
# Temperature only takes effect when sampling is enabled, hence do_sample=True.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.3,
    return_full_text=False,  # return only the newly generated text, not the prompt
)
# LangChain wrapper around the same pipeline (the Step 2 cell calls `pipe` directly).
llm = HuggingFacePipeline(pipeline=pipe)

print("‚úÖ System Ready.")

In [None]:
# @title 📂 Step 2: Upload Files & Generate Guide
# @markdown **Instructions:**
# @markdown 1. Run this cell to connect to Drive.
# @markdown 2. Upload your PDFs when the button appears.
# @markdown 3. The AI will process each file and save the result to `My Drive > AI_Study_Notes`.

import os
from google.colab import drive, files
import gc
import torch

# --- 1. Connect Drive ---
print("üîå Checking Google Drive connection...")
# Mount only when the mount point is not already present on disk.
_drive_present = os.path.exists('/content/drive')
if not _drive_present:
    drive.mount('/content/drive')

# Folder inside the mounted Drive that receives the generated PDFs.
output_folder = "/content/drive/My Drive/AI_Study_Notes"
_folder_present = os.path.exists(output_folder)
if not _folder_present:
    os.makedirs(output_folder)

# --- 2. Configuration ---
# Colab form toggle: when False, run_pipeline skips the practice-exam step.
generate_exercises = True # @param {type:"boolean"}
# Passed as chunk_size to the tokenizer-based text splitter in run_pipeline,
# and also used there as a character count when sampling text for the exam prompt.
OPTIMAL_CHUNK_SIZE = 6500
# Overlap between consecutive chunks handed to the same splitter.
CHUNK_OVERLAP = 200

# --- 3. Helper Functions ---
def clean_text_formatting(text):
    """Tidy common formatting mistakes in model-generated Markdown.

    Two clean-ups are applied:

    1. If the response starts with a code fence and contains neither
       ``"def "`` nor ``"="`` (a rough signal for real code or math), every
       fence marker is removed so plain text is not rendered as code.
    2. Lines that start with ``<digits>.`` and are longer than five
       characters are renumbered sequentially (1., 2., 3., ...), which
       repairs lists the model numbered ``1. 1. 1.``. Leading indentation is
       kept as is. Lines that start with anything else (for example the
       ``> 1.`` answer-key lines) are left untouched.

    Args:
        text: Markdown text returned by the model.

    Returns:
        The cleaned Markdown text.
    """
    # Remove ``` block if it wraps the entire response or simple text
    # But keep it if it looks like actual code/math
    if text.strip().startswith("```") and "def " not in text and "=" not in text:
        text = text.replace("```markdown", "").replace("```", "")

    # Capture the indentation so the number is replaced on indented lines too.
    # Previously the match ran on the stripped line while the substitution ran
    # on the raw one, so indented items kept their old number but still used
    # up a counter value.
    numbered_item = re.compile(r'^(\s*)\d+\.')

    renumbered_lines = []
    question_count = 1
    for line in text.split('\n'):
        # Very short lines are not treated as list items.
        if len(line) > 5 and numbered_item.match(line):
            line = numbered_item.sub(rf'\g<1>{question_count}.', line, count=1)
            question_count += 1
        renumbered_lines.append(line)

    return '\n'.join(renumbered_lines)

def extract_text_from_file(filename):
    """Return the text of a PDF, adding OCR output when little text is found.

    The embedded text layer of every page is read with pdfplumber first. When
    fewer than 500 characters were collected, each page is rendered to an
    image and its pytesseract OCR output is appended to whatever was found.

    Any exception is reported on stdout rather than raised; the text gathered
    up to that point (possibly an empty string) is returned.
    """
    pieces = []
    try:
        with pdfplumber.open(filename) as document:
            for page in document.pages:
                page_text = page.extract_text()
                if page_text:
                    pieces.append(page_text + "\n")

        collected_length = sum(len(piece) for piece in pieces)
        if collected_length < 500:
            print(f"   ‚ö†Ô∏è Scanned content detected in {filename}. Running OCR...")
            page_images = convert_from_path(filename)
            for page_image in page_images:
                pieces.append(pytesseract.image_to_string(page_image) + "\n")
    except Exception as e:
        print(f"   ‚ùå Error reading {filename}: {e}")
    return "".join(pieces)

def _generate_reply(user_prompt):
    """Send one user message through the chat template and return the reply text."""
    messages = [{"role": "user", "content": user_prompt}]
    fmt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Rely on smart offloading for memory safety
    res = pipe(fmt, max_new_tokens=1500, pad_token_id=tokenizer.eos_token_id)[0]["generated_text"]
    # The earlier code cut the reply at every occurrence of the word
    # "assistant", which discarded real content that merely mentions it.
    # Presumably the split was meant to drop a leaked chat role header
    # (NOTE(review): confirm), so only "assistant" followed by a blank line is
    # treated as such a header; text after the last one is kept.
    return res.rpartition("assistant\n\n")[2].strip()


def _render_pdf(markdown_text, destination):
    """Convert Markdown to HTML and write it to *destination* as a styled A4 PDF."""
    html_content = markdown.markdown(markdown_text, extensions=['extra', 'codehilite', 'tables', 'fenced_code'])

    # CSS: Academic Standard
    css = CSS(string="""
        @page { size: A4; margin: 2.5cm; }
        body {
            font-family: 'Roboto', 'Helvetica', sans-serif;
            font-size: 11pt;
            line-height: 1.6;
            color: #333;
        }

        /* Headers - Consistent Styling */
        h1 {
            color: #2c3e50;
            border-bottom: 2px solid #2c3e50;
            padding-bottom: 10px;
            margin-top: 40px;
            font-size: 20pt;
        }
        h2 {
            color: #2980b9;
            margin-top: 30px;
            font-size: 16pt;
            border-left: 4px solid #2980b9;
            padding-left: 10px;
        }
        h3 {
            color: #e67e22;
            font-size: 13pt;
            margin-top: 20px;
        }

        /* Components */
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            font-size: 10pt;
        }
        th { background-color: #ecf0f1; color: #2c3e50; padding: 8px; border: 1px solid #bdc3c7; }
        td { border: 1px solid #bdc3c7; padding: 8px; }

        pre {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            border: 1px solid #dee2e6;
            font-family: 'Courier New', monospace;
            font-size: 9pt;
        }

        blockquote {
            background: #f0f8ff;
            border-left: 4px solid #3498db;
            margin: 15px 0;
            padding: 10px 15px;
            color: #555;
            font-style: italic;
        }

        li { margin-bottom: 5px; }
        .page-break { page-break-after: always; }
    """)

    HTML(string=html_content, base_url='.').write_pdf(destination, stylesheets=[css])


def run_pipeline():
    """Upload PDFs, generate notes (and optionally exams), and save one PDF.

    Steps:
        1. Ask the user to upload files through the Colab upload widget.
        2. For each file: extract its text, split it into chunks, and ask the
           model for study notes on every chunk.
        3. If the module-level ``generate_exercises`` flag is set, ask the
           model for a practice exam based on a slice of the file's text.
        4. Combine everything into one Markdown document and render it to
           ``Study_Guide_<timestamp>.pdf`` inside ``output_folder``.

    Relies on names defined by the surrounding notebook cells: ``tokenizer``,
    ``pipe``, ``generate_exercises``, ``OPTIMAL_CHUNK_SIZE``, ``CHUNK_OVERLAP``
    and ``output_folder``.

    Returns:
        None. Progress and errors are printed. A failure on one chunk or one
        exam is reported and skipped so the remaining work still completes.
    """
    print("\n" + "="*40)
    print("   ‚¨áÔ∏è  CLICK THE BUTTON BELOW TO UPLOAD  ‚¨áÔ∏è")
    print("="*40)
    uploaded = files.upload()

    if not uploaded:
        print("‚ùå No files uploaded.")
        return

    # The splitter and the prompt template do not depend on the file, so they
    # are built once instead of once per uploaded file.
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, chunk_size=OPTIMAL_CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    note_prompt = PromptTemplate.from_template(
        """
            You are an expert Professor creating a comprehensive Study Guide. Analyze the following text:
            "{text}"

            ### 1. Analyze the Content Type
            Determine the best structure for these notes based on the content:
            - **If History/Narrative:** Use Timelines or Sequence flows.
            - **If Technical/Scientific:** Use Definitions, Formulas, and Process Steps.
            - **If Comparative:** Use Comparison Tables (e.g., X vs Y).
            - **If Code/Programming:** Use Syntax Blocks and Explanations.

            ### 2. Generate the Notes (Strict Visual Styling)
            Produce the study guide using the adaptive structure you chose above. You MUST follow these visual rules:

            * **Titles:** Use `# Main Topic` for the document title (only once).
            * **Headers:** Use `## Section Header` for major sections and `### Sub-header` for subsections.
            * **Key Terms:** Always **bold** important terms when first defined.
            * **Lists:** Use standard bullet points (`- `) or numbered lists (`1. `) where appropriate.
            * **Tables:** If comparing items or listing data, YOU MUST use a Markdown table.
            * **Code/Math:** Use fenced code blocks (```) for all formulas, equations, or code snippets.
            * **Callouts:** Use Blockquotes (`>`) for critical warnings, important notes, or "Remember This" tips.

            ### 3. Final Summary
            End with a `### Summary` section containing 3-5 high-level takeaways.

            **Output Requirement:** Return ONLY the formatted Markdown. No conversational filler.
            """
    )

    all_notes = []
    all_exercises = []

    for i, filename in enumerate(uploaded, start=1):
        print(f"\nüöÄ Processing File {i}/{len(uploaded)}: {filename}...")

        raw_text = extract_text_from_file(filename)
        if not raw_text.strip():
            print(f"   ‚ö†Ô∏è No readable text found in {filename}. Skipping.")
            continue

        docs = splitter.create_documents([raw_text])

        # --- Generate Academic Notes ---
        print(f"   üìù Generating Academic Notes ({len(docs)} sections)...")
        note_sections = [f"# Module: {filename}\n"]

        for doc in tqdm(docs, desc="   > Analyzing", leave=False):
            try:
                reply = _generate_reply(note_prompt.format(text=doc.page_content))
            except Exception as e:
                # Report the failed chunk instead of dropping it silently;
                # tqdm.write keeps the progress bar intact.
                tqdm.write(f"   ‚ùå Error generating notes for a section: {e}")
                continue
            note_sections.append(f"\n{reply}\n")

        all_notes.append("".join(note_sections) + "\n\n<div class='page-break'></div>\n\n")

        if generate_exercises:
            print("   üß† Designing Exam Questions...")
            # Sample the exam context starting one third of the way into the text.
            mid = len(raw_text) // 3
            sample_context = raw_text[mid : mid + OPTIMAL_CHUNK_SIZE]

            ex_prompt = f"""
            Create a Practice Exam based on this text:
            "{sample_context}"

            STRICT FORMATTING RULES:

            ## Practice Exam: {filename}

            ### Multiple Choice
            1. [Question text here?]
               a) [Option]
               b) [Option]
               c) [Option]
               d) [Option]

            2. [Next Question?]
               a) [Option]...

            ### Short Answer
            3. [Question text?]

            ### Answer Key
            > 1. a) [Brief explanation]
            > 2. b) [Brief explanation]
            > 3. [Brief Answer]

            DO NOT use Code Blocks for the Questions. Use standard text. MCQ options should be in New Line
            """

            try:
                # Clean formatting (fix numbering, remove rogue code blocks)
                res = clean_text_formatting(_generate_reply(ex_prompt))
                all_exercises.append(f"{res}\n\n<div class='page-break'></div>\n\n")
            except Exception as e:
                print(f"   ‚ùå Error generating exercises: {e}")

    if not all_notes and not all_exercises:
        # Every upload was unreadable; do not write an empty PDF to Drive.
        print("‚ùå No readable content was found in the uploaded files. No PDF was created.")
        return

    # --- Final PDF Assembly ---
    print("\nüíæ Rendering Final PDF...")
    # Headings must start at column 0: Markdown turns a line indented by four
    # spaces into a code block, which is what the previous indented template
    # did to the first module title and to the "Part 2" heading.
    final_md = "".join(all_notes)
    if all_exercises:
        final_md += "\n# Part 2: Practice Workbook\n\n" + "".join(all_exercises)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_filename = os.path.join(output_folder, f"Study_Guide_{timestamp}.pdf")

    _render_pdf(final_md, output_filename)
    print(f"üéâ Guide Saved: {output_filename}")

# Start the upload -> notes -> PDF workflow as soon as this cell is run.
run_pipeline()