In [2]:
import requests
import re
import google.generativeai as genai
from bs4 import BeautifulSoup
import time

# Configure Gemini API
# The key is an empty placeholder here; it is handed to the google.generativeai
# client once, at module load, via genai.configure().
GEMINI_API_KEY = ""  # Replace with your actual API Key
genai.configure(api_key=GEMINI_API_KEY)

# Configure SerpAPI
# Also an empty placeholder; fetch_syllabus_links() sends it to SerpAPI as the
# "api_key" query parameter of every search request.
SERPAPI_KEY = ""  # Replace with your actual SerpAPI Key

def fetch_syllabus_links(query, max_results=5):
    """Fetch syllabus links from Google using SerpAPI.

    Args:
        query: Free-text search query.
        max_results: Maximum number of links to return (default 5).

    Returns:
        A list of at most ``max_results`` URLs, taken in order from the
        "organic_results" entries of the SerpAPI JSON response. The list is
        empty when the response has no organic results.

    Raises:
        requests.RequestException: On a network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # Let requests build the query string: it percent-encodes characters such
    # as "&", "#" or "+" that would otherwise corrupt a hand-built URL.
    response = requests.get(
        "https://serpapi.com/search.json",
        params={"q": query, "api_key": SERPAPI_KEY},
        timeout=10,  # never hang indefinitely on an unresponsive server
    )
    data = response.json()
    # Skip entries without a link instead of failing with KeyError.
    links = [
        result["link"]
        for result in data.get("organic_results", [])
        if result.get("link")
    ]
    return links[:max_results]

def clean_text(text):
    """Remove unwanted symbols and collapse whitespace.

    Keeps word characters, whitespace and the punctuation ``- ( ) [ ] . ,``;
    every other character is dropped. Any run of whitespace (including
    newlines and tabs) becomes a single space, and leading/trailing
    whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Strip symbols first: removing a symbol that sits between two spaces
    # ("a & b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s\-\(\)\[\]\.,]', '', text)  # Keep only relevant characters
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs
    return text.strip()

def scrape_syllabus_page(url):
    """Scrape a webpage and extract its visible text as raw syllabus content.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with one non-empty, stripped line per text fragment
        (an empty string if the page has no text), or ``None`` when the
        request fails or the server answers with a status other than 200.
    """
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "lxml")
    # Script/style bodies are not page content; left in place they would be
    # returned by get_text() and crowd out the real text once the result is
    # truncated downstream.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    # Drop whitespace-only lines so the text budget is spent on content.
    stripped_lines = (
        line.strip() for line in soup.get_text(separator="\n").splitlines()
    )
    return "\n".join(line for line in stripped_lines if line)

def clean_syllabus_with_gemini(raw_text, max_retries=3, max_chars=1500):
    """Use the Gemini API to extract only the syllabus outline from raw text.

    Args:
        raw_text: Text scraped from a page.
        max_retries: Maximum number of attempts when the API reports a rate
            limit (HTTP 429).
        max_chars: Only the first ``max_chars`` characters of ``raw_text``
            are sent to the model (default 1500).

    Returns:
        The model's text answer on success. On failure a human-readable
        message is returned instead of raising: "Error in processing." for a
        non-rate-limit error, or "Failed to get a response. Please try again
        later." once all attempts were rate limited.
    """
    # Trim long text to avoid API overload
    raw_text = raw_text[:max_chars]

    prompt = f"""
    Extract only the syllabus chapters and subchapters from the following text.
    Format the output as:
    
    - Chapter Name
      - Subchapter 1
      - Subchapter 2
    
    Ignore advertisements, greetings, and other irrelevant text. Here is the raw syllabus data:
    
    {raw_text}
    """

    for attempt in range(max_retries):
        try:
            model = genai.GenerativeModel("gemini-pro")
            response = model.generate_content(prompt)
            time.sleep(2)  # Add delay to prevent hitting the rate limit
            return response.text if response else "Error: Empty response"

        except Exception as e:  # boundary: any API failure becomes a message
            # The status may surface in the message or, presumably for
            # google-api-core exceptions, as a numeric `code` attribute.
            rate_limited = "429" in str(e) or getattr(e, "code", None) == 429
            if not rate_limited:
                print("Error:", e)
                return "Error in processing."
            if attempt == max_retries - 1:
                # No attempt left: do not announce or wait for a retry that
                # will never happen.
                break
            wait_time = 2 ** (attempt + 1)  # Exponential backoff (2s, 4s, 8s...)
            print(f"Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    return "Failed to get a response. Please try again later."

def generate_study_roadmap(query):
    """Build a study roadmap for *query* from the top search results.

    Looks up candidate pages with fetch_syllabus_links(), downloads each one
    with scrape_syllabus_page(), and passes every page that yielded text
    through clean_syllabus_with_gemini().

    Returns:
        A list with one cleaned-syllabus string per page that could be
        scraped, in search-result order.
    """
    roadmap_entries = []

    for page_url in fetch_syllabus_links(query):
        page_text = scrape_syllabus_page(page_url)
        if not page_text:
            # Nothing usable came back for this page; move on to the next.
            continue

        roadmap_entries.append(clean_syllabus_with_gemini(page_text))
        time.sleep(3)  # Pause between API calls to avoid quota exhaustion

    return roadmap_entries

# Example usage
# Guarded so that merely importing this module does not fire search, scrape
# and model requests; the guard is still true when the code is run directly
# as a script or in a notebook cell.
if __name__ == "__main__":
    query = "11th standard HSC physics syllabus"
    roadmap = generate_study_roadmap(query)

    # Display the cleaned syllabus
    for idx, syllabus in enumerate(roadmap):
        print(f"📌 **Syllabus from Source {idx+1}:**\n{syllabus}\n")

📌 **Syllabus from Source 1:**
No chapters or subchapters are included in the provided text.

📌 **Syllabus from Source 2:**
This text does not contain a syllabus.

📌 **Syllabus from Source 3:**
**Year 11**

- Core
  - Motion and forces
  - Circular motion and gravity
  - Energy and momentum

- Options
  - Waves and light
  - Electricity and magnetism
  - Particles and energy

**Year 12**

- Core
  - The nature of light
  - Electricity and magnetism
  - Quantum physics and nuclear physics

- Options
  - Astrophysics
  - Engineering physics
  - Materials science

📌 **Syllabus from Source 4:**
Sure, here is the syllabus chapters and subchapters from the text you provided:

- **Chapter 1: Physical World and Measurement**
  - Subchapter 1: Introduction
  - Subchapter 2: Physics and its Scope
  - Subchapter 3: Units and Dimensions
  - Subchapter 4: Errors in Measurement
- **Chapter 2: Kinematics**
  - Subchapter 1: Introduction
  - Subchapter 2: Motion in a Straight Line
  - Subchapter 3: Mot