<a href="https://colab.research.google.com/github/RajShah3006/Saarthi/blob/main/ai_university_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q google-generativeai

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import google.generativeai as genai
from google.colab import userdata
import time # Added specifically for the loop pacing

# Read the Gemini API key from the Colab secrets store (secret name
# 'GOOGLE_API_KEY') and hand it to the google-generativeai client.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
except Exception:
    # Best-effort: a failure is only reported here; execution continues, so
    # the model and chat objects below are still created.
    # NOTE(review): presumably later Gemini calls fail without a key - confirm.
    print("API Key check failed. Ensure it is in secrets.")

# Create the model handle and one module-level chat session that starts with
# an empty history; generate_chatbot_response() sends its prompts via `chat`.
model = genai.GenerativeModel('gemini-2.5-flash')
chat = model.start_chat(history=[])

# --- HELPER FUNCTIONS: USER INPUT, SCRAPING, PROMPT GENERATION ---

def get_user_information(user_data=None):
    """Return the student's profile, prompting on stdin when none is supplied.

    A truthy ``user_data`` is handed back untouched. Otherwise every answer is
    read with ``input()`` and returned as a dict of raw strings keyed by
    ``subjects``, ``intrests``, ``overall_average``, ``grade`` and
    ``location``.
    """
    if user_data:
        return user_data

    # (key, prompt) pairs, asked in exactly this order. The 'intrests'
    # spelling is the key the rest of the file reads, so it is kept.
    questions = (
        ('subjects', "What subjects are you taking currently? "),
        ('intrests', "What are your intrests? "),
        ('overall_average', "What is your overall average? "),
        ('grade', "What grade are you in? "),
        ('location', "Where are you located? "),
    )
    return {key: input(question) for key, question in questions}

def list_all_programs(url):
    """Fetch a program-search results page and return the programs it links to.

    Returns a list of ``{'name': ..., 'url': ...}`` dicts, one for each
    ``h2.result-heading`` inside ``div.results.results-programs`` that wraps
    an ``<a href>``; ``url`` is the raw ``href`` attribute value. Returns
    ``None`` when the container, the headings or the links are missing, and
    also (after printing the error) when the request or the parsing raises.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Every search result lives inside this single wrapper element.
        results_box = soup.select_one('div.results.results-programs')
        if not results_box:
            return None

        found = []
        for heading in results_box.select('h2.result-heading'):
            link = heading.find('a', href=True)
            if not link:
                # A heading without a link cannot be followed later; skip it.
                continue
            found.append({'name': heading.get_text(strip=True), 'url': link['href']})

        # An empty result is reported as None, never as [].
        return found or None
    except Exception as e:
        print(f"Error scraping list on {url}: {e}")
        return None

def scrape_university_info(url):
    """Scrape one program detail page and return whatever fields were found.

    Args:
        url: Absolute URL of the program page.

    Returns:
        A dict containing any of the keys ``'program'`` (text of
        ``h1.program-title``), ``'prerequisites'`` (newline-joined,
        de-duplicated entries in page order) and ``'admission_average'``, or
        ``None`` when nothing was found or when the request/parsing raised.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        university_data = {}

        program_element = soup.select_one('h1.program-title')
        if program_element:
            university_data['program'] = program_element.get_text(strip=True)

        # Collect prerequisite text that follows any string mentioning
        # "Prerequisites" or "Admission Requirements".
        prerequisites = []
        prereq_headings = soup.find_all(string=re.compile(r'Prerequisites|Admission Requirements', re.IGNORECASE))
        for heading in prereq_headings:
            list_element = heading.find_next(['ul', 'ol'])
            if list_element:
                prerequisites.extend(
                    li.get_text(strip=True) for li in list_element.select('li')
                )
            else:
                # No list after the heading: fall back to the enclosing
                # element's text, but only if it holds more than the heading.
                parent = heading.parent
                if parent:
                    text_content = parent.get_text(separator=' ', strip=True)
                    if len(text_content) > len(heading.get_text(strip=True)):
                        prerequisites.append(text_content)

        if prerequisites:
            # dict.fromkeys() drops duplicates while keeping first-seen order.
            # A set() here would shuffle the lines differently on every run.
            university_data['prerequisites'] = "\n".join(dict.fromkeys(prerequisites))

        # Prefer a free-text "... average ... NN%" match, then fall back to
        # the dedicated CSS classes.
        admission_average = None
        average_text = soup.find(string=re.compile(r'(?:admission|minimum)?\s*average.*?\d+%', re.IGNORECASE))
        if average_text:
            admission_average = average_text.strip()
        else:
            average_element = soup.select_one('.admission-average-range, .average-grade')
            if average_element:
                admission_average = average_element.get_text(strip=True)

        if admission_average:
            university_data['admission_average'] = admission_average

        # (Location logic omitted as per original code structure returning None for location usually)

        return university_data or None

    except Exception:
        # Deliberately silent best-effort: this runs once per program page, so
        # a failing page is simply reported to the caller as None.
        return None

def _build_recommendation_prompt(user_data, programs, max_programs):
    """Assemble the Gemini prompt text from the student profile and program data."""
    parts = [
        "Based on the following student information:\n"
        f"- **Subjects:** {user_data['subjects']}\n"
        f"- **Intrests:** {user_data['intrests']}\n"
        f"- **Overall Average:** {user_data['overall_average']}\n"
        f"- **Grade:** {user_data['grade']}\n"
    ]

    # The location is collected from the user, so pass it on when present.
    # It stays optional so profiles without that key keep working.
    location = user_data.get('location')
    if location:
        parts.append(f"- **Location:** {location}\n")

    if programs:
        parts.append("\n**Information about potentially relevant programs:**\n\n")
        # LIMITER: only the first `max_programs` entries are sent, to stay
        # within the model's token limits.
        for program_data in programs[:max_programs]:
            parts.append(
                f"**Program Name:** {program_data.get('program_name', 'N/A')}\n"
                f"**Program URL:** {program_data.get('program_url', 'N/A')}\n"
                f"**Prerequisites:** {program_data.get('prerequisites', 'N/A')}\n"
                f"**Admission Average:** {program_data.get('admission_average', 'N/A')}\n"
                "---\n\n"
            )
    else:
        parts.append("Could not find detailed information for programs.\n\n")

    parts.append(
        "\nPlease provide some relevant information, such as:\n"
        "- What program is recommended based on the student's interests and scraped program data\n"
        "- A ranking of relevant universities for that specific program\n"
        "- Recommendations for high school courses to pursue and projects to complete.\n"
        "Only give information for universities in Ontario.\n"
    )
    return "".join(parts)


def generate_chatbot_response(user_data, all_programs_detailed_data, max_programs=30):
    """Ask Gemini for program recommendations for one student.

    Args:
        user_data: Dict with the keys ``'subjects'``, ``'intrests'``,
            ``'overall_average'`` and ``'grade'``; ``'location'`` is optional
            and is added to the prompt when present and non-empty.
        all_programs_detailed_data: List of scraped program dicts (may be
            empty or ``None``); missing fields are rendered as ``'N/A'``.
        max_programs: How many programs from the front of the list go into
            the prompt. Defaults to 30; ``None`` sends all of them.

    Returns:
        The model's reply text, or a string starting with
        ``"Error communicating with Gemini:"`` when the call fails.

    Raises:
        KeyError: If one of the required ``user_data`` keys is missing.
    """
    prompt = _build_recommendation_prompt(
        user_data, all_programs_detailed_data, max_programs
    )
    try:
        # NOTE(review): `chat` is one module-level session; presumably its
        # history grows with every call and is shared by all Gradio users -
        # confirm whether a stateless request is wanted here.
        response = chat.send_message(prompt)
        return response.text
    except Exception as e:
        return f"Error communicating with Gemini: {e}"

# --- MAIN EXECUTION FLOW ---

print("Hello! I'm a student assistant chatbot.")

# STEP 2: Build the master list of programs.
# The search page is split into alphabet groups, so each group is fetched in
# turn and its results are merged into one list.
alphabet_groups = "a b c d-e f-g h i j-l m n-p q-s t-z".split()
programs_with_urls = []

print("Step 2: Building Master List of Programs (This loops through A-Z)...")

for group in alphabet_groups:
    # The status line is completed by one of the two prints below.
    print(f"Fetching Group: {group.upper()}...", end=" ")

    search_url = f"https://www.ouinfo.ca/programs/search/?search=&group={group}"
    found = list_all_programs(search_url)

    if not found:
        print("No programs found.")
        continue

    programs_with_urls += found
    print(f"Found {len(found)} programs.")

print(f"\nTotal Programs Found: {len(programs_with_urls)}")

import concurrent.futures
from urllib.parse import urljoin

# STEP 3: Parallel Scraping (The Fast Way)
# Defined unconditionally so later code can always test this list, even when
# Step 2 produced nothing.
all_programs_detailed_data = []

if programs_with_urls:
    print(f"\nStep 3: Starting parallel scrape of {len(programs_with_urls)} programs...")

    def process_program(program_entry):
        """Scrape one program page; return its merged data dict, or None."""
        # urljoin handles relative and already-absolute hrefs alike.
        url = urljoin("https://www.ouinfo.ca", program_entry['url'])
        scraped_data = scrape_university_info(url)
        if scraped_data:
            return {
                'program_name': program_entry['name'],
                'program_url': url,
                **scraped_data
            }
        return None

    total_programs = len(programs_with_urls)
    # One slot per program, so results end up in master-list order no matter
    # which worker thread finishes first (keeps later runs reproducible).
    ordered_results = [None] * total_programs

    # We use 10 "workers" to scrape 10 pages at once.
    # WARNING: Don't go above 10-15 or the website might ban you for attacking them.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks, remembering each program's position.
        future_to_index = {
            executor.submit(process_program, program): index
            for index, program in enumerate(programs_with_urls)
        }

        # Collect results as they finish
        completed_count = 0
        for future in concurrent.futures.as_completed(future_to_index):
            index = future_to_index[future]
            try:
                ordered_results[index] = future.result()
            except Exception as e:
                # One failing page must not abort the whole scrape.
                print(f"Error scraping {programs_with_urls[index]['name']}: {e}")

            completed_count += 1
            if completed_count % 50 == 0:
                print(f"Scraped {completed_count}/{total_programs} programs...")

    # Drop the pages that yielded nothing (None) and keep the rest in order.
    all_programs_detailed_data = [result for result in ordered_results if result]

    print(f"\nFinished. Successfully loaded {len(all_programs_detailed_data)} programs.")
else:
    print("\nFailed to retrieve the initial list of programs.")


# --- GRADIO WEB INTERFACE ---

# Only the import is guarded: an ImportError raised while building or
# launching the UI is a different problem and should not be reported as
# "Gradio is not installed".
try:
    import gradio as gr
except ImportError:
    print("Gradio is not installed. Please run '!pip install gradio' in a cell above.")
else:
    def web_wrapper(subjects, interests, average, grade, location):
        """Adapt the five Gradio text inputs to generate_chatbot_response().

        Returns the chatbot's reply, or a warning string when no program data
        has been scraped.
        """
        # 'intrests' (sic) is the key the prompt builder reads.
        user_data = {
            'subjects': subjects,
            'intrests': interests,
            'overall_average': average,
            'grade': grade,
            'location': location
        }

        if not all_programs_detailed_data:
            return "Warning: No program data was scraped yet."

        return generate_chatbot_response(user_data, all_programs_detailed_data)

    interface = gr.Interface(
        fn=web_wrapper,
        inputs=[
            gr.Textbox(label="Current Subjects"),
            gr.Textbox(label="Interests"),
            gr.Textbox(label="Overall Average"),
            gr.Textbox(label="Grade"),
            gr.Textbox(label="Location")
        ],
        outputs=gr.Markdown(label="Response"),
        title="University Application Helper",
        description="Scraped data is ready. Enter your info below."
    )

    # share=True asks Gradio for a public tunnel link; debug=True keeps the
    # cell attached to the running server.
    interface.launch(share=True, debug=True)

Hello! I'm a student assistant chatbot.
Step 2: Building Master List of Programs (This loops through A-Z)...
Fetching Group: A... Found 92 programs.
Fetching Group: B... Found 129 programs.
Fetching Group: C... Found 278 programs.
Fetching Group: D-E... Found 160 programs.
Fetching Group: F-G... Found 99 programs.
Fetching Group: H... Found 77 programs.
Fetching Group: I... Found 66 programs.
Fetching Group: J-L... Found 71 programs.
Fetching Group: M... Found 117 programs.
Fetching Group: N-P... Found 155 programs.
Fetching Group: Q-S... Found 100 programs.
Fetching Group: T-Z... Found 54 programs.

Total Programs Found: 1398

Step 3: Starting parallel scrape of 1398 programs...
Scraped 50/1398 programs...
Scraped 100/1398 programs...
Scraped 150/1398 programs...
Scraped 200/1398 programs...
Scraped 250/1398 programs...
Scraped 300/1398 programs...
Scraped 350/1398 programs...
Scraped 400/1398 programs...
Scraped 450/1398 programs...
Scraped 500/1398 programs...
Scraped 550/1398 progr

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://4c8d37d5133804a223.gradio.live
