<a href="https://colab.research.google.com/github/RajShah3006/university-recommender-ai/blob/main/ai_university_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q google-generativeai

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import google.generativeai as genai
from google.colab import userdata

# Assume GOOGLE_API_KEY is already set up in Colab secrets
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Initialize the Gemini API
model = genai.GenerativeModel('models/gemini-2.5-flash-preview-05-20')
chat = model.start_chat(history=[])

'''
def get_user_information(user_data):
    """Collects information from the user."""
    user_data = {}
    user_data['subjects'] = input("What subjects are you taking currently? ")
    user_data['intrests'] = input("What are your intrests, some activity that you love to put effort into or you would like to be doing in 4 years? ")
    user_data['overall_average'] = input("What is your overall average? ")
    user_data['grade'] = input("What grade are you in? ")
    user_data['location'] = input("Where are you located? ")
    return user_data
    '''

def test_get_user_information():
    """Collects information from the user."""
    user_data = {}
    user_data['subjects'] = "Math, English, Physics"
    user_data['intrests'] = "Robotics & automation, arduino"
    user_data['overall_average'] = "87%"
    user_data['grade'] = "12th"
    user_data['location'] = "Toronto"
    return user_data

def list_all_programs(url):
    """
    Scrapes and lists all program names and their URLs from the given URL.

    Args:
      url: The URL of the page listing all programs.

    Returns:
      A list of dictionaries, where each dictionary contains 'name' and 'url'
      for a program, or None if scraping fails or no programs are found.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        programs_list = []

        # Find the main container
        container = soup.select_one('div.results.results-programs')

        if not container:
            print(f"Could not find the main programs container on {url}")
            return None

        # Find all program title elements within the container
        program_elements = container.select('h2.result-heading')

        if not program_elements:
            print(f"No program title elements found within the container on {url}")
            return None

        # Extract program names and URLs
        for program_element in program_elements:
            program_name = program_element.get_text(strip=True)
            anchor_tag = program_element.find('a', href=True)
            if anchor_tag:
                program_url = anchor_tag['href']
                programs_list.append({'name': program_name, 'url': program_url})

        if programs_list:
            return programs_list
        else:
            print(f"No program names and URLs extracted from {url}")
            return None

    except requests.exceptions.Timeout:
        print(f"Request timed out for URL: {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        print(f"An error occurred during scraping or parsing {url}: {e}")
        return None

def scrape_university_info(url):
    """
    Scrapes university information from a given URL with a more flexible approach.

    Args:
      url: https://www.ouinfo.ca/

    Returns:
      A dictionary containing the extracted university information, or None if
      scraping fails or no information is found.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        university_data = {}

        # --- Flexible Searching for Information ---

        # Try to find the program title
        program_element = soup.select_one('h1.program-title')
        if program_element:
            university_data['program'] = program_element.get_text(strip=True)

        # Search for prerequisites using keywords and nearby list structures
        prerequisites = []
        # Look for common headings or text near prerequisites
        prereq_headings = soup.find_all(string=re.compile(r'Prerequisites|Admission Requirements', re.IGNORECASE))
        for heading in prereq_headings:
            # Try to find a list (ul or ol) immediately following the heading or within the same parent element
            list_element = heading.find_next(['ul', 'ol'])
            if list_element:
                list_items = [li.get_text(strip=True) for li in list_element.select('li')]
                if list_items:
                    prerequisites.extend(list_items)
            else:
                # If no list is found, try to extract text from the parent element or nearby paragraphs
                parent = heading.parent
                if parent:
                    # Look for text in the parent or next siblings that might contain prerequisites
                    text_content = parent.get_text(separator=' ', strip=True)
                    if len(text_content) > len(heading.get_text(strip=True)):  # Basic check to avoid just getting the heading text
                        prerequisites.append(text_content)

        if prerequisites:
            # Join unique prerequisites with newlines
            university_data['prerequisites'] = "\n".join(list(set(prerequisites)))

        # Search for admission average using keywords and patterns
        admission_average = None
        # Look for text containing keywords like "average", "admission", "minimum" followed by percentages or ranges
        average_text = soup.find(string=re.compile(r'(?:admission|minimum)?\s*average.*?\d+%', re.IGNORECASE))
        if average_text:
            admission_average = average_text.strip()
        else:
            # Look for common classes or structures near average information
            average_element = soup.select_one('.admission-average-range, .average-grade')  # Add other potential classes
            if average_element:
                admission_average = average_element.get_text(strip=True)

        if admission_average:
            university_data['admission_average'] = admission_average

        # Location is still unlikely to be on this page, keeping as None for now
        location_element = None
        if location_element:
            university_data['location'] = location_element.get_text(strip=True)

        if university_data:
            return university_data
        else:
            # print(f"No relevant information found on {url}") # Suppress this for cleaner output during bulk scraping
            return None

    except requests.exceptions.Timeout:
        print(f"Request timed out for URL: {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        print(f"An error occurred during scraping or parsing {url}: {e}")
        return None


def generate_chatbot_response(user_data, all_programs_detailed_data):
    """Generates a chatbot response based on user data and scraped university info."""
    prompt = f"""Based on the following student information:
- **Subjects:** {user_data['subjects']}
- **Intrests:** {user_data['intrests']}
- **Overall Average:** {user_data['overall_average']}
- **Grade:** {user_data['grade']}

"""

    relevant_programs_info = ""
    user_interests_keywords = user_data['intrests'].lower().split() # Split interests into keywords

    if all_programs_detailed_data:
        relevant_programs_info += "\n**Information about potentially relevant programs:**\n\n" # Added introductory sentence and bolding
        relevant_programs = []
        for program_data in all_programs_detailed_data:
            program_name_lower = program_data.get('program_name', '').lower()
            # Enhanced filtering: Check if any interest keyword is a substring of a word in the program name
            # or if the program name is a substring of an interest keyword.
            if any(keyword in program_name_lower.split() or program_name_lower in keyword for keyword in user_interests_keywords):
                relevant_programs.append(program_data)

        if relevant_programs:
            for program_data in relevant_programs:
                relevant_programs_info += f"**Program Name:** {program_data.get('program_name', 'N/A')}\n" # Bold program name
                relevant_programs_info += f"**Program URL:** {program_data.get('program_url', 'N/A')}\n" # Bold program URL
                relevant_programs_info += f"**Prerequisites:** {program_data.get('prerequisites', 'N/A')}\n" # Bold prerequisites
                relevant_programs_info += f"**Admission Average:** {program_data.get('admission_average', 'N/A')}\n" # Bold admission average
                relevant_programs_info += "---\n\n" # Added newline for better separation

        else:
            relevant_programs_info += "Could not find detailed information for programs closely related to your interests.\n\n"

    if relevant_programs_info:
        prompt += relevant_programs_info
    else:
        prompt += "\nCould not retrieve detailed program information.\n"

    prompt += f"""
Please provide some relevant information, such as:
- What program is recommended based on the student's interests and scraped program data
- A ranking of relevant universities for that specific program, including:
  - What are the prerequisites
  - Last few years admission average
  - How far the university is located (if available from scraping or inferable)
  - Tuition and Fees (if available from scraping or inferable)
  - Has a supplementary application or not (if available from scraping or inferable)
- Recommendations for high school courses to pursue and projects to complete for university applications, specifically tailored to the student's interests, current subjects ({user_data['subjects']}), and grade level ({user_data['grade']}).

Be specific and tailor the response to the student's input and the provided program information. Only give information for universities in Ontario.
"""
    response = chat.send_message(prompt)
    return response.text

# --- Main Execution Flow ---
print("Hello! I'm a student assistant chatbot. I can help you with information related to your studies.")

# 1. Get user information
student_info = test_get_user_information()

# 2. Scrape program URLs
base_url = "https://www.ouinfo.ca"
programs_with_urls = list_all_programs(f"{base_url}/programs/all")

all_programs_detailed_data = []

# 3. Scrape detailed data for each program
if programs_with_urls:
    print("\nScraping detailed program information...")
    for program in programs_with_urls:
        program_url = f"{base_url}{program['url']}"
        # print(f"Attempting to scrape: {program_url}") # Keep this commented for cleaner output
        scraped_data = scrape_university_info(program_url)
        if scraped_data:
            # Combine program name and URL with scraped data
            detailed_data = {
                'program_name': program['name'],
                'program_url': program_url,
                **scraped_data
            }
            all_programs_detailed_data.append(detailed_data)
            # print(f"Successfully scraped data for {program['name']}") # Keep this commented for cleaner output
        else:
            pass # print(f"Failed to scrape data for {program['name']} from {program_url}") # Keep this commented for cleaner output

    print("Finished scraping.")

    # 4. Generate and display chatbot response
    if all_programs_detailed_data:
        bot_response = generate_chatbot_response(student_info, all_programs_detailed_data)
        print("\nChatbot Response:")
        print(bot_response)
    else:
        print("\nNo detailed program data was successfully scraped to generate a comprehensive response.")

else:
    print("\nFailed to retrieve the initial list of programs with URLs. Cannot proceed with scraping or generating a response.")

print("\nChat session ended.")

Hello! I'm a student assistant chatbot. I can help you with information related to your studies.

Scraping detailed program information...
Finished scraping.

Chatbot Response:
Based on the student's profile (Grade 12, 87% average, Math, English, Physics subjects, interests in Robotics & automation, Arduino) and the provided program information, here's a detailed recommendation:

### Recommended Program and University

The student's interests in "Robotics & automation" and "Arduino" strongly suggest a path in engineering or a related technology field. Out of the provided programs, the **"Automotive & Vehicle Engineering Technology I (Bachelor of Technology)" at McMaster University** is the most relevant option. While not explicitly "Robotics," Automotive Engineering Technology often involves aspects of automation, control systems, embedded electronics (like Arduino), and the application of physics and math, making it a strong fit for these interests.

### University and Program Details