<a href="https://colab.research.google.com/github/RajShah3006/Saarthi/blob/main/ai_university_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 google-generativeai gradio scikit-learn

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import google.generativeai as genai
from google.colab import userdata
import time
import concurrent.futures
import json
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# --- 1. SETUP ---
# Read the Gemini API key from Colab's secret store and register it with the
# SDK. A failure is reported together with its cause instead of being raised,
# so the rest of the cell still loads.
# NOTE(review): the model objects below are created even when the key could
# not be configured; calls made through them will presumably fail until a
# valid key is set - confirm this is the intended degradation.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
except Exception as exc:
    print("‚ö†Ô∏è API Key check failed. Ensure it is in secrets.")
    # Show what actually went wrong (missing secret, access not granted, ...).
    print(f"   Cause: {type(exc).__name__}: {exc}")

model = genai.GenerativeModel('gemini-2.5-flash')
chat = model.start_chat(history=[])

# --- 2. CACHING & UTILS ---
CACHE_FILE = "university_data_cached.json"


def save_data(data, path=None):
    """Write ``data`` to the JSON cache file.

    Args:
        data: JSON-serializable object (the list of program records).
        path: Destination file. Defaults to ``CACHE_FILE``.

    Raises:
        TypeError: If ``data`` is not JSON-serializable.
        OSError: If the file cannot be written.

    The JSON is written to a temporary sibling file and moved into place with
    ``os.replace``, so an interrupted or failed write never leaves a truncated
    cache behind and never clobbers a previously good one.
    """
    target = CACHE_FILE if path is None else path
    tmp_path = f"{target}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        os.replace(tmp_path, target)
    finally:
        # After a successful replace the temp file is gone; this only removes
        # the partial file left by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("üíæ Database saved.")

def load_data(path=None):
    """Load the cached program records from disk.

    Args:
        path: Cache file to read. Defaults to ``CACHE_FILE``.

    Returns:
        The decoded list of program dicts, or ``None`` when the file is
        missing, unreadable, not valid JSON, or not a JSON list. The caller
        treats a falsy result as "no cache" and scrapes from scratch.
    """
    source = CACHE_FILE if path is None else path
    try:
        with open(source, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError: a truncated or corrupt cache
        # should trigger a re-scrape, not crash the notebook.
        print(f"Ignoring unreadable cache {source!r}: {exc}")
        return None

    if not isinstance(data, list):
        print(f"Ignoring cache {source!r}: expected a JSON list.")
        return None

    # Auto-fix records from older caches that stored the title under 'name'.
    for p in data:
        if isinstance(p, dict) and 'program_name' not in p and 'name' in p:
            p['program_name'] = p['name']
    print(f"‚ö° Loaded {len(data)} programs from cache.")
    return data

def get_embedding(text, task_type="retrieval_document"):
    """Return the embedding vector for ``text``.

    Args:
        text: Anything convertible with ``str``. Whitespace runs are collapsed
            and the result is truncated to 2000 characters before embedding.
        task_type: Task type forwarded to ``genai.embed_content``. Defaults to
            ``"retrieval_document"`` (the value previously hard-coded); pass
            e.g. ``"retrieval_query"`` when embedding a search query.

    Returns:
        The ``'embedding'`` entry of the API result, or a 768-element list of
        zeros when all three attempts fail. Callers can detect the failure
        placeholder with ``any(vector)``.
    """
    clean_text = re.sub(r'\s+', ' ', str(text)).strip()[:2000]
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            result = genai.embed_content(
                model="models/text-embedding-004",
                content=clean_text,
                task_type=task_type,
            )
            return result['embedding']
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit
            # still interrupt a long embedding run instead of being retried.
            if attempt < max_attempts - 1:
                time.sleep(1)  # brief pause before retrying; none after the last try
    return [0] * 768

# --- 3. SCRAPING FUNCTIONS ---
def list_all_programs(url):
    """Collect program names and links from one search-results page.

    Args:
        url: Address of a results page (the caller passes ouinfo.ca
            program-search URLs, one per alphabet group).

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per
        ``h2.result-heading`` that contains a link, where ``url`` is the raw
        ``href`` value. ``None`` when the request fails, the server answers
        with an error status, or the results container is absent.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx as a failure instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        container = soup.select_one('div.results.results-programs')
        if not container:
            return None

        programs_list = []
        for heading in container.select('h2.result-heading'):
            anchor_tag = heading.find('a', href=True)
            if anchor_tag:
                programs_list.append({
                    'name': heading.get_text(strip=True),
                    'url': anchor_tag['href'],
                })
        return programs_list
    except Exception as exc:
        # Best-effort scrape: report and skip this page rather than abort the
        # whole run. Unlike a bare except, Ctrl-C is not swallowed.
        print(f"Could not list programs from {url}: {exc}")
        return None

def scrape_university_info(url):
    """Scrape prerequisites and an admission average from a program page.

    Args:
        url: Address of the program detail page.

    Returns:
        A dict that may contain:
          * ``'prerequisites'`` - comma-joined, de-duplicated text of every
            ``<li>`` in the first list following each text node that mentions
            "Prerequisites" or "Admission Requirements";
          * ``'admission_average'`` - the first text node containing a
            ``<digits>%`` pattern, stripped.
        An empty dict when nothing matched or the page could not be fetched.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx as a failure instead of scraping an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        data = {}

        prerequisites = []
        headings = soup.find_all(string=re.compile(r'Prerequisites|Admission Requirements', re.IGNORECASE))
        for h in headings:
            lst = h.find_next(['ul', 'ol'])
            if lst:
                prerequisites.extend(li.get_text(strip=True) for li in lst.select('li'))

        if prerequisites:
            # dict.fromkeys de-duplicates while keeping page order; a set would
            # shuffle the items differently on every run, making the cached
            # text (and the embedding built from it) non-reproducible.
            data['prerequisites'] = ", ".join(dict.fromkeys(prerequisites))

        # NOTE(review): this takes the first text node with any percentage in
        # it, which is presumably the admission average - verify on real pages.
        avg = soup.find(string=re.compile(r'\d+%'))
        if avg:
            data['admission_average'] = avg.strip()

        return data
    except Exception:
        # Best-effort: a page that cannot be fetched or parsed contributes no
        # details. Unlike a bare except, Ctrl-C is not swallowed.
        return {}

# --- 4. MAIN EXECUTION (Scrape + Embed) ---
# Use the on-disk cache when it exists; otherwise scrape every program page,
# embed each record and write a fresh cache.
all_programs_detailed_data = load_data()

if not all_programs_detailed_data:
    # Imported here so this cell stays self-contained. urljoin handles
    # root-relative, relative and absolute hrefs, unlike string concatenation.
    from urllib.parse import urljoin

    print("üöÄ No cache found. Starting FRESH SCRAPE...")

    # A. Get URLs
    programs_with_urls = []
    alphabet_groups = ['a', 'b', 'c', 'd-e', 'f-g', 'h', 'i', 'j-l', 'm', 'n-p', 'q-s', 't-z']
    for group in alphabet_groups:
        res = list_all_programs(f"https://www.ouinfo.ca/programs/search/?search=&group={group}")
        if res: programs_with_urls.extend(res)
    print(f"‚úÖ Found {len(programs_with_urls)} programs.")

    # B. Scrape Details (Parallel)
    print("‚è≥ Scraping details (Parallel)...")
    scraped_results = []

    def process_program(entry):
        """Fetch one program's detail page and merge it with its listing entry."""
        url = urljoin("https://www.ouinfo.ca/", entry['url'])
        data = scrape_university_info(url)
        return {
            'program_name': entry['name'],
            'program_url': url,
            **data
        }

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in submission order, so the cache
        # contents are the same from run to run.
        for completed, result in enumerate(executor.map(process_program, programs_with_urls), start=1):
            if result: scraped_results.append(result)
            if completed % 50 == 0: print(f"{completed}...", end="")

    # C. Embeddings (Sequential to avoid Timeout)
    print("\nüß† Generating AI Embeddings (One-by-one to prevent timeout)...")
    all_programs_detailed_data = []
    failed_embeddings = 0

    for i, item in enumerate(scraped_results):
        text = f"{item['program_name']} {item.get('prerequisites', '')}"
        vector = get_embedding(text)
        if any(vector):
            item['embedding'] = vector
        else:
            # get_embedding returns an all-zero vector on failure. Leaving the
            # key out keeps the record but excludes it from similarity search,
            # which only considers records that have an 'embedding'.
            failed_embeddings += 1
        all_programs_detailed_data.append(item)

        if i % 25 == 0: print(".", end="")
        time.sleep(0.1)

    if failed_embeddings:
        print(f"\n{failed_embeddings} programs could not be embedded and will not appear in matches.")

    if all_programs_detailed_data:
        save_data(all_programs_detailed_data)
    else:
        # Nothing was scraped (e.g. the site was unreachable): do not write an
        # empty cache file.
        print("\nNo programs were scraped; cache not written.")

# --- 5. CHATBOT & GRADIO ---
def find_best_matches(user_query, all_data, top_k=8):
    """Return the ``top_k`` records most similar to ``user_query``.

    Args:
        user_query: Free text describing what the student is looking for.
        all_data: List of program dicts; records are ranked by their
            ``'embedding'`` entry.
        top_k: Maximum number of records to return.

    Returns:
        Up to ``top_k`` records ordered from most to least similar (cosine
        similarity). When no record has a usable embedding, or the query
        itself could not be embedded, the first ``top_k`` records are returned
        unranked. An empty list when ``all_data`` is empty or ``top_k <= 0``.
    """
    # Guard first: `scores.argsort()[-0:]` would return *every* index.
    if top_k <= 0 or not all_data:
        return []

    # An all-zero vector is get_embedding's failure placeholder; it scores 0
    # against everything, so ranking it is meaningless.
    valid_data = [p for p in all_data if p.get('embedding') and any(p['embedding'])]
    if not valid_data:
        return all_data[:top_k]

    query_vector = get_embedding(user_query)
    if not any(query_vector):
        return valid_data[:top_k]

    db_vectors = [p['embedding'] for p in valid_data]
    scores = cosine_similarity([query_vector], db_vectors)[0]
    # argsort is ascending: take the last top_k indices, then reverse.
    top_indices = scores.argsort()[-top_k:][::-1]
    return [valid_data[i] for i in top_indices]

def generate_chatbot_response(user_data, relevant_programs):
    """Ask the Gemini model for advice based on the student's profile.

    Args:
        user_data: Dict with keys ``'intrests'`` (the historical spelling;
            ``'interests'`` is accepted as a fallback), ``'grade'``,
            ``'overall_average'``, ``'subjects'`` and ``'location'``.
        relevant_programs: Program dicts to offer as candidates. Missing
            fields fall back to placeholder text.

    Returns:
        The model's reply text, ``"Error: ..."`` if the model call raised, or
        a short apology when there are no candidate programs.
    """
    if not relevant_programs:
        # With an empty list the model would have to invent programs.
        return ("I couldn't find any programs to compare against right now. "
                "Please try again in a moment.")

    context = "".join(
        f"- {p.get('program_name', 'Unknown Program')}\n"
        f"  Target Avg: {p.get('admission_average', 'Not Listed')}\n"
        f"  Required Prereqs: {p.get('prerequisites', 'Not listed')}\n"
        f"  Link: {p.get('program_url', '#')}\n\n"
        for p in relevant_programs
    )

    interests = user_data.get('intrests', user_data.get('interests', ''))
    # Never ask for more programs than were actually supplied.
    recommend_count = min(10, len(relevant_programs))

    prompt = f"""
    Act as 'Saarthi', a wise, encouraging university guidance counselor.

    STUDENT PROFILE:
    - Interests: {interests}
    - Current Grade Level: {user_data['grade']}
    - Estimated Average: {user_data['overall_average']}
    - Current Subjects: {user_data['subjects']}
    - Location: {user_data['location']}

    TOP DATABASE MATCHES:
    {context}

    INSTRUCTIONS:
    1. **Rank & Recommend:** Recommend the top {recommend_count} programs from the list above.
    2. **Subject Check (CRITICAL):** Compare the student's "Current Subjects" against the "Required Prereqs".
       - If they are missing a key subject, warn them politely!
       - If their subjects look perfect, tell them they are on the right track.
    3. **Fit Analysis:** Explain why these programs fit their interest in "{interests}".
    4. **Extracurriculars:** Suggest specific side projects or clubs based on their interests.
    5. **Tone:** Be warm, supportive, and use emojis.
    """
    try:
        # One stateless call per request. The module-level chat session is
        # shared by every visitor of the public link, so using it would carry
        # one student's profile into the next student's answer and grow the
        # history without bound.
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error: {e}"

def web_wrapper(subjects, interests, average, grade, location):
    """Gradio callback: turn the five form fields into an advice message.

    Args:
        subjects: Subjects the student is currently taking.
        interests: Free-text interests; also used as the search query.
        average: Overall average as entered by the student.
        grade: Current grade level.
        location: Where the student lives.

    Returns:
        Markdown text produced by ``generate_chatbot_response``, or a short
        prompt to fill in the interests when that field is blank.
    """
    # Interests drive the similarity search; a blank query cannot be matched,
    # so skip the embedding and model calls entirely.
    if not (interests or "").strip():
        return "Please tell me a little about your interests so I can look for matching programs."

    user_data = {
        'subjects': subjects, 'intrests': interests,
        'overall_average': average, 'grade': grade, 'location': location
    }
    matches = find_best_matches(interests, all_programs_detailed_data)
    return generate_chatbot_response(user_data, matches)

# Build the form: five free-text boxes, in the positional order that
# web_wrapper expects, feeding a single Markdown output.
interface = gr.Interface(
    fn=web_wrapper,
    inputs=[
        gr.Textbox(label=caption)
        for caption in (
            "Current Subjects (e.g. Calculus, Physics, English)",
            "Interests",
            "Overall Average%",
            "What grade are you in?",
            "Location",
        )
    ],
    outputs=gr.Markdown(label="Saarthi Advice"),
    title="Saarthi AI",
    description="Running locally in Colab.",
)

# share=True publishes a temporary public URL; debug=True keeps the cell
# running so errors and logs stay visible in the notebook.
interface.launch(inline=True, share=True, debug=True)

⚡ Loaded 1399 programs from cache.
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a27b505ec8fa417d8e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a27b505ec8fa417d8e.gradio.live




In [1]:
import requests
from bs4 import BeautifulSoup
import re
import google.generativeai as genai
from google.colab import userdata
import time
import concurrent.futures
import json
import os
import csv
import datetime
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# --- 1. SETUP ---
# Read the Gemini API key from Colab's secret store and register it with the
# SDK. A failure is reported together with its cause instead of being raised,
# so the rest of the cell still loads.
# NOTE(review): the model objects below are created even when the key could
# not be configured; calls made through them will presumably fail until a
# valid key is set - confirm this is the intended degradation.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
except Exception as exc:
    print("‚ö†Ô∏è API Key check failed. Ensure it is in secrets.")
    # Show what actually went wrong (missing secret, access not granted, ...).
    print(f"   Cause: {type(exc).__name__}: {exc}")

model = genai.GenerativeModel('gemini-2.5-flash')
chat = model.start_chat(history=[])

# --- 2. LOGGING & CACHING ---
# On-disk locations: the scraped-program cache and the visitor traffic log.
CACHE_FILE = "university_data_cached.json"
LOG_FILE = "user_traffic_logs.csv"

# First run only: create the traffic log and give it its column headings.
if not os.path.exists(LOG_FILE):
    with open(LOG_FILE, 'w', newline='') as log_handle:
        csv.writer(log_handle).writerow(
            ["Timestamp", "Grade", "Location", "Interests", "Subjects"]
        )

def _spreadsheet_safe(value):
    """Return ``value`` as text that a spreadsheet will not run as a formula."""
    text = "" if value is None else str(value)
    # Cells starting with these characters are evaluated by spreadsheet
    # programs; a leading apostrophe forces them to be read as plain text.
    if text.startswith(("=", "+", "-", "@", "\t", "\r")):
        return "'" + text
    return text


def log_interaction(grade, location, interests, subjects, log_file=None):
    """Announce a new submission on stdout and append it to the traffic log.

    Args:
        grade: Grade level entered in the form.
        location: Location entered in the form.
        interests: Interests entered in the form.
        subjects: Subjects entered in the form.
        log_file: CSV file to append to. Defaults to ``LOG_FILE``.

    The header row is written whenever the target file is missing or empty,
    so the log stays well-formed even if the file is removed while the app is
    running. Form values come from anonymous visitors, so they are neutralised
    against spreadsheet formula injection before being written.
    """
    target = LOG_FILE if log_file is None else log_file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"\nüîî [ALERT {timestamp}] New User from {location}!")

    needs_header = not os.path.exists(target) or os.path.getsize(target) == 0
    with open(target, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["Timestamp", "Grade", "Location", "Interests", "Subjects"])
        writer.writerow(
            [timestamp]
            + [_spreadsheet_safe(v) for v in (grade, location, interests, subjects)]
        )

def save_data(data, path=None):
    """Write ``data`` to the JSON cache file.

    Args:
        data: JSON-serializable object (the list of program records).
        path: Destination file. Defaults to ``CACHE_FILE``.

    Raises:
        TypeError: If ``data`` is not JSON-serializable.
        OSError: If the file cannot be written.

    The JSON is written to a temporary sibling file and moved into place with
    ``os.replace``, so an interrupted or failed write never leaves a truncated
    cache behind and never clobbers a previously good one.
    """
    target = CACHE_FILE if path is None else path
    tmp_path = f"{target}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        os.replace(tmp_path, target)
    finally:
        # After a successful replace the temp file is gone; this only removes
        # the partial file left by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("üíæ Database saved.")

def load_data(path=None):
    """Load the cached program records from disk.

    Args:
        path: Cache file to read. Defaults to ``CACHE_FILE``.

    Returns:
        The decoded list of program dicts, or ``None`` when the file is
        missing, unreadable, not valid JSON, or not a JSON list. The caller
        treats a falsy result as "no cache" and scrapes from scratch.
    """
    source = CACHE_FILE if path is None else path
    try:
        with open(source, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError: a truncated or corrupt cache
        # should trigger a re-scrape, not crash the notebook.
        print(f"Ignoring unreadable cache {source!r}: {exc}")
        return None

    if not isinstance(data, list):
        print(f"Ignoring cache {source!r}: expected a JSON list.")
        return None

    # Records from older caches stored the title under 'name'; mirror it.
    for p in data:
        if isinstance(p, dict) and 'program_name' not in p and 'name' in p:
            p['program_name'] = p['name']
    print(f"‚ö° Loaded {len(data)} programs from cache.")
    return data

def get_embedding(text, task_type="retrieval_document"):
    """Return the embedding vector for ``text``.

    Args:
        text: Anything convertible with ``str``. Whitespace runs are collapsed
            and the result is truncated to 2000 characters before embedding.
        task_type: Task type forwarded to ``genai.embed_content``. Defaults to
            ``"retrieval_document"`` (the value previously hard-coded); pass
            e.g. ``"retrieval_query"`` when embedding a search query.

    Returns:
        The ``'embedding'`` entry of the API result, or a 768-element list of
        zeros when all three attempts fail. Callers can detect the failure
        placeholder with ``any(vector)``.
    """
    clean_text = re.sub(r'\s+', ' ', str(text)).strip()[:2000]
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            result = genai.embed_content(
                model="models/text-embedding-004",
                content=clean_text,
                task_type=task_type,
            )
            return result['embedding']
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit
            # still interrupt a long embedding run instead of being retried.
            if attempt < max_attempts - 1:
                time.sleep(1)  # brief pause before retrying; none after the last try
    return [0] * 768

# --- 3. SCRAPING FUNCTIONS ---
def list_all_programs(url):
    """Collect program names and links from one search-results page.

    Args:
        url: Address of a results page (the caller passes ouinfo.ca
            program-search URLs, one per alphabet group).

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per
        ``h2.result-heading`` that contains a link, where ``url`` is the raw
        ``href`` value. ``None`` when the request fails, the server answers
        with an error status, or the results container is absent.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx as a failure instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        container = soup.select_one('div.results.results-programs')
        if not container:
            return None

        programs_list = []
        for heading in container.select('h2.result-heading'):
            anchor_tag = heading.find('a', href=True)
            if anchor_tag:
                programs_list.append({
                    'name': heading.get_text(strip=True),
                    'url': anchor_tag['href'],
                })
        return programs_list
    except Exception as exc:
        # Best-effort scrape: report and skip this page rather than abort the
        # whole run. Unlike a bare except, Ctrl-C is not swallowed.
        print(f"Could not list programs from {url}: {exc}")
        return None

def scrape_university_info(url):
    """Scrape prerequisites and an admission average from a program page.

    Args:
        url: Address of the program detail page.

    Returns:
        A dict that may contain:
          * ``'prerequisites'`` - comma-joined, de-duplicated text of every
            ``<li>`` in the first list following each text node that mentions
            "Prerequisites" or "Admission Requirements";
          * ``'admission_average'`` - the first text node containing a
            ``<digits>%`` pattern, stripped.
        An empty dict when nothing matched or the page could not be fetched.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx as a failure instead of scraping an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        data = {}

        prerequisites = []
        headings = soup.find_all(string=re.compile(r'Prerequisites|Admission Requirements', re.IGNORECASE))
        for h in headings:
            lst = h.find_next(['ul', 'ol'])
            if lst:
                prerequisites.extend(li.get_text(strip=True) for li in lst.select('li'))

        if prerequisites:
            # dict.fromkeys de-duplicates while keeping page order; a set would
            # shuffle the items differently on every run, making the cached
            # text (and the embedding built from it) non-reproducible.
            data['prerequisites'] = ", ".join(dict.fromkeys(prerequisites))

        # NOTE(review): this takes the first text node with any percentage in
        # it, which is presumably the admission average - verify on real pages.
        avg = soup.find(string=re.compile(r'\d+%'))
        if avg:
            data['admission_average'] = avg.strip()

        return data
    except Exception:
        # Best-effort: a page that cannot be fetched or parsed contributes no
        # details. Unlike a bare except, Ctrl-C is not swallowed.
        return {}

# --- 4. MAIN EXECUTION ---
# Use the on-disk cache when it exists; otherwise scrape every program page,
# embed each record and write a fresh cache.
all_programs_detailed_data = load_data()

if not all_programs_detailed_data:
    # Imported here so this cell stays self-contained. urljoin handles
    # root-relative, relative and absolute hrefs, unlike string concatenation.
    from urllib.parse import urljoin

    print("üöÄ No cache found. Starting FRESH SCRAPE...")
    programs_with_urls = []
    alphabet_groups = ['a', 'b', 'c', 'd-e', 'f-g', 'h', 'i', 'j-l', 'm', 'n-p', 'q-s', 't-z']
    for group in alphabet_groups:
        res = list_all_programs(f"https://www.ouinfo.ca/programs/search/?search=&group={group}")
        if res: programs_with_urls.extend(res)
    print(f"‚úÖ Found {len(programs_with_urls)} programs.")

    scraped_results = []

    def process_program(entry):
        """Fetch one program's detail page and merge it with its listing entry."""
        url = urljoin("https://www.ouinfo.ca/", entry['url'])
        data = scrape_university_info(url)
        return {'program_name': entry['name'], 'program_url': url, **data}

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in submission order, so the cache
        # contents are the same from run to run.
        for completed, result in enumerate(executor.map(process_program, programs_with_urls), start=1):
            if result: scraped_results.append(result)
            if completed % 50 == 0: print(f"{completed}...", end="")

    print("\nüß† Generating AI Embeddings...")
    all_programs_detailed_data = []
    failed_embeddings = 0
    for i, item in enumerate(scraped_results):
        text = f"{item['program_name']} {item.get('prerequisites', '')}"
        vector = get_embedding(text)
        if any(vector):
            item['embedding'] = vector
        else:
            # get_embedding returns an all-zero vector on failure. Leaving the
            # key out keeps the record but excludes it from similarity search,
            # which only considers records that have an 'embedding'.
            failed_embeddings += 1
        all_programs_detailed_data.append(item)
        if i % 25 == 0: print(".", end="")
        time.sleep(0.1)

    if failed_embeddings:
        print(f"\n{failed_embeddings} programs could not be embedded and will not appear in matches.")

    if all_programs_detailed_data:
        save_data(all_programs_detailed_data)
    else:
        # Nothing was scraped (e.g. the site was unreachable): do not write an
        # empty cache file.
        print("\nNo programs were scraped; cache not written.")

# --- 5. CHATBOT & GRADIO ---
def find_best_matches(user_query, all_data, top_k=8):
    """Return the ``top_k`` records most similar to ``user_query``.

    Args:
        user_query: Free text describing what the student is looking for.
        all_data: List of program dicts; records are ranked by their
            ``'embedding'`` entry.
        top_k: Maximum number of records to return.

    Returns:
        Up to ``top_k`` records ordered from most to least similar (cosine
        similarity). When no record has a usable embedding, or the query
        itself could not be embedded, the first ``top_k`` records are returned
        unranked. An empty list when ``all_data`` is empty or ``top_k <= 0``.
    """
    # Guard first: `scores.argsort()[-0:]` would return *every* index.
    if top_k <= 0 or not all_data:
        return []

    # An all-zero vector is get_embedding's failure placeholder; it scores 0
    # against everything, so ranking it is meaningless.
    valid_data = [p for p in all_data if p.get('embedding') and any(p['embedding'])]
    if not valid_data:
        return all_data[:top_k]

    query_vector = get_embedding(user_query)
    if not any(query_vector):
        return valid_data[:top_k]

    db_vectors = [p['embedding'] for p in valid_data]
    scores = cosine_similarity([query_vector], db_vectors)[0]
    # argsort is ascending: take the last top_k indices, then reverse.
    top_indices = scores.argsort()[-top_k:][::-1]
    return [valid_data[i] for i in top_indices]

def generate_chatbot_response(user_data, relevant_programs):
    """Ask the Gemini model for program and commute advice for one student.

    Args:
        user_data: Dict with keys ``'intrests'`` (the historical spelling;
            ``'interests'`` is accepted as a fallback), ``'grade'``,
            ``'overall_average'``, ``'subjects'`` and ``'location'``.
        relevant_programs: Program dicts to offer as candidates. Missing
            fields fall back to placeholder text.

    Returns:
        The model's reply text, ``"Error: ..."`` if the model call raised, or
        a short apology when there are no candidate programs.
    """
    if not relevant_programs:
        # With an empty list the model would have to invent programs.
        return ("I couldn't find any programs to compare against right now. "
                "Please try again in a moment.")

    context = "".join(
        f"- PROGRAM: {p.get('program_name', 'Unknown Program')}\n"
        f"  AVG: {p.get('admission_average', 'Not Listed')}\n"
        f"  PREREQS: {p.get('prerequisites', 'Not listed')}\n"
        f"  LINK: {p.get('program_url', '#')}\n\n"
        for p in relevant_programs
    )

    interests = user_data.get('intrests', user_data.get('interests', ''))

    # --- THIS IS THE UPDATED PROMPT FOR COMMUTE LOGIC ---
    prompt = f"""
    Act as 'Saarthi', a wise university guidance counselor in Ontario.

    STUDENT PROFILE:
    - Interest: {interests}
    - Grade: {user_data['grade']}
    - Avg: {user_data['overall_average']}
    - Subjects: {user_data['subjects']}
    - HOME LOCATION: {user_data['location']}

    TOP MATCHES:
    {context}

    INSTRUCTIONS:
    1. Recommend the top 3 programs.
    2. **Prerequisite Check:** Compare their subjects to the requirements. Warn if missing.
    3. **COMMUTE ANALYSIS (Crucial):** For each recommended university, calculate the estimated travel time from '{user_data['location']}'.
       - **Time:** Estimate the one-way time (e.g. "45 mins").
       - **Mode:** Suggest the best way (GO Train, TTC, Bus, or Car).
       - **Cost:** Estimate monthly cost (e.g. "GO Train is approx $250/month" or "Gas is approx $200/month").
       - **Verdict:** If the commute is > 1 hour one-way, strongly recommend **RESIDENCE**. If < 45 mins, recommend **COMMUTING**.

    4. **Tone:** Warm and supportive. Use emojis üöå üè† üéì.
    """
    try:
        # One stateless call per request. The module-level chat session is
        # shared by every visitor of the public link, so using it would carry
        # one student's profile into the next student's answer and grow the
        # history without bound.
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error: {e}"

def web_wrapper(subjects, interests, average, grade, location):
    """Gradio callback: log the visit, then turn the form into advice.

    Args:
        subjects: Subjects the student is currently taking.
        interests: Free-text interests; also used as the search query.
        average: Overall average as entered by the student.
        grade: Current grade level.
        location: Where the student lives.

    Returns:
        Markdown text produced by ``generate_chatbot_response``, or a short
        prompt to fill in the interests when that field is blank.
    """
    # Every submission is logged, including ones rejected below.
    log_interaction(grade, location, interests, subjects)

    # Interests drive the similarity search; a blank query cannot be matched,
    # so skip the embedding and model calls entirely.
    if not (interests or "").strip():
        return "Please tell me a little about your interests so I can look for matching programs."

    user_data = {
        'subjects': subjects, 'intrests': interests,
        'overall_average': average, 'grade': grade, 'location': location
    }
    matches = find_best_matches(interests, all_programs_detailed_data)
    return generate_chatbot_response(user_data, matches)

# Build the form: five free-text boxes, in the positional order that
# web_wrapper expects, feeding a single Markdown output.
interface = gr.Interface(
    fn=web_wrapper,
    inputs=[
        gr.Textbox(label=caption)
        for caption in (
            "Current Subjects",
            "Interests",
            "Overall Average%",
            "Grade",
            "Location (City, ON)",
        )
    ],
    outputs=gr.Markdown(label="Saarthi Advice"),
    title="Saarthi AI: Commute & Program Guide",
    description="I calculate commute times, costs, and residency options for you!",
)

# share=True publishes a temporary public URL; debug=True keeps the cell
# running so errors and logs stay visible in the notebook.
interface.launch(inline=True, share=True, debug=True)

⚡ Loaded 1399 programs from cache.
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://e0383c1c3be35e8c67.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



🔔 [ALERT 2025-11-30 04:23:41] New User from Oakville!
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://e0383c1c3be35e8c67.gradio.live


