<a href="https://colab.research.google.com/github/RajShah3006/Saarthi/blob/main/ai_university_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 google-generativeai gradio scikit-learn

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import google.generativeai as genai
from google.colab import userdata
from google.colab import drive
import time
import concurrent.futures
import json
import os
import csv
import datetime
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# --- 1. SETUP & DRIVE ---
# Configure the Gemini client from the Colab secret named GOOGLE_API_KEY.
# Any failure here is reported (with its cause) and execution continues.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
except Exception as exc:
    print(f"⚠️ API Key missing. Please check Colab Secrets. ({exc})")

print("📂 Connecting to Google Drive...")
drive.mount('/content/drive')

# Project folder on the mounted Drive; holds the scrape cache and the traffic log.
DRIVE_FOLDER = "/content/drive/My Drive/Saarthi_Project_Data"
# exist_ok=True removes the check-then-create race of a separate os.path.exists test.
os.makedirs(DRIVE_FOLDER, exist_ok=True)

CACHE_FILE = f"{DRIVE_FOLDER}/university_data_cached.json"
LOG_FILE = f"{DRIVE_FOLDER}/user_traffic_logs.csv"

# Generative model and a module-level chat session created from it.
model = genai.GenerativeModel('gemini-2.5-flash')
chat = model.start_chat(history=[])

# --- 2. DATA CONSTANTS ---
# Choices offered by the "Grade" dropdown.
GRADE_OPTIONS = ["Grade 11", "Grade 12", "Gap Year"]

# Course catalogue offered by the "Subjects" dropdown.
# Every entry is a "Title (COURSE_CODE)" string.
ALL_COURSES = [
    # --- GRADE 12 COURSES ---
    "Dramatic Arts (ADA4M1)", "Drama Film/Video (ADV4M1)",
    "Exploring and Creating in the Arts (AEA4O1)", "Guitar Music (AMG4M1)",
    "Music (AMU4M1)", "Visual Arts (AVI4M1)", "Visual Arts - Info/Consumer (AWE4M1)",
    "Visual Arts - Fashion (AWI4M1)", "Visual Arts - Drawing (AWM4M1)",
    "Photography (AWQ4M1)", "Visual Arts - Film/Video (AWR4M1)",
    "Visual Arts - Computer (AWS4M1)", "Visual Arts - Non-Traditional (AWT4M1)",
    "Entrepreneurship: Venture Planning (BDV4C1)",
    "Environment & Resource Mgmt (CGR4M1)", "World Issues (CGW4U1)",
    "Canada: History, Identity, Culture (CHI4U1)", "World History (CHY4U1)",
    "Canadian & International Law (CLN4U1)", "Canadian & World Politics (CPW4U1)",
    "English (ENG4U1)", "English (College) (ENG4C1)", "Studies in Literature (ETS4U1)",
    "The Writer's Craft (EWC4U1)", "Nutrition and Health (HFA4U1)",
    "Personal Life Management (HIP4O1)", "Challenge and Change in Society (HSB4U1)",
    "Equity and Social Justice (HSE4M1)", "Philosophy (HZT4U1)",
    "Interdisciplinary Studies (IDC4U1)", "Foundations for College Math (MAP4C1)",
    "Calculus and Vectors (MCV4U1)", "Data Management (MDM4U1)",
    "Advanced Functions (MHF4U1)", "Literacy Course (OLC4O1)",
    "Personal Fitness (PAF4O1)", "Recreation Leadership (PLF4M1)",
    "Healthy Active Living (PPL4O1)", "Kinesiology (PSK4U1)",
    "Biology (SBI4U1)", "Chemistry (SCH4U1)", "Physics (SPH4U1)",

    # --- GRADE 11 COURSES ---
    "Dramatic Arts (ADA3M1)", "Drama Film/Video (ADV3M1)", "Guitar Music (AMG3M1)",
    "Media Arts (ASM3M1)", "Visual Arts (AVI3M1)", "Visual Arts - Crafts (AWA3M1)",
    "Visual Arts - Fashion (AWI3M1)", "Photography (AWQ3M1)",
    "Financial Accounting (BAF3M1)", "Entrepreneurship (BDI3C1)",
    "Marketing (BMI3C1)", "Forces of Nature (CGF3M1)", "Travel and Tourism (CGG3O1)",
    "Genocide and Crimes Against Humanity (CHG381)", "World History to 16th Century (CHW3M1)",
    "Understanding Canadian Law (CLU3M1)", "Media Studies (EMS3O1)",
    "Food and Culture (HFC3M1)", "World Religions (HRT3M1)",
    "Intro to Anthropology/Psych/Soc (HSP3U1)", "Philosophy: Big Questions (HZB3M1)",
    "Functions (MCR3U1)", "Functions & Applications (MCF3M1)",
    # "Métis" was stored mis-encoded; it is spelled with a real e-acute here.
    "First Nations, Métis, Inuit Voices (NBE3U1)", "Biology (SBI3U1)",
    "Chemistry (SCH3U1)", "Physics (SPH3U1)", "Environmental Science (SVN3M1)",
    "Technological Design (TDJ3M1)", "Hairstyling and Aesthetics (TXJ3E1)"
]

# --- 3. UTILS (FIXED) ---

# Browser-like User-Agent string, split across lines for readability.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)

# Request headers passed to requests.get by the scraping functions.
HEADERS = {'User-Agent': _USER_AGENT}

def log_interaction(grade, location, interests, subjects):
    """Append one usage record to the CSV traffic log at ``LOG_FILE``.

    A header row is written first whenever the log does not exist yet or
    exists but is empty, so the file always starts with column names.

    Args:
        grade: Grade selected by the user.
        location: Location entered by the user.
        interests: Free-text interests entered by the user.
        subjects: Subjects as a single, already joined string.

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    needs_header = not os.path.exists(LOG_FILE) or os.path.getsize(LOG_FILE) == 0

    # One append-mode open covers both the header and the record.
    # Explicit UTF-8 keeps non-ASCII user input independent of the locale.
    with open(LOG_FILE, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["Timestamp", "Grade", "Location", "Interests", "Subjects"])
        writer.writerow([timestamp, grade, location, interests, subjects])

def save_data(data):
    """Write ``data`` as JSON to ``CACHE_FILE`` without corrupting an existing cache.

    The JSON is written to a temporary sibling file and then moved over
    ``CACHE_FILE`` with ``os.replace``. If serialization or writing fails,
    the previous cache file is left untouched.

    Args:
        data: JSON-serializable object to persist.

    Raises:
        TypeError: If ``data`` is not JSON-serializable.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{CACHE_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        os.replace(tmp_path, CACHE_FILE)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up the partial file left behind by a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def load_data():
    """Load the cached program list from ``CACHE_FILE``.

    Returns:
        The cached list when the file exists, parses as JSON, and is a
        non-empty list whose first element is a dict with a
        ``'program_name'`` key; otherwise ``None``. An unreadable or
        malformed cache is reported and treated as missing.
    """
    if not os.path.exists(CACHE_FILE):
        return None

    try:
        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError subclass. A truncated or
        # corrupt cache should fall back to "no cache", not raise.
        print(f"Cache Error: {exc}")
        return None

    # Validate the shape before indexing so a dict or scalar payload
    # cannot raise here.
    if (
        isinstance(data, list)
        and data
        and isinstance(data[0], dict)
        and 'program_name' in data[0]
    ):
        print(f"⚡ Loaded {len(data)} programs from Drive.")
        return data
    return None

# Embeds a whole list of texts with one embed_content call.
def get_batch_embeddings(text_list):
    """Return one embedding per entry of ``text_list``.

    The texts are sent to ``models/text-embedding-004`` with task type
    ``retrieval_document``. If anything in that call raises, the error is
    printed and a 768-element zero vector is returned for each input.

    Args:
        text_list: Texts to embed.

    Returns:
        The ``'embedding'`` entry of the API result, or the zero-vector
        fallback described above.
    """
    try:
        return genai.embed_content(
            task_type="retrieval_document",
            content=text_list,
            model="models/text-embedding-004",
        )['embedding']
    except Exception as err:
        print(f"Batch Error: {err}")
    # Reached only after a failure: one independent zero vector per input.
    return [[0] * 768 for _ in text_list]

def get_single_embedding(text):
    """Return the query embedding for ``text``.

    ``text`` is converted with ``str`` and truncated to 2000 characters,
    then embedded with ``models/text-embedding-004`` using task type
    ``retrieval_query``.

    Args:
        text: Query to embed.

    Returns:
        The ``'embedding'`` entry of the API result. If the conversion or
        the API call raises an ``Exception``, the error is printed and a
        768-element zero vector is returned instead.
    """
    try:
        query = str(text)[:2000]
        result = genai.embed_content(
            model="models/text-embedding-004",
            content=query,
            task_type="retrieval_query"
        )
        return result['embedding']
    except Exception as e:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # propagate, and the failure is reported like the batch variant does.
        print(f"Embedding Error: {e}")
        return [0] * 768

# --- 4. SCRAPING ---
def list_all_programs(url):
    """Collect program names and links from one search-results page.

    Every ``h2.result-heading`` element that contains an ``<a href=...>``
    contributes one entry; headings without such a link are skipped.

    Args:
        url: Address of the results page to fetch.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, where ``'url'`` is
        the anchor's ``href`` exactly as it appears in the page. Returns
        ``None`` if the request fails, the server answers with an HTTP
        error status, or parsing raises.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Report HTTP error statuses instead of parsing an error page
        # that would quietly yield no programs.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        programs_list = []
        for el in soup.select('h2.result-heading'):
            link = el.find('a', href=True)
            if link is None:
                # One malformed heading must not discard the whole page.
                continue
            programs_list.append({'name': el.get_text(strip=True), 'url': link['href']})
        return programs_list
    except Exception as e:
        # Narrowed from a bare ``except:`` so the scrape stays interruptible.
        print(f"Listing Error ({url}): {e}")
        return None

def scrape_university_info(url):
    """Extract prerequisites and an admission-average snippet from a program page.

    Prerequisites are the ``<li>`` texts of the first ``<ul>``/``<ol>``
    found after each text node matching "Prerequisites" or "Admission
    Requirements" (case-insensitive). The admission average is the first
    text node in the page that contains digits followed by ``%``.

    Args:
        url: Address of the program page to fetch.

    Returns:
        A dict with ``'prerequisites'`` (comma-joined, de-duplicated in
        page order) and ``'admission_average'``; either value is
        ``"Not listed"`` when nothing matched. Returns an empty dict when
        the request or parsing raises, so callers must tolerate missing keys.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        heading_pattern = re.compile(r'Prerequisites|Admission Requirements', re.IGNORECASE)
        prereqs = []
        for h in soup.find_all(string=heading_pattern):
            lst = h.find_next(['ul', 'ol'])
            if lst:
                prereqs.extend(li.get_text(strip=True) for li in lst.select('li'))

        # dict.fromkeys removes duplicates but keeps first-seen order, so the
        # joined text is the same on every run (a set gives arbitrary order).
        unique_prereqs = list(dict.fromkeys(prereqs))

        data = {}
        data['prerequisites'] = ", ".join(unique_prereqs) if unique_prereqs else "Not listed"
        avg = soup.find(string=re.compile(r'\d+%'))
        data['admission_average'] = avg.strip() if avg else "Not listed"
        return data
    except Exception as e:
        # Narrowed from a bare ``except:`` so the scrape stays interruptible.
        print(f"Scrape Error ({url}): {e}")
        return {}

# --- 5. MAIN EXECUTION ---
# Use the cached dataset when load_data() returns one; otherwise scrape
# the program listings, embed them, and write the cache.
all_programs_detailed_data = load_data()

if not all_programs_detailed_data:
    print("🚀 Starting Scrape...")
    programs_with_urls = []
    # Full alphabet required for real scrape
    alphabet = ['a', 'b', 'c', 'd-e', 'f-g', 'h', 'i', 'j-l', 'm', 'n-p', 'q-s', 't-z']

    for group in alphabet:
        res = list_all_programs(f"https://www.ouinfo.ca/programs/search/?search=&group={group}")
        if res:
            programs_with_urls.extend(res)

    print(f"✅ Found {len(programs_with_urls)} programs. Deep scraping...")
    scraped_results = []

    def process_program(entry):
        """Fetch one program page and merge its details with the listing entry."""
        url = f"https://www.ouinfo.ca{entry['url']}"
        data = scrape_university_info(url)
        return {'program_name': entry['name'], 'program_url': url, **data}

    # Program pages are fetched concurrently; results arrive in completion order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_program, p) for p in programs_with_urls]
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:
                scraped_results.append(result)

    print("🧠 Generating Embeddings (Batch Mode)...")
    # Text to embed per program: name plus prerequisites, capped at 2000 characters.
    texts = [f"{item['program_name']} {item.get('prerequisites','')}"[:2000] for item in scraped_results]
    all_vectors = []

    # Process 50 at a time
    for i in range(0, len(texts), 50):
        print(f"Embedding batch {i}...", end="\r")
        all_vectors.extend(get_batch_embeddings(texts[i : i + 50]))
        time.sleep(0.5)

    # Pair each program with its vector. zip stops at the shorter sequence,
    # so programs without a vector are left out of the dataset.
    # NOTE(review): get_batch_embeddings returns zero vectors on failure and
    # those are cached here as-is — confirm whether failed batches should be retried.
    all_programs_detailed_data = []
    for item, vector in zip(scraped_results, all_vectors):
        item['embedding'] = vector
        all_programs_detailed_data.append(item)

    save_data(all_programs_detailed_data)

# --- 6. LOGIC ---

def find_best_matches(query, data, top_k=5):
    """Return the ``top_k`` programs whose embeddings are closest to ``query``.

    Args:
        query: Free-text query; embedded with ``get_single_embedding``.
        data: List of program dicts. Only entries with an ``'embedding'``
            key take part in the ranking.
        top_k: Maximum number of programs to return.

    Returns:
        Up to ``top_k`` program dicts ordered by descending cosine
        similarity. If no entry has an embedding, the first ``top_k``
        entries of ``data`` are returned unranked. A ``top_k`` of zero or
        less yields an empty list.
    """
    if top_k <= 0:
        # ``[-0:]`` would select every row, so handle this case explicitly.
        return []

    valid_data = [x for x in data if 'embedding' in x]
    if not valid_data:
        # Nothing can be scored: honour top_k and skip the embedding request.
        return data[:top_k]

    q_vec = get_single_embedding(query)
    db_vecs = [x['embedding'] for x in valid_data]
    scores = cosine_similarity([q_vec], db_vecs)[0]
    # Ascending argsort reversed gives best-first; slice to the requested size.
    top_indices = scores.argsort()[::-1][:top_k]
    return [valid_data[i] for i in top_indices]

def generate_chatbot_response(user_data, relevant_programs):
    """Ask the model for counselling advice about the candidate programs.

    Args:
        user_data: Dict with the keys ``'intrests'`` (spelling as used by
            the caller), ``'grade'``, ``'overall_average'``,
            ``'subjects'`` and ``'location'``.
        relevant_programs: Program dicts containing ``'program_name'`` and
            ``'program_url'``; ``'admission_average'`` and
            ``'prerequisites'`` are optional and shown as "Not listed"
            when absent.

    Returns:
        The model's reply text, or ``"Error: <message>"`` if the model
        call raises.
    """
    # Programs whose page scrape failed carry no average/prerequisite keys,
    # so read those with a default instead of indexing.
    context = "".join(
        f"- {p['program_name']} (Avg: {p.get('admission_average', 'Not listed')})\n"
        f"  Prereqs: {p.get('prerequisites', 'Not listed')}\n"
        f"  Link: {p['program_url']}\n\n"
        for p in relevant_programs
    )

    prompt = f"""
    Act as 'Saarthi', a wise university guidance counselor.

    STUDENT:
    - Interests: {user_data['intrests']}
    - Grade: {user_data['grade']}
    - Avg: {user_data['overall_average']}
    - Courses: {user_data['subjects']}
    - Location: {user_data['location']}

    OPTIONS:
    {context}

    TASK:
    1. **Rank & Recommend:** Recommend the top 10 programs.
    2. **Prerequsite Check:** Compare "Subjects" vs "Prereqs". Warn if missing.
    3. **Fit Analysis:** Explain fit.
    4. **Extracurriculars:** Suggest side projects.
    5. **COMMUTE ANALYSIS:** - Calculate estimated travel time from '{user_data['location']}' to the university.
       - If > 1 hour, recommend RESIDENCE.
       - Estimate Cost (GO Train/Gas).
    6. **Tone:** Warm and supportive. Use emojis.
    """
    try:
        # A fresh session per request: a shared module-level chat would send
        # every earlier student's details along with each new prompt.
        session = model.start_chat(history=[])
        return session.send_message(prompt).text
    except Exception as e:
        return f"Error: {e}"

def web_wrapper(subjects_list, interests, average, grade, location):
    """Handle one form submission and return the advice text.

    Args:
        subjects_list: Selected subjects (list of strings) or ``None``.
        interests: Free-text interests; also used as the search query.
        average: Overall average as entered by the user.
        grade: Selected grade.
        location: City entered by the user.

    Returns:
        The advice produced by ``generate_chatbot_response``, or a short
        request for input when ``interests`` is empty.
    """
    # The interests text is the retrieval query; without it the matches
    # would be arbitrary, so ask for it before doing any work.
    if not (interests or "").strip():
        return "Please describe your interests so Saarthi can find matching programs."

    # Convert list to string for AI
    subjects_str = ", ".join(map(str, subjects_list)) if subjects_list else "None"

    try:
        log_interaction(grade, location, interests, subjects_str)
    except OSError as e:
        # Usage logging is secondary: a Drive I/O problem must not block the advice.
        print(f"Logging Error: {e}")

    user_data = {
        'subjects': subjects_str, 'intrests': interests,
        'overall_average': average, 'grade': grade, 'location': location
    }
    # The prompt asks the model for a top 10, so supply ten candidates.
    matches = find_best_matches(interests, all_programs_detailed_data, top_k=10)
    return generate_chatbot_response(user_data, matches)

# --- 7. UI ---
# Input widgets, declared in the order of web_wrapper's parameters.
subjects_input = gr.Dropdown(
    ALL_COURSES,
    multiselect=True,
    label="Subjects",
    info="Type to search (e.g. 'Math'). You can also type custom courses.",
    # Lets users enter courses that are not in ALL_COURSES.
    allow_custom_value=True,
)
interests_input = gr.Textbox(label="Interests")
average_input = gr.Textbox(label="Average %")
grade_input = gr.Dropdown(GRADE_OPTIONS, label="Grade")
location_input = gr.Textbox(label="Location (City)")

# Single-form app: the five inputs go to web_wrapper, the reply is shown as Markdown.
interface = gr.Interface(
    fn=web_wrapper,
    inputs=[
        subjects_input,
        interests_input,
        average_input,
        grade_input,
        location_input,
    ],
    outputs=gr.Markdown(label="Saarthi Advice"),
    title="Saarthi AI",
    description="Your Personal University Guide",
)

interface.launch(inline=True, share=True, debug=True)