<a href="https://colab.research.google.com/github/RajShah3006/Saarthi/blob/main/ai_university_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 google-generativeai gradio scikit-learn

In [5]:
import requests
from bs4 import BeautifulSoup
import re
import google.generativeai as genai
from google.colab import userdata
from google.colab import drive
import time
import concurrent.futures
import json
import os
import csv
import datetime
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# --- 1. SETUP & DRIVE ---
# Configure the Gemini client from the Colab secret store. A failure here is
# reported but not fatal, so the rest of the cell still runs.
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
except Exception:
    print("‚ö†Ô∏è API Key missing.")

print("üìÇ Connecting to Google Drive...")
drive.mount('/content/drive')

# Project folder on the mounted Drive. exist_ok=True makes the creation
# idempotent and removes the check-then-create race of a separate
# os.path.exists() test.
DRIVE_FOLDER = "/content/drive/My Drive/Saarthi_Project_Data"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# JSON cache of the scraped programs and their embeddings.
CACHE_FILE = f"{DRIVE_FOLDER}/university_data_cached.json"
# CSV log that receives one row per generated report.
LOG_FILE = f"{DRIVE_FOLDER}/user_traffic_logs.csv"

# A single model and a single chat session are created here at module level;
# the report and follow-up handlers below both send their prompts through
# this one `chat` object.
model = genai.GenerativeModel('gemini-2.5-flash')
chat = model.start_chat(history=[])

# --- 2. DATA CONSTANTS ---
# Choices offered by the "Grade" dropdown.
GRADE_OPTIONS = [
    "Grade 11",
    "Grade 12",
    "Gap Year",
]

# Choices offered by the "Current Subjects" dropdown, written as
# "Course name (course code)".
ALL_COURSES = [
    "Dramatic Arts (ADA3M1)",
    "Music (AMU3M1)",
    "Visual Arts (AVI3M1)",
    "Financial Accounting (BAF3M1)",
    "Entrepreneurship (BDI3C1)",
    "Physics (SPH3U1)",
    "Biology (SBI3U1)",
    "Chemistry (SCH3U1)",
    "Functions (MCR3U1)",
    "English (ENG3U1)",
    "Computer Science (ICS3U1)",
    "Calculus and Vectors (MCV4U1)",
    "Advanced Functions (MHF4U1)",
    "English (ENG4U1)",
    "Physics (SPH4U1)",
    "Chemistry (SCH4U1)",
    "Biology (SBI4U1)",
    "Data Management (MDM4U1)",
    "Kinesiology (PSK4U1)",
]

# --- 3. UTILS ---
# Browser-like User-Agent header sent with every scraping request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def log_interaction(grade, location, interests, subjects):
    """Append one usage record to the CSV traffic log at LOG_FILE.

    When the log file does not exist yet, a header row is written first.
    Each record holds the local timestamp ("%Y-%m-%d %H:%M:%S") followed by
    grade, location, interests and subjects, in that order.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    header_needed = not os.path.exists(LOG_FILE)
    # Append mode creates the file when it is missing, so one open() covers
    # both the first write (header + row) and every later write (row only).
    with open(LOG_FILE, mode='a', newline='') as log_file:
        writer = csv.writer(log_file)
        if header_needed:
            writer.writerow(["Timestamp", "Grade", "Location", "Interests", "Subjects"])
        writer.writerow([stamp, grade, location, interests, subjects])

def save_data(data):
    """Write *data* to the cache file at CACHE_FILE as JSON.

    Args:
        data: JSON-serializable object (the list of program dicts).

    Raises:
        TypeError / ValueError: if *data* cannot be serialized. This is
            raised before the cache file is opened, so an existing cache is
            left untouched.
    """
    # Serialize first: opening with 'w' truncates the file, so serializing
    # inside the `with` block would wipe a good cache whenever json failed.
    payload = json.dumps(data)
    with open(CACHE_FILE, 'w') as f:
        f.write(payload)

def load_data():
    """Load the cached program list from CACHE_FILE, if a usable one exists.

    Returns:
        The cached data when the file exists, parses as JSON, and is a
        non-empty list whose first entry is a dict with a 'program_name'
        key. Otherwise None, which the caller treats as "no cache".
    """
    if not os.path.exists(CACHE_FILE):
        return None
    try:
        with open(CACHE_FILE, 'r') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses. An unreadable or truncated cache is ignored rather
        # than allowed to crash the cell.
        print(f"Could not read cache file ({exc}); ignoring it.")
        return None
    # Check the shape before indexing: data[0] raises on a dict payload and
    # the membership test raises on entries that are not containers.
    if isinstance(data, list) and data and isinstance(data[0], dict) and 'program_name' in data[0]:
        print(f"‚ö° Loaded {len(data)} programs from Drive.")
        return data
    return None

def get_batch_embeddings(text_list):
    """Embed a batch of texts with the "models/text-embedding-004" model.

    Args:
        text_list: Sequence of strings sent in a single embed_content call
            with task_type "retrieval_document".

    Returns:
        The 'embedding' entry of the API result. If the call fails, a list
        with one 768-element zero vector per input text is returned instead
        so that callers can continue.
    """
    try:
        result = genai.embed_content(
            model="models/text-embedding-004",
            content=text_list,
            task_type="retrieval_document",
        )
        return result['embedding']
    except Exception as exc:
        # `except Exception` instead of a bare `except` so KeyboardInterrupt
        # and SystemExit still propagate; the reason is printed because the
        # zero-vector fallback is otherwise indistinguishable from success.
        print(f"Embedding batch failed ({exc}); using zero vectors.")
        return [[0] * 768 for _ in range(len(text_list))]

def get_single_embedding(text):
    """Embed one query string with the "models/text-embedding-004" model.

    Args:
        text: Value to embed; it is converted with str() and cut to its
            first 2000 characters before being sent with task_type
            "retrieval_query".

    Returns:
        The 'embedding' entry of the API result, or a 768-element zero
        vector if the call fails.
    """
    try:
        result = genai.embed_content(
            model="models/text-embedding-004",
            content=str(text)[:2000],
            task_type="retrieval_query",
        )
        return result['embedding']
    except Exception as exc:
        # `except Exception` instead of a bare `except` so KeyboardInterrupt
        # and SystemExit still propagate; the reason is printed so a failed
        # lookup is visible.
        print(f"Query embedding failed ({exc}); using a zero vector.")
        return [0] * 768

# --- 4. SCRAPING ---
def list_all_programs(url):
    """Collect program names and links from one search-results page.

    Args:
        url: Address of a page whose results are `h2.result-heading`
            elements, each expected to wrap an `<a href=...>` link.

    Returns:
        A list of {'name': heading text, 'url': href as found on the page}
        dicts, or None if the page could not be fetched or parsed.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        programs_list = []
        for heading in soup.select('h2.result-heading'):
            link = heading.find('a', href=True)
            if link is None:
                # Subscripting the missing link used to raise TypeError,
                # which discarded every program on the page; skip only the
                # one heading instead.
                continue
            programs_list.append({'name': heading.get_text(strip=True), 'url': link['href']})
        return programs_list
    except Exception as exc:
        # Best-effort boundary: report the failure and let the caller skip
        # this page. KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Could not list programs from {url}: {exc}")
        return None

def scrape_university_info(url):
    """Scrape prerequisites and an admission-average snippet from a page.

    Prerequisites are the `<li>` texts of the first list that follows each
    text node matching "Prerequisites" or "Admission Requirements"
    (case-insensitive). The admission average is the first text node that
    contains digits followed by '%'.

    Args:
        url: Address of the program detail page.

    Returns:
        {'prerequisites': str, 'admission_average': str}, using
        "Not listed" for anything not found, or {} if the page could not be
        fetched or parsed.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        prereqs = []
        for label in soup.find_all(string=re.compile(r'Prerequisites|Admission Requirements', re.IGNORECASE)):
            lst = label.find_next(['ul', 'ol'])
            if lst:
                prereqs.extend(li.get_text(strip=True) for li in lst.select('li'))
        # dict.fromkeys() removes duplicates while keeping page order; the
        # previous list(set(...)) produced a different order on each run.
        unique_prereqs = list(dict.fromkeys(prereqs))
        avg = soup.find(string=re.compile(r'\d+%', re.IGNORECASE))
        return {
            'prerequisites': ", ".join(unique_prereqs) if unique_prereqs else "Not listed",
            'admission_average': avg.strip() if avg else "Not listed",
        }
    except Exception as exc:
        # Best-effort boundary: report the failure and return the empty
        # result. KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Could not scrape {url}: {exc}")
        return {}

# --- 5. MAIN EXECUTION ---
all_programs_detailed_data = load_data()

if not all_programs_detailed_data:
    # No usable cache: rebuild the dataset in three stages
    # (program listing -> detail pages -> embeddings), then save it.
    print("üöÄ Starting Scrape...")
    programs_with_urls = []
    for group in ('a', 'b', 'c', 'd-e', 'f-g', 'h', 'i', 'j-l', 'm', 'n-p', 'q-s', 't-z'):
        res = list_all_programs(f"https://www.ouinfo.ca/programs/search/?search=&group={group}")
        if res:
            programs_with_urls.extend(res)

    print("‚è≥ Scraping Details...")

    def process_program(entry):
        """Fetch one program's detail page and merge it with its name and URL."""
        url = f"https://www.ouinfo.ca{entry['url']}"
        return {'program_name': entry['name'], 'program_url': url, **scrape_university_info(url)}

    scraped_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_program, p) for p in programs_with_urls]
        # Results are collected in completion order, not submission order.
        for future in concurrent.futures.as_completed(futures):
            record = future.result()
            if record:
                scraped_results.append(record)

    print("üß† Generating Embeddings...")
    # Text embedded per program: name plus prerequisites, cut to 2000 chars.
    texts = [f"{item['program_name']} {item.get('prerequisites','')}"[:2000] for item in scraped_results]
    all_vectors = []
    # Batches of 50 texts, with a one-second pause after each request.
    for i in range(0, len(texts), 50):
        print(f"Embedding batch {i}...", end="\r")
        all_vectors.extend(get_batch_embeddings(texts[i : i + 50]))
        time.sleep(1)

    # Pair each program with its vector; zip() stops at the shorter list, so
    # programs without a vector are left out.
    all_programs_detailed_data = []
    for item, vector in zip(scraped_results, all_vectors):
        item['embedding'] = vector
        all_programs_detailed_data.append(item)
    save_data(all_programs_detailed_data)

# --- 6. LOGIC & STATE MANAGEMENT ---

def find_best_matches(query, data, top_k=5):
    """Return the programs whose embeddings are most similar to *query*.

    Args:
        query: Text to embed and compare against the stored embeddings.
        data: Iterable of program dicts (None is treated as empty); entries
            without an 'embedding' key are ignored.
        top_k: Maximum number of programs to return.

    Returns:
        Up to top_k program dicts ordered from highest to lowest cosine
        similarity. An empty list when no entry has an embedding or when
        top_k is not positive.
    """
    valid_data = [x for x in (data or []) if 'embedding' in x]
    # Bail out before calling the embedding API: cosine_similarity raises on
    # an empty matrix, and with top_k == 0 the slice [-0:] would select
    # every row instead of none.
    if not valid_data or top_k <= 0:
        return []
    q_vec = get_single_embedding(query)
    db_vecs = [x['embedding'] for x in valid_data]
    scores = cosine_similarity([q_vec], db_vecs)[0]
    # argsort is ascending: take the last top_k indices and reverse them.
    top_indices = scores.argsort()[-top_k:][::-1]
    return [valid_data[i] for i in top_indices]

# STEP 1: Generate the Initial Big Report
def initial_report(subjects, interests, average, grade, location):
    """Generate the first, full recommendation report for a student profile.

    Logs the request, retrieves the programs closest to *interests*, and
    sends a prompt built from the profile and those programs through the
    module-level `chat` session.

    Args:
        subjects: Selected course names (list), or an empty/None value.
        interests: Free-text interests; used as the similarity query.
        average: The student's average, as entered.
        grade: Selected grade option.
        location: The student's location, as entered.

    Returns:
        A 3-tuple matching the Gradio outputs
        [chatbot, session_state, txt_followup]: the chat history holding the
        report, the context dict ({'profile': str, 'matches': list}) kept
        for follow-up questions, and an update that shows the follow-up box.
    """
    subjects_str = ", ".join(subjects) if subjects else "None"
    log_interaction(grade, location, interests, subjects_str)

    matches = find_best_matches(interests, all_programs_detailed_data)

    # Store this context for the Chatbot to use later
    context_data = {
        "profile": f"Grade: {grade}, Avg: {average}, Loc: {location}, Subj: {subjects_str}, Int: {interests}",
        "matches": matches
    }

    # .get() with a default: a program whose detail scrape failed carries no
    # 'admission_average' / 'prerequisites' keys, and direct indexing would
    # raise KeyError for it.
    context_str = "".join(
        f"- {p['program_name']} (Avg: {p.get('admission_average', 'Not listed')})\n"
        f"  Prereqs: {p.get('prerequisites', 'Not listed')}\n"
        f"  Link: {p['program_url']}\n\n"
        for p in matches
    )

    # The commute step interpolates the `location` argument; it previously
    # referenced an undefined `user_data` dict and raised NameError.
    prompt = f"""
    Act as 'Saarthi', a wise university guidance counselor.
    PROFILE: {context_data['profile']}
    MATCHES: {context_str}

    INSTRUCTIONS:
    1. **Rank & Recommend:** Recommend the top 10 programs.
    2. **Prerequsite Check:** Compare "Subjects" vs "Prereqs". Warn if missing.
    3. **Fit Analysis:** Explain fit.
    4. **Extracurriculars:** Suggest side projects.
    5. **COMMUTE ANALYSIS:** - Calculate estimated travel time from '{location}' to the university.
       - If > 1 hour, recommend RESIDENCE.
       - Estimate Cost (GO Train/Gas).
    6. **Tone:** Warm and supportive. Use emojis.
    """
    response = chat.send_message(prompt).text

    # Return 3 things:
    # 1. The Chat History (User message + AI response)
    # 2. The Context Data (saved to State)
    # 3. Visibility update for the Follow-up Box
    return [(None, response)], context_data, gr.update(visible=True)

# STEP 2: Handle Follow-up Questions
def follow_up_chat(user_message, history, context_data):
    """Answer a follow-up question using the programs from the last report.

    Args:
        user_message: The question typed by the user.
        history: Current chat history as a list of (user, bot) tuples;
            None is treated as an empty history.
        context_data: The dict saved by the report step
            ({'profile': str, 'matches': list}), or a falsy value when no
            report has been generated yet.

    Returns:
        A 2-tuple (history, "") on every path, matching the Gradio outputs
        [chatbot, txt_followup]; the empty string clears the textbox.
    """
    if history is None:
        history = []

    if not context_data:
        # This branch used to return the history alone, which does not match
        # the two outputs wired to this handler.
        return history + [(user_message, "Please generate a report first!")], ""

    # Re-build context string from saved state. .get() with a default covers
    # programs whose detail scrape failed and therefore lack these keys.
    context_str = "".join(
        f"- {p['program_name']} (Avg: {p.get('admission_average', 'Not listed')})\n"
        f"  Prereqs: {p.get('prerequisites', 'Not listed')}\n\n"
        for p in context_data['matches']
    )

    prompt = f"""
    CONTEXT:
    The user previously asked for university advice.
    Profile: {context_data['profile']}
    University Options discussed: {context_str}

    NEW USER QUESTION: "{user_message}"

    INSTRUCTIONS:
    Answer the question based on the University Options above.
    If they ask about tuition, campus life, or specific details not in the text, use your general knowledge as an AI.
    Keep it conversational.
    """

    response = chat.send_message(prompt).text
    history.append((user_message, response))
    return history, "" # Return history and clear the textbox

# --- 7. HYBRID UI (The Blocks System) ---

with gr.Blocks(theme=gr.themes.Soft()) as app:
    # Holds the context dict returned by initial_report between events; it
    # is not rendered in the page.
    session_state = gr.State()

    gr.Markdown("# üèπ Saarthi: AI University Guide")
    gr.Markdown("Start by filling out your profile to get a custom roadmap. Then, chat with Saarthi to ask more questions!")

    with gr.Row():
        # Left column: the student profile form.
        with gr.Column(scale=1):
            inp_subjects = gr.Dropdown(
                ALL_COURSES,
                multiselect=True,
                label="Current Subjects",
                allow_custom_value=True,
            )
            inp_interests = gr.Textbox(label="Interests (e.g. Robotics)")
            inp_avg = gr.Textbox(label="Average %")
            inp_grade = gr.Dropdown(GRADE_OPTIONS, label="Grade")
            inp_loc = gr.Textbox(label="Location (City, ON)")

            btn_generate = gr.Button("üöÄ Generate Roadmap", variant="primary")

        # Right column: the conversation and the follow-up controls.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Saarthi's Advice", height=550, bubble_full_width=False)

            # Both follow-up controls start hidden and are revealed once a
            # report has been generated.
            txt_followup = gr.Textbox(
                label="Ask a follow-up question...",
                placeholder="e.g. 'How much is tuition?'",
                visible=False,
            )
            btn_ask = gr.Button("Ask", visible=False)

    # Generating the report fills the chat, stores the context in the state
    # and reveals the follow-up textbox; the chained step then reveals the
    # "Ask" button as well.
    report_event = btn_generate.click(
        fn=initial_report,
        inputs=[inp_subjects, inp_interests, inp_avg, inp_grade, inp_loc],
        outputs=[chatbot, session_state, txt_followup],
    )
    report_event.then(fn=lambda: gr.update(visible=True), inputs=None, outputs=btn_ask)

    # Pressing Enter in the textbox and clicking "Ask" run the same handler
    # with the same inputs and outputs.
    for register in (txt_followup.submit, btn_ask.click):
        register(
            fn=follow_up_chat,
            inputs=[txt_followup, chatbot, session_state],
            outputs=[chatbot, txt_followup],
        )

app.launch(inline=True, share=True, debug=True)

📂 Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
⚡ Loaded 1399 programs from Drive.


  with gr.Blocks(theme=gr.themes.Soft()) as app:
  chatbot = gr.Chatbot(label="Saarthi's Advice", height=550, bubble_full_width=False)
  chatbot = gr.Chatbot(label="Saarthi's Advice", height=550, bubble_full_width=False)
  chatbot = gr.Chatbot(label="Saarthi's Advice", height=550, bubble_full_width=False)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://213b328ab877ce0f9d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://213b328ab877ce0f9d.gradio.live


