In [None]:
# ----------------------------------------------------------------------
# 1. SETUP & INSTALLATIONS
# ----------------------------------------------------------------------
# IPython shell escape (valid in a notebook cell, not in plain Python): installs
# the third-party packages imported below; -q keeps pip's output quiet.
!pip install -q pandas sentence-transformers tqdm
# NOTE: printed unconditionally; it does not check whether pip succeeded.
print("All required libraries are installed.")

# ----------------------------------------------------------------------
# 2. IMPORTS
# ----------------------------------------------------------------------
import os
import glob
import pandas as pd
from tqdm.notebook import tqdm
import re
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from google.colab import drive

# ----------------------------------------------------------------------
# 3. CONFIG
# ----------------------------------------------------------------------
# Folder searched for the input "*.csv" files once Google Drive is mounted.
DRIVE_DATA_DIR = "/content/drive/MyDrive/LM_Project"
# Folder that receives the generated documents.json and embeddings.npy.
COLAB_OUTPUT_DIR = "/content/app_data"
# Model identifier handed to SentenceTransformer to embed the documents.
EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

# ----------------------------------------------------------------------
# 4. DATA PROCESSING FUNCTIONS
# ----------------------------------------------------------------------
def parse_term(term: str):
    """Split a term label into its calendar year and season name.

    The label is converted to a lower-cased string before matching. The year
    is the first run of four digits found (0 when there is none). The season
    is decided by substring checks for "spring", "summer" and "fall", tried
    in that order; "Unknown" is used when none of them occurs.

    Returns:
        A ``(year, season)`` tuple, e.g. ``(2021, "Fall")``.
    """
    label = str(term).lower()

    digits = re.search(r"\d{4}", label)
    year = 0 if digits is None else int(digits.group())

    # Order matters: a label naming several seasons resolves to the first
    # keyword listed here.
    for keyword in ("spring", "summer", "fall"):
        if keyword in label:
            return year, keyword.capitalize()
    return year, "Unknown"
def calculate_gpa(row: pd.Series) -> float:
    """Compute the mean grade points for one row of letter-grade counts.

    Counts are read from the keys 'A', 'B', 'C', 'D' and 'F' (a missing key
    counts as 0) and weighted 4, 3, 2, 1 and 0 respectively.

    Returns:
        The weighted average rounded to two decimals, or 0.0 when the row
        has no graded students.
    """
    weights = (('A', 4), ('B', 3), ('C', 2), ('D', 1), ('F', 0))

    points = 0
    students = 0
    for letter, weight in weights:
        count = row.get(letter, 0)
        points += count * weight
        students += count

    if students > 0:
        return round(points / students, 2)
    return 0.0
# Columns that hold text; a blank cell in one of these must read as "" so the
# "skip rows without instructor/course" check can see it is missing.
_TEXT_COLUMNS = (
    'Primary Instructor First Name',
    'Primary Instructor Last Name',
    'Subject',
    'Catalog Number',
    'Term',
)


def load_and_prepare_documents(data_dir: str) -> list:
    """Turn every row of every CSV in *data_dir* into a one-line text document.

    Each CSV is read with the 'utf-8-sig' encoding and its column names are
    stripped of surrounding whitespace. Rows that yield an empty instructor
    name or an empty course id are skipped. Every remaining row becomes a
    string of the form
    ``"Instructor: <name> | Course: <subject> <number> | Term: <season> <year> | GPA: <x.xx>"``.

    Args:
        data_dir: Directory whose "*.csv" files are loaded.

    Returns:
        The list of document strings, in sorted-filename then row order.
    """
    # glob returns matches in arbitrary order; sorting keeps the document
    # order reproducible from run to run.
    files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
    documents = []
    print(f"Found {len(files)} CSVs. Processing...")
    for file in tqdm(files, desc="Processing CSV Files"):
        df = pd.read_csv(file, encoding='utf-8-sig')
        df.columns = df.columns.str.strip()

        # Blank text cells become "" while every other blank cell (e.g. the
        # grade counts) becomes 0. Filling everything with 0 would turn a
        # missing Subject/Catalog Number into the course id "0 0", which the
        # emptiness check below could never reject.
        fill_values = {
            column: ('' if column in _TEXT_COLUMNS else 0)
            for column in df.columns
        }
        df = df.fillna(fill_values)

        for _, row in df.iterrows():
            first_name = row.get('Primary Instructor First Name', '') or ''
            last_name = row.get('Primary Instructor Last Name', '') or ''
            instructor = f"{first_name} {last_name}".strip()
            course_id = f"{row.get('Subject', '')} {row.get('Catalog Number', '')}".strip()
            if not instructor or not course_id:
                continue

            year, season = parse_term(row.get('Term', ''))
            gpa = calculate_gpa(row)
            documents.append(
                f"Instructor: {instructor} | Course: {course_id} | Term: {season} {year} | GPA: {gpa:.2f}"
            )
    return documents

# ----------------------------------------------------------------------
# 5. MAIN BLOCK
# ----------------------------------------------------------------------
# Mount Google Drive so the CSVs under DRIVE_DATA_DIR become readable.
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Drive mounted.")
os.makedirs(COLAB_OUTPUT_DIR, exist_ok=True)

# Build the text documents and persist them next to the embeddings.
documents = load_and_prepare_documents(DRIVE_DATA_DIR)
if not documents:
    # Fail loudly instead of silently writing empty artifacts.
    raise RuntimeError(
        f"No documents were produced from the CSV files in {DRIVE_DATA_DIR}; "
        "check the folder path and the CSV column names."
    )
with open(os.path.join(COLAB_OUTPUT_DIR, "documents.json"), "w", encoding="utf-8") as f:
    json.dump(documents, f)
print(f"\n Saved {len(documents)} documents to app_data/documents.json")
print("Loading embedding model (this may take a moment)...")

# No explicit device: sentence-transformers then uses a GPU when one is
# available and falls back to the CPU otherwise, instead of crashing on a
# runtime without CUDA.
model = SentenceTransformer(EMBEDDING_MODEL)
print("Creating embeddings (this is the slow part, but fast on GPU)...")
# Row i of the saved array is the embedding of documents[i].
document_embeddings = model.encode(documents, show_progress_bar=True, normalize_embeddings=True)
np.save(os.path.join(COLAB_OUTPUT_DIR, "embeddings.npy"), document_embeddings)
print(f"Saved embeddings of shape {document_embeddings.shape} to app_data/embeddings.npy")
