In [1]:
# !python -m pip install -q langchain langchain-community langchain-chroma langchain-google-genai
# !python -m pip install -q chromadb pypdf faiss-cpu

In [2]:
import os
import pandas as pd
from pathlib import Path
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document
import shutil
from typing import List, Tuple

In [3]:
def normalize_ids(titles_or_ids):
    """Resolve a mix of course IDs and course titles to course IDs.

    Each entry is stringified and stripped of surrounding whitespace. An entry
    that matches a value in ``courses["course_id"]`` is kept as-is; otherwise
    it is looked up case-insensitively in ``title_to_id``. Entries that match
    neither are dropped.

    Relies on the notebook-level ``courses`` DataFrame and ``title_to_id``
    mapping, which must exist by the time this function is called.

    Args:
        titles_or_ids: Iterable of course IDs and/or titles. ``None`` is
            treated as empty, and a single bare string is treated as one
            entry rather than being iterated character by character.

    Returns:
        list[str]: Resolved course IDs in input order (duplicates preserved).
    """
    if titles_or_ids is None:
        return []
    if isinstance(titles_or_ids, str):
        titles_or_ids = [titles_or_ids]

    # Compare as strings (title_to_id also stores stringified IDs) and build
    # the lookup set once instead of scanning the column for every entry.
    known_ids = set(courses["course_id"].astype(str))

    resolved = []
    for entry in titles_or_ids:
        key = str(entry).strip()
        if key in known_ids:
            resolved.append(key)
            continue
        matched_id = title_to_id.get(key.lower())
        if matched_id:
            resolved.append(matched_id)
    return resolved

In [4]:
def recommend_courses(profile, completed_ids, top_k):
    """Return up to ``top_k`` courses closest to a learner profile.

    The profile text is used as the query against the notebook-level
    ``vectorstore``; courses the learner has already completed are skipped.

    Args:
        profile: Free-text description of the learner, used as the query.
        completed_ids: Course IDs and/or titles already completed; resolved
            through ``normalize_ids``. ``None`` is treated as empty.
        top_k: Maximum number of recommendations to return. Values <= 0
            yield an empty list.

    Returns:
        list of ``(course_id, score)`` tuples in the order returned by the
        vector store. ``score`` is the float reported by
        ``similarity_search_with_score``. NOTE: for Chroma this is a
        distance, so a lower score means a closer match.
    """
    if top_k is None or top_k <= 0:
        return []

    raw_completed = [] if completed_ids is None else completed_ids
    # Stringify so the membership test below does not depend on how the
    # course_id happened to be typed in the document metadata.
    completed = {str(cid) for cid in normalize_ids(raw_completed)}

    # Only completed courses are ever filtered out, so asking for
    # top_k + len(completed) hits always leaves top_k candidates after
    # filtering (a fixed multiple of top_k can fall short).
    hits = vectorstore.similarity_search_with_score(
        profile, k=top_k + len(completed)
    )

    recommendations = []
    for doc, score in hits:
        cid = doc.metadata.get("course_id")
        if str(cid) in completed:
            continue
        recommendations.append((cid, float(score)))
        if len(recommendations) == top_k:
            break
    return recommendations

In [5]:
def recommend(recs):
    """Turn recommendation tuples into a display-ready DataFrame.

    Titles are looked up in the notebook-level ``courses`` DataFrame.

    Args:
        recs: Iterable of ``(course_id, score)`` pairs, e.g. the output of
            ``recommend_courses``.

    Returns:
        pandas.DataFrame with columns ``course_id``, ``title`` and
        ``similarity`` (the score rounded to 4 decimals), one row per input
        pair in input order. The columns are present even when ``recs`` is
        empty. A course_id that is not in ``courses`` gets a missing title
        instead of raising.
    """
    columns = ["course_id", "title", "similarity"]

    # Build the id -> title lookup once; keep the first row for a duplicated
    # id, which is the row a per-id filter followed by .iloc[0] would pick.
    catalog = courses.drop_duplicates(subset=["course_id"])
    title_by_id = dict(zip(catalog["course_id"], catalog["title"]))

    rows = [
        {
            "course_id": cid,
            "title": title_by_id.get(cid),
            "similarity": round(float(score), 4),
        }
        for cid, score in recs
    ]
    return pd.DataFrame(rows, columns=columns)

In [6]:
# Notebook-wide settings for the embedding model and the Chroma index.
embedding_model_name = "models/gemini-embedding-001"  # model id given to the embeddings client
vector_db_path = "VectorDB_Chroma"  # directory used as Chroma's persist location
collection_name = "courses_v1"  # name of the Chroma collection

In [7]:
# Location of the course catalogue CSV. Set the COURSES_CSV_PATH environment
# variable to point at your own copy; when it is unset, the original author's
# local path is used as the fallback so existing runs keep working.
csv_path = Path(
    os.environ.get(
        "COURSES_CSV_PATH",
        "/home/zadmin/Desktop/test/GAAI-B5-GCP/datasets/assignment2dataset.csv",
    )
)
if not csv_path.is_file():
    raise FileNotFoundError(
        f"Course CSV not found at {csv_path}. "
        "Set the COURSES_CSV_PATH environment variable to its location."
    )
courses = pd.read_csv(csv_path)

In [8]:
# Columns that later cells read from the catalogue (text building and
# document metadata). Fail here with a clear message rather than with a
# KeyError further down.
required_cols = {"course_id", "title", "description"}
missing = required_cols - set(courses.columns)
if missing:
    raise ValueError(
        f"Course CSV is missing required column(s): {sorted(missing)}"
    )

In [9]:
# Show which required columns are absent from the CSV; an empty set means none are missing.
missing

set()

In [10]:
if "text" not in courses.columns:
    # Blank out missing values *before* casting to str; otherwise NaN becomes
    # the literal string "nan" and ends up embedded as course text.
    title_part = courses["title"].fillna("").astype(str).str.strip()
    desc_part = courses["description"].fillna("").astype(str).str.strip()

    # "<title>: <description>", falling back to whichever part exists so that
    # a row with neither yields "" instead of a stray ":".
    combined = title_part + ": " + desc_part
    combined = combined.where(desc_part.ne(""), title_part)  # no description -> title only
    combined = combined.where(title_part.ne(""), desc_part)  # no title -> description only
    courses["text"] = combined

In [11]:
# Tidy the catalogue in one pass: normalise the text column, drop rows whose
# text is empty, keep the first row per course_id, and renumber the index.
courses = (
    courses
    .assign(text=lambda frame: frame["text"].fillna("").str.strip())
    .loc[lambda frame: frame["text"].str.len() > 0]
    .copy()
    .drop_duplicates(subset=["course_id"])
    .reset_index(drop=True)
)

In [12]:
# Quick sanity check: preview the first two rows and report the catalogue size.
print(courses.iloc[:2])
print("Loaded {} courses".format(len(courses)))

  course_id                                    title  \
0      C001          Foundations of Machine Learning   
1      C002  Deep Learning with TensorFlow and Keras   

                                         description  \
0  Understand foundational machine learning algor...   
1  Explore neural network architectures using Ten...   

                                                text  
0  Foundations of Machine Learning: Understand fo...  
1  Deep Learning with TensorFlow and Keras: Explo...  
Loaded 25 courses


In [13]:
# Start from a clean slate: remove any index directory persisted by a previous run.
if os.path.exists(vector_db_path):
    shutil.rmtree(vector_db_path)

In [14]:
# Embedding client for the model named in embedding_model_name; it is handed to
# Chroma.from_documents when the index is built.
# NOTE(review): presumably authenticates via a Google API key taken from the environment — confirm.
embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)

In [15]:
def _course_to_document(course):
    """Wrap one catalogue row as a Document: text as content, id/title/description as metadata."""
    details = {
        "course_id": course["course_id"],
        "title": course["title"],
        "description": course["description"],
    }
    return Document(page_content=course["text"], metadata=details)


# One Document per catalogue row, in catalogue order.
docs = [_course_to_document(course) for _, course in courses.iterrows()]


In [16]:
# Build the Chroma collection from the course documents using the embedding
# client, with persist_directory pointing at vector_db_path and the collection
# named by collection_name.
# NOTE(review): this presumably embeds every document on each run — confirm the API cost is acceptable.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=vector_db_path,
    collection_name=collection_name,
    collection_metadata={"use_type": "COURSE_RECO_ENGINE"}
)

In [17]:
# Sanity check: count the ids stored in the collection (should match the number of course documents — verify).
print("Indexed docs:", len(vectorstore.get()["ids"]))

Indexed docs: 25


In [18]:
# Case-insensitive lookup from a course title to its (stringified) course_id,
# used by normalize_ids. When two rows share a title, the later row wins.
title_to_id = {
    str(title).strip().lower(): str(course_id)
    for title, course_id in zip(courses["title"], courses["course_id"])
}

In [19]:
# Retriever view over the Chroma store, configured for similarity search with k=10.
# NOTE(review): not referenced by any other visible cell — recommend_courses
# queries vectorstore directly; confirm whether this is still needed.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})

In [20]:
# Evaluation cases for the recommender. Each entry has:
#   "profile"   - the free-text learner query passed to recommend_courses
#   "completed" - course titles (or ids) to exclude from the recommendations
test_profiles = [
    {
        "profile": "I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?",
        "completed": ["Python Programming for Data Science"], 
    },
    {
        "profile": "I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.",
        "completed": [],
    },
    {
        "profile": "My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.",
        "completed": ["Foundations of Machine Learning"],
    },
    {
        "profile": "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
        "completed": [],
    },
    {
        "profile": "I’m interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?",
        "completed": [],
    },
]

# Run every test profile through the recommender, show the resulting table,
# and keep each (profile, table) pair in all_results for later inspection.
all_results = []
for i, tp in enumerate(test_profiles, start=1):
    recs = recommend_courses(tp["profile"], tp["completed"], 5)
    df = recommend(recs)
    print(f"\n=== Test Profile {i} ===", tp["profile"], sep="\n")
    print(f"Completed: {tp['completed']}")
    display(df)
    all_results.append((tp, df))



=== Test Profile 1 ===
I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?
Completed: ['Python Programming for Data Science']


Unnamed: 0,course_id,title,similarity
0,C014,Data Visualization with Tableau,0.3308
1,C017,R Programming and Statistical Analysis,0.3941
2,C012,SQL for Data Analysis,0.4052
3,C001,Foundations of Machine Learning,0.4098
4,C003,Natural Language Processing Fundamentals,0.4226



=== Test Profile 2 ===
I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.
Completed: []


Unnamed: 0,course_id,title,similarity
0,C007,Cloud Computing with Azure,0.3016
1,C009,Containerization with Docker and Kubernetes,0.3064
2,C008,DevOps Practices and CI/CD,0.3373
3,C025,MLOps: Productionizing Machine Learning,0.378
4,C006,Data Engineering on AWS,0.4332



=== Test Profile 3 ===
My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.
Completed: ['Foundations of Machine Learning']


Unnamed: 0,course_id,title,similarity
0,C025,MLOps: Productionizing Machine Learning,0.3061
1,C002,Deep Learning with TensorFlow and Keras,0.3137
2,C005,Reinforcement Learning Basics,0.3409
3,C003,Natural Language Processing Fundamentals,0.3608
4,C004,Computer Vision and Image Processing,0.3745



=== Test Profile 4 ===
I want to learn to build and deploy microservices with Kubernetes—what courses fit best?
Completed: []


Unnamed: 0,course_id,title,similarity
0,C009,Containerization with Docker and Kubernetes,0.2241
1,C010,APIs and Microservices Architecture,0.3061
2,C008,DevOps Practices and CI/CD,0.3476
3,C007,Cloud Computing with Azure,0.3604
4,C025,MLOps: Productionizing Machine Learning,0.3733



=== Test Profile 5 ===
I’m interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?
Completed: []


Unnamed: 0,course_id,title,similarity
0,C023,Blockchain Technology and Smart Contracts,0.2014
1,C021,Cybersecurity Fundamentals,0.3846
2,C022,Internet of Things (IoT) Development,0.3924
3,C009,Containerization with Docker and Kubernetes,0.4004
4,C024,Augmented and Virtual Reality Development,0.4013


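## Relevance Notes

> **Reading the scores:** the `similarity` column holds the score returned by Chroma's `similarity_search_with_score`, which is a distance. Lower values mean a closer match, which is why each table is listed in ascending order.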

#### Test Profile 1:

 - Only the Tableau course is a clear fit
 - Others are generic matches due to catalog overlap.

#### Test Profile 2:

 - Top 3 are good
 - Bottom 2 are near misses.

#### Test Profile 3:

 - Top 2 are ideal
 - Rest are domain tracks that can follow after core DL + MLOps.

#### Test Profile 4:

 - Top 3 align with building/deploying microservices on K8s.

#### Test Profile 5:

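 - Only the top result (Blockchain Technology and Smart Contracts) is truly relevant; the other four are unrelated and have much larger distance scores (≈0.38–0.40 vs 0.20).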