In [1]:
!python -m pip install langchain-community langchainhub langchain-chroma langchain langchain-experimental --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.schema import Document

embedding_model_name = "models/gemini-embedding-001"
model_name = "gemini-2.0-flash"


embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)


In [3]:
import pandas as pd
df = pd.read_csv('datasets/assignment2dataset.csv')
df.head()

Unnamed: 0,course_id,title,description
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...


In [4]:
vector_db_path = "VectorDB_Chroma"
os.makedirs(vector_db_path,exist_ok=True)

In [5]:
documents = []
for _, row in df.iterrows():
    content = f"{row['title']}. {row['description']}"
    documents.append(
        Document(
            page_content=content,
            metadata={
                "course_id": row["course_id"],
                "title": row["title"],
                "description": row["description"]
            }
        )
    )


In [6]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=vector_db_path,
    collection_name="courses",
    collection_metadata={"use_type": "TRAINING AND EXPERIMENTATION"}
)


In [7]:
vectorstore = Chroma(
    persist_directory=vector_db_path,
    embedding_function=embeddings,
    collection_name="courses"
)

In [21]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})


In [20]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

message = """
Online learning platforms host thousands of courses across domains—learners often feel overwhelmed by choices. You are a personalized recommender that understands both course 
content and individual learner profiles can boost engagement and completion rates by suggesting the most relevant next steps. Return the top-5 most relevant courses 
from a catalog of course offerings.
{question}

Context:
{context}
"""

prompt = PromptTemplate.from_template(message)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nOnline learning platforms host thousands of courses across domains—learners often feel overwhelmed by choices. You are a personalized recommender that understands both course \ncontent and individual learner profiles can boost engagement and completion rates by suggesting the most relevant next steps. Return the top-5 most relevant courses \nfrom a catalog of course offerings.\n{question}\n\nContext:\n{context}\n')

In [22]:
from langchain.chat_models import init_chat_model
llm = init_chat_model(model_name, model_provider="google_genai")

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

In [28]:
response = rag_chain.invoke("I know python, sugget me something in SQL")

print(response.content)

Given your Python background and interest in SQL, here are the top 5 most relevant courses, focusing on leveraging SQL for data-related tasks:

1.  **SQL for Data Analysis (C012):** This is the most direct recommendation.  Since you know Python, learning SQL will significantly expand your data analysis capabilities. This course covers essential SQL concepts like SELECT statements, JOINs, subqueries, window functions, and aggregate functions, all crucial for data manipulation and generating insights. The focus on PostgreSQL or MySQL provides practical experience.

2.  **Big Data Analytics with Spark (C011):**  This is relevant because it includes "Spark SQL". Knowing SQL and Python, learning Spark SQL is a logical next step for working with large datasets.  The course also introduces PySpark, allowing you to leverage your Python skills within the Spark ecosystem.

While the prompt only included two SQL-related courses, to provide 5 suggestions I will add three more that build upon SQL a

In [25]:
from typing import List, Tuple

def recommend_courses(profile: str, completed_ids: List[str], top_k: int = 5) -> List[Tuple[str, float]]:
    """
    Returns a list of (course_id, similarity_score) for the top-k recommendations.
    """
    # Build query text
    completed_titles = df[df["course_id"].isin(completed_ids)]["title"].tolist()
    query_text = ""
    if completed_titles:
        query_text += "Completed: " + "; ".join(completed_titles) + " "
    query_text += "Interests: " + profile

    # Query vectorstore
    results = vectorstore.similarity_search_with_score(query_text, k=top_k + len(completed_ids))

    recs = []
    for doc, score in results:
        cid = doc.metadata["course_id"]
        if cid in completed_ids:
            continue
        recs.append((cid, score))
        if len(recs) >= top_k:
            break
    return recs


In [None]:
test_profiles = [
    ("I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?", []),  
    ("I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.", []),
    ("My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.", []),
    ("I want to learn to build and deploy microservices with Kubernetes—what courses fit best?", []),
    ("I’m interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?", [])
]

In [27]:
for i, (profile, completed) in enumerate(test_profiles, 1):
    recs = recommend_courses(profile, completed, top_k=5)
    print(f"\n=== Test Profile {i} ===")
    print("Query:", profile)
    print("Completed IDs:", completed)
    print("Top Recommendations:")
    for cid, score in recs:
        row = df[df["course_id"] == cid].iloc[0]
        print(f"  - {row['title']} (score={score:.4f})")



=== Test Profile 1 ===
Query: I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?
Completed IDs: []
Top Recommendations:
  - Python Programming for Data Science (score=0.2442)
  - Python Programming for Data Science (score=0.2442)
  - Data Visualization with Tableau (score=0.2974)
  - Data Visualization with Tableau (score=0.2974)
  - R Programming and Statistical Analysis (score=0.3596)

=== Test Profile 2 ===
Query: I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.
Completed IDs: []
Top Recommendations:
  - Cloud Computing with Azure (score=0.2952)
  - Cloud Computing with Azure (score=0.2952)
  - Containerization with Docker and Kubernetes (score=0.3074)
  - Containerization with Docker and Kubernetes (score=0.3074)
  - DevOps Practices and CI/CD (score=0.3251)

=== Test Profile 3 ===
Query: My background is in ML fundamentals; I’d like to specialize in neural networks a