In [18]:
import json
from sentence_transformers import SentenceTransformer, util
import numpy as np
from collections import defaultdict

In [4]:
# Path to the JSON file holding the course catalogue (relative to the notebook's working directory).
DATA_FILE = 'data_dummy.json'

# Decode explicitly as UTF-8: without `encoding=` Python falls back to the
# platform's locale encoding, which can mis-read non-ASCII text on some systems.
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    # NOTE(review): presumably a list of course dicts — confirm against the data file.
    courses_data = json.load(f)

In [3]:
# Identifier of the pretrained sentence-embedding checkpoint handed to SentenceTransformer.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Shared encoder instance used by the cells below to embed course texts and queries.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [6]:
# Position -> course record, so a row index in the embedding matrix can be
# mapped back to the course it was computed from.
course_map = dict(enumerate(courses_data))

# One searchable string per course: code, title and description separated by spaces.
course_texts = [
    f"{entry['course_code']} {entry['course_title']} {entry['course_description']}"
    for entry in course_map.values()
]

# Encode every course text in one batch; row i corresponds to course_map[i].
course_embeddings = model.encode(course_texts, convert_to_tensor=True, show_progress_bar=True)
    

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# Console ordering of resource types; types not listed here are printed afterwards.
_RESOURCE_DISPLAY_ORDER = ('youtube_video', 'pdf', 'past_question', 'article', 'other')


def _print_grouped_resources(grouped_resources):
    """Print resources grouped by type: known types in display order, then any others."""
    known_types = [t for t in _RESOURCE_DISPLAY_ORDER if t in grouped_resources]
    extra_types = [t for t in grouped_resources if t not in _RESOURCE_DISPLAY_ORDER]

    for res_type in known_types + extra_types:
        print(f"\n[{res_type.replace('_', ' ').title()}]:")
        # Alphabetical by title so repeated runs print in a stable order.
        for res in sorted(grouped_resources[res_type], key=lambda r: r['title']):
            print(f"  - {res['title']} (Link: {res['link']})")


def find_resources(query, top_k=3, similarity_threshold=0.3):
    """
    Find the courses most similar to a query and collect their resources by type.

    Relies on the module-level `model`, `course_embeddings` and `course_map`
    built in earlier cells. Also prints a human-readable summary to stdout.

    Args:
        query (str): The user's search query (e.g., "MTH101", "Introduction to Algebra").
        top_k (int): Maximum number of best-matching courses to consider.
            Values <= 0 select no courses.
        similarity_threshold (float): Minimum cosine similarity a course must
            reach to be treated as relevant.

    Returns:
        dict: Maps each resource type to the list of resource dicts gathered
        from the relevant courses. Empty when no course reaches the threshold
        or the relevant courses carry no resources. Always a plain dict, so
        looking up a missing type never inserts a key.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)

    cosine_scores = util.pytorch_cos_sim(query_embedding, course_embeddings)[0]
    # Convert once to NumPy so ranking and score lookup use the same array.
    scores = cosine_scores.cpu().numpy()
    # Clamp top_k: a negative slice bound would otherwise mean "all but the last k".
    top_results_indices = np.argsort(-scores)[:max(top_k, 0)]

    grouped_resources = {}
    found_relevant_course = False

    print(f"\nSearching for: '{query}'")
    print("-" * 30)

    # Candidates are ordered best-first, so the first one that fails the
    # threshold means every later one fails as well.
    for idx in top_results_indices:
        if not (float(scores[idx]) >= similarity_threshold):
            break
        found_relevant_course = True
        course_info = course_map[int(idx)]
        # Tolerate courses without a resource list and resources without a type.
        for res in course_info.get('resources') or []:
            grouped_resources.setdefault(res.get('type', 'other'), []).append(res)

    # --- Print Section for Console Demo ---
    if not found_relevant_course:
        print("  No highly relevant courses found for this query.")
        print("  Consider adjusting the query or similarity_threshold.")
        return {}

    if not grouped_resources:
        print("  No resources found for the relevant courses.")
    else:
        _print_grouped_resources(grouped_resources)

    # The grouped dictionary is the value intended for API integration.
    return grouped_resources


In [22]:
if __name__ == "__main__":
    # Demo queries in several formats: a course code, a course title and free-form topics.
    queries = [
        "MTH101",
        "Introduction to programming",
        "chemistry basics",
        "algebra concepts",
        "human body systems",
        "laws of thermodynamics",
        # Expected to match poorly (few or no results), illustrating the limits of the search.
        "SIWES application guide",
    ]

    # Visual divider printed after each query's results.
    separator = "\n" + "=" * 50 + "\n"

    for q in queries:
        # top_k caps how many courses contribute resources; adjust as desired.
        found_resources = find_resources(q, top_k=2)
        print(separator)



Searching for: 'MTH101'
------------------------------

[Youtube Video]:
  - Algebra for Beginners: Full Course (Link: https://www.youtube.com/watch?v=f_T3x-bL8rE)

[Pdf]:
  - Trigonometry Basics Notes (Link: https://www.mathsisfun.com/algebra/images/trig-summary.pdf)

[Past Question]:
  - MTH101 First Semester Exam Questions (Link: https://example.com/mth101_exam_questions.pdf)

[Article]:
  - Understanding Set Theory (Link: https://www.britannica.com/science/set-theory)



Searching for: 'Introduction to programming'
------------------------------

[Youtube Video]:
  - Algebra for Beginners: Full Course (Link: https://www.youtube.com/watch?v=f_T3x-bL8rE)
  - What is Computer Science? (Link: https://www.youtube.com/watch?v=szG2XfT0D6U)

[Pdf]:
  - Introduction to Algorithms PDF (Link: https://example.com/algorithms_intro.pdf)
  - Trigonometry Basics Notes (Link: https://www.mathsisfun.com/algebra/images/trig-summary.pdf)

[Past Question]:
  - CSC101 Fundamentals Questions (Link: http