In [1]:
from langchain_chroma import Chroma
import json 
from langchain_huggingface import HuggingFaceEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
embedding_function = HuggingFaceEmbeddings(model="intfloat/e5-large-v2")

In [3]:
vectorstore = Chroma(persist_directory="updated_filter_db/", embedding_function=embedding_function)

In [4]:
vectorstore.get()

{'ids': ['4610af04-d766-4824-9c05-5f898543415b',
  '2f847d00-14f1-4208-b086-fb68b7d62f10',
  'e4a05c06-5008-4318-b110-8c2992a8ca42',
  '5e22b63b-1187-46ff-8a6f-727d5c5c864e',
  '180fa764-3981-4335-ad97-8843b5a1ab39',
  '58b4158e-fc04-4119-a2d8-184569a04475',
  'e8be1a00-d817-443d-aa7d-2472ed453691',
  'c9cc45ce-d557-4e76-8ca7-5c209fc96620',
  '1712073d-a338-453c-a8e0-9efc24beb950',
  'da9c046a-4b26-48ee-98b0-0baf4d0118c3',
  'f082b94b-6b78-4394-892e-f4226f5eb53c',
  'db67fab5-0c3e-49a9-abba-119080e32bed',
  '168f2e4c-9f4c-4bda-9037-23d10a9f9bd5',
  '3f0986ad-5916-4f28-b398-94cef6b9a432',
  'bcb8bfb2-a6da-4837-aad2-79f6ff63be38',
  '86a449cc-c311-4064-8cd6-aa04b9aed67a',
  '234f27cd-5970-454f-b87a-b6f4b1a845cb',
  'e6b291a5-6f7a-4cb4-9e72-cc3ee2635903',
  'df595050-beae-42fe-8f31-f23ab99fb3c3',
  'eb78f90e-8e90-4bcc-be00-87e2e68d18f1',
  '20dac205-ef38-47cc-b782-acb9ef68c3a2',
  '3ba086f1-5b3d-4661-a54c-dcbde9ef5b21',
  '94ca2d90-5400-4ea5-8422-735bbea8bbb6',
  'd4246a60-79a9-43e2-a0f2-

In [3]:
original_school_ids = [
    "school_90",
    "school_115", 
    "school_31",
    "school_20",
    "school_143",
    "school_62",
    "school_9",
    "school_26",
    "school_118"
]

original_program_ids = [
    "program_138",
    "program_324",
    "program_1172", 
    "program_142",
    "program_310",
    "program_42",
    "program_119",
    "program_784",
    "program_1141",
    "program_505"
]

In [28]:
search_kwargs = {
    "k": 10,  
    "fetch_k": 40,  
    "lambda_mult": 0.5,
    "filter": {  # ✅ Use "filter" as top-level key
        "$and": [
            {"program_type": {"$eq": "Business & Management"}},  # ✅ Remove nested "filter"
            {"school_id": {"$nin": original_school_ids}}, 
            {"program_id": {"$nin": original_program_ids}}
        ]
    },
    "where_document": {  # ✅ Separate document filters
        "$or": [
            {"$regex": "(?i)program language.*english"},
            {"$regex": "(?i)program language.*french"},
            {"$and": [
                {"$regex": "(?i)Teaching Language.*english"}, 
                {"$regex": "(?i)Teaching Language.*french"}
            ]}
        ]
    }
}

In [7]:
retriever = vectorstore.as_retriever(search_type = "mmr")

In [10]:
content = retriever.invoke("skema")
for doc in content:
    print(doc)

page_content='
        # Mastère Spécialisés® (MS) Manager en Gestion de Patrimoine Financier 

    ## Program Overview
    **Program Name:** Mastère Spécialisés® (MS) Manager en Gestion de Patrimoine Financier 
    **School:** SKEMA Business School
    **Field of Study:** Business & Management

    ## Location Details
    **Country:** FRANCE
    **City:**: Paris

    ## Program Information
    **program Language:**: French
    **Entry Level:**: 5
    **Price:** 18000.0
    **Intake:**: Fall


    ' metadata={'school_id': 'school_9', 'country': 'FRANCE', 'program_id': 'program_887', 'fee': 18000.0, 'duration': 1.0, 'program_type': 'Business & Management'}
page_content='
        # MSc Sport, Event & Hospitality Management

    ## Program Overview
    **Program Name:** MSc Sport, Event & Hospitality Management
    **School:** SKEMA Business School
    **Field of Study:** Business & Management

    ## Location Details
    **Country:** FRANCE
    **City:**: Sophia-Antipolis

    ## Program

In [30]:
for doc in retriever.invoke("business"):
    print(doc.metadata)

{'program_type': 'Business & Management', 'duration': 2.0, 'program_id': 'program_770', 'country': 'FRANCE', 'school_id': 'school_15', 'fee': 13400.0}
{'country': 'FRANCE', 'school_id': 'school_23', 'fee': 11100.0, 'program_type': 'Business & Management', 'duration': 2.0, 'program_id': 'program_749'}
{'fee': 10133.0, 'school_id': 'school_119', 'duration': 3.0, 'country': 'FRANCE', 'program_type': 'Business & Management', 'program_id': 'program_1350'}
{'fee': 8500.0, 'country': 'FRANCE', 'duration': 3.0, 'program_id': 'program_1028', 'program_type': 'Business & Management', 'school_id': 'school_74'}
{'country': 'FRANCE', 'program_type': 'Business & Management', 'fee': 6500.0, 'program_id': 'program_385', 'duration': 2.0, 'school_id': 'school_134'}
{'school_id': 'school_7', 'fee': 12000.0, 'country': 'FRANCE', 'program_type': 'Business & Management', 'duration': 2.0, 'program_id': 'program_564'}
{'program_id': 'program_647', 'duration': 2.0, 'country': 'FRANCE', 'program_type': 'Business

In [None]:
"$or" : [{"where_document": "$regex": "(?i)Teaching Language.*english"},{"where_document": "$regex": "(?i)Teaching Language.*english"}, "$and" : [{"where_document": "$regex": "(?i)Teaching Language.*english"},{"where_document": "$regex": "(?i)Teaching Language.*english"}]]

In [6]:
from general_search import search

In [None]:
# Keep original IDs separate from returned IDs


# Fixed filter statements (remove nested "filter" key)
filter_statements = [
    {
        "$or": [
            {"where_document": {"$regex": "(?i)program Language.*english"}},
            {"where_document": {"$regex": "(?i)program Language.*french"}},
            {"$and": [
                {"where_document": {"$regex": "(?i)program Language.*english"}}, 
                {"where_document": {"$regex": "(?i)program Language.*french"}}
            ]}
        ]
    },
    {"program_type": {"$eq": "Business & Management"}},
{    "$and": [ {"fee": {"$gte": 8000}}, 
    {"fee": {"$lte": 10000}} ]}

]

# Debug: Check what your search is actually returning
print("=== FIRST SEARCH ===")
print(f"Original school IDs to exclude: {original_school_ids}")
print(f"Original program IDs to exclude: {original_program_ids}")

returns1, returned_school_ids1, returned_program_ids1, content1 = search(
    "business", 
    "all", 
    original_school_ids,
    original_program_ids, 
    True, 
    False, 
    filter_statements
)

print(f"Returned school IDs from search 1: {returned_school_ids1}")
print(f"Returned program IDs from search 1: {returned_program_ids1}")
print(f"Number of documents returned: {len(returns1)}")

# Check if any excluded IDs appear in results
overlap_schools = set(original_school_ids) & set(returned_school_ids1)
overlap_programs = set(original_program_ids) & set(returned_program_ids1)

print(f"⚠️  School ID overlap (should be empty): {overlap_schools}")
print(f"⚠️  Program ID overlap (should be empty): {overlap_programs}")

print("\n=== SECOND SEARCH ===")
all_school_ids_to_exclude = original_school_ids + returned_school_ids1
all_program_ids_to_exclude = original_program_ids + returned_program_ids1

print(f"All school IDs to exclude: {all_school_ids_to_exclude}")
print(f"All program IDs to exclude: {all_program_ids_to_exclude}")

returns2, returned_school_ids2, returned_program_ids2, content2 = search(
    "business", 
    "all", 
    all_school_ids_to_exclude,
    all_program_ids_to_exclude, 
    True, 
    False, 
    filter_statements
)

print(f"Returned school IDs from search 2: {returned_school_ids2}")
print(f"Returned program IDs from search 2: {returned_program_ids2}")

# Check if search 2 returns same IDs as search 1
same_school_ids = set(returned_school_ids1) & set(returned_school_ids2)
same_program_ids = set(returned_program_ids1) & set(returned_program_ids2)

print(f"🔥 Same school IDs in both searches: {same_school_ids}")
print(f"🔥 Same program IDs in both searches: {same_program_ids}")
for doc in content2:
    print(doc)

=== FIRST SEARCH ===
Original school IDs to exclude: ['school_90', 'school_115', 'school_31', 'school_20', 'school_143', 'school_62', 'school_9', 'school_26', 'school_118']
Original program IDs to exclude: ['program_138', 'program_324', 'program_1172', 'program_142', 'program_310', 'program_42', 'program_119', 'program_784', 'program_1141', 'program_505']
Search kwargs: {'k': 10, 'fetch_k': 40, 'lambda_mult': 0.5, 'filter': {'$and': [{'program_type': {'$eq': 'Business & Management'}}, {'$and': [{'fee': {'$gte': 8000}}, {'fee': {'$lte': 10000}}]}, {'school_id': {'$nin': ['school_90', 'school_115', 'school_31', 'school_20', 'school_143', 'school_62', 'school_9', 'school_26', 'school_118']}}, {'program_id': {'$nin': ['program_138', 'program_324', 'program_1172', 'program_142', 'program_310', 'program_42', 'program_119', 'program_784', 'program_1141', 'program_505']}}]}, 'where_document': {'$or': [{'$regex': '(?i)program Language.*english'}, {'$regex': '(?i)program Language.*french'}, {'$a

In [7]:
new_school_ids

NameError: name 'new_school_ids' is not defined

In [1]:
from general_search import search

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# User selects English + French languages and Business program type
filter_statements = []

search("kedge", "all", ['35'],
 ['302',
  '303',
  '285',
  '284',
  '283',
  '1240',
  '286',
  '291',
  '1241',
  '287',
  '298',
  '1239',
  '290'], True, False, filter_statements)

Original query: kedge
Rewritten query: KEDGE Business School
=== FINAL SEARCH KWARGS ===
{
  "k": 15,
  "fetch_k": 40,
  "lambda_mult": 0.4,
  "filter": {
    "$and": [
      {
        "school_id": {
          "$in": [
            "35"
          ]
        }
      },
      {
        "program_id": {
          "$nin": [
            "302",
            "303",
            "285",
            "284",
            "283",
            "1240",
            "286",
            "291",
            "1241",
            "287",
            "298",
            "1239",
            "290"
          ]
        }
      }
    ]
  }
}
Test search (no filters) returned: 5 docs
Raw retriever returned 14 documents
First doc metadata: {'school_id': '35', 'fee': 17500.0, 'country': 'FRANCE', 'program_type': 'Business & Management', 'rank': 11.0, 'program_id': '305', 'duration': 1.0}
First doc content preview: 
# Mastère Spécialisé® Management par la qualité

## Program Overview
**Program Name:** Mastère Spécialisé® Managem

([{'school_id': 35,
   'school_name': 'KEDGE Business School',
   'rank': 11.0,
   'school_logo': 'https%3A%2F%2Ffirebasestorage.googleapis.com%2Fv0%2Fb%2Fmain-byte-304814.appspot.com%2Fo%2FKEDGE%2520Business%2520SchoolKEDGE-Business-School.jpg%3Falt%3Dmedia%26token%3D4e668eea-e9c0-4dc2-8f64-0c469921b84f',
   'country_code': 'FRANCE'},
  {'school_logo': 'https%3A%2F%2Ffirebasestorage.googleapis.com%2Fv0%2Fb%2Fmain-byte-304814.appspot.com%2Fo%2FKEDGE%2520Business%2520SchoolKEDGE-Business-School.jpg%3Falt%3Dmedia%26token%3D4e668eea-e9c0-4dc2-8f64-0c469921b84f',
   'country_code': 'FRANCE',
   'school_type': 'Business & Management',
   'program_name': 'Mastère Spécialisé® Management par la qualité',
   'school_id': 35,
   'program_id': 305,
   'price': 17500.0},
  {'school_logo': 'https%3A%2F%2Ffirebasestorage.googleapis.com%2Fv0%2Fb%2Fmain-byte-304814.appspot.com%2Fo%2FKEDGE%2520Business%2520SchoolKEDGE-Business-School.jpg%3Falt%3Dmedia%26token%3D4e668eea-e9c0-4dc2-8f64-0c469921b84f',
  

In [6]:
for doc in docs:
    print(doc.page_content)


    # Bachelor of Business Administration

    ## Program Overview
    **Program Name:** Bachelor of Business Administration
    **School:** International University of Monaco - OMNES Group
    **Field of Study:** Business & Management

    ## Location Details
    **Country:** FRANCE
    **City:**: Monaco

    ## Program Information
    **program Language:**: English
    **Entry Level:**: 1
**Entry Level:**: 0
**Entry Level:**: 2
    **Price:** 15803.0
    **Intake:**: Spring
**Intake:**: Fall
    **specialization:** ['Communication and Event Management', 'Luxury Marketing, Sales and Services', 'Sports Business Management', 'International Finance', 'Global Business Management', 'Management of Luxury Tourism and Hospitality']

    ---
    

        # Bachelor of Science in Data Science for Responsible Business

    ## Program Overview
    **Program Name:** Bachelor of Science in Data Science for Responsible Business
    **School:** emlyon Business School
    **Field of Study:** Busines

In [None]:
import json 
with open("school_parent_json.json", 'r') as f:
    data = json.load(f)

In [13]:
data

{'school_1': {'school_id': 55004,
  'school_name': 'ESLSCA Business School - Planeta Group',
  'rank': None,
  'school_logo': 'https%3A%2F%2Ffirebasestorage.googleapis.com%2Fv0%2Fb%2Fmain-byte-304814.appspot.com%2Fo%2FESLSCAESLSCA.jpg%3Falt%3Dmedia%26token%3D600c77b5-f9f1-48b9-85c5-72230778bb70',
  'country_code': 'FRANCE'},
 'school_2': {'school_id': 17,
  'school_name': 'École Supérieure de Tourisme (Yschools)',
  'rank': None,
  'school_logo': 'https%3A%2F%2Ffirebasestorage.googleapis.com%2Fv0%2Fb%2Fmain-byte-304814.appspot.com%2Fo%2FYschools%2520%253A%2520%25C3%2589cole%2520Sup%25C3%25A9rieure%2520de%2520Tourisme%2520%25C3%2589cole-Sup%25C3%25A9rieure-de-Tourisme.jpg%3Falt%3Dmedia%26token%3D0652c57d-a28e-4e08-89a0-0e88c540b5de',
  'country_code': 'FRANCE'},
 'school_3': {'school_id': 41,
  'school_name': 'Toulouse Business School',
  'rank': 11.5,
  'school_logo': 'https%3A%2F%2Ffirebasestorage.googleapis.com%2Fv0%2Fb%2Fmain-byte-304814.appspot.com%2Fo%2FToulouse%2520Business%2520S

In [14]:
list_of_school_names = []
for k, v in data.items():
    list_of_school_names.append(v['school_name'])


In [16]:
list_of_school_names

['ESLSCA Business School - Planeta Group',
 'École Supérieure de Tourisme (Yschools)',
 'Toulouse Business School',
 'IGEFI - FIGS Group',
 'KEDGE Business School',
 'Rennes School of Business',
 'ICN Business School',
 'EFAP - EDH Group',
 'SKEMA Business School',
 'Paris School Of Technology & Business - GALILEO Group',
 'Cours Florent - GALILEO Group',
 'EDC Business School',
 'Ecole Bleue - EDH Group',
 'WIS - FIGS Group',
 'Montpellier Business School',
 'IIM Digital School - Pôle Léonard de Vinci',
 'École Supérieure de Design (Yschools)',
 'South Champagne Business School (Yschools)',
 'Atelier De Sèvres - GALILEO Group',
 'European Business School Paris - GA Education Group',
 'Clermont School of Business',
 'Sup De Com - FIGS Group',
 'The American Business School - IGENSIA Group',
 'ESG Immobilier (GALILEO Group)',
 'ESARC Evolution - GALILEO Group',
 'Ecole 89 - GA Education Group',
 'Institut National De Gemmologie - AD Education Group',
 'Albert School',
 'CY Tech',
 'Sup 