# This notebook is where I am testing the actual search; it is not what will be used in production.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy corpus standing in for crawled website pages (one record per page).
data = [
    {"url": url, "title": title, "content": content, "filters": []}
    for url, title, content in (
        ("site.com/page1", "Cooking class", "Cooking is a basic skill all new in life..."),
        ("site.com/page2", "Machine Learning", "Machine learning is a subset of AI..."),
        ("site.com/page3", "Deep Learning", "Deep learning uses neural networks..."),
    )
]
# Tabular view of the corpus: one row per page, columns taken from the record keys.
df = pd.DataFrame(data)
# Fit a TF-IDF model (English stop words dropped) on the page bodies and keep
# the resulting document-term matrix for the similarity lookups.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df.loc[:, "content"])

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query):
    """
    Score every indexed page against a query.

    The query is projected into the fitted TF-IDF space and compared with each
    row of ``tfidf_matrix`` by cosine similarity.

    :param query: Search query string
    :return: 1-D array with one similarity score per row of ``tfidf_matrix``
    """
    encoded_query = vectorizer.transform([query])
    score_matrix = cosine_similarity(encoded_query, tfidf_matrix)
    return score_matrix.flatten()


In [3]:
# Simulated click feedback: (query, page url, clicked) where clicked is 1 if the
# user opened the result and 0 if it was shown but ignored.
# TODO: replace with a function that handles real feedback and converts it to this shape.
user_feedback = [
    {"query": query, "url": url, "clicked": clicked}
    for query, url, clicked in (
        ("cake", "site.com/page1", 1),
        ("Machine Learning", "site.com/page2", 1),
        ("Deep Learning", "site.com/page3", 1),
        ("Deep Learning", "site.com/page1", 0),
        ("Machine Learning", "site.com/page1", 0),
        ("food", "site.com/page1", 1),
        ("pizza", "site.com/page1", 1),
        ("pizza", "site.com/page2", 1),
        ("Machine Learning", "site.com/page1", 1),
    )
]
feedback_df = pd.DataFrame(user_feedback)


In [4]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# Ranking model shared by learn() and improved_search(). The seed is fixed so
# that a Restart & Run All reproduces the same forest and the same scores.
model = RandomForestRegressor(random_state=42)

def learn(feedback_df):
    """
    Fit the module-level ranking ``model`` on click feedback.

    Every usable feedback row becomes one training sample: its single feature
    is the similarity that ``search`` reports between the row's query and the
    page the row refers to, and its label is the row's ``clicked`` value.

    Rows whose URL is not present in ``df`` are skipped (with a warning)
    instead of aborting the whole training run.

    :param feedback_df: DataFrame with ``query``, ``url`` and ``clicked`` columns
    :raises ValueError: if no feedback row refers to a page present in ``df``
    """
    import warnings

    # search() returns scores by row *position*, so map URLs to positions
    # rather than index labels (the two differ once df is filtered or
    # re-indexed). setdefault keeps the first occurrence of a duplicated URL.
    position_of_url = {}
    for position, url in enumerate(df["url"]):
        position_of_url.setdefault(url, position)

    if feedback_df.empty:
        feedback_rows = ()
    else:
        feedback_rows = zip(
            feedback_df["query"], feedback_df["url"], feedback_df["clicked"]
        )

    scores_by_query = {}  # each distinct query is vectorized and scored once
    features = []
    labels = []
    unknown_urls = []
    for query, url, clicked in feedback_rows:
        position = position_of_url.get(url)
        if position is None:
            unknown_urls.append(url)
            continue
        if query not in scores_by_query:
            scores_by_query[query] = search(query)
        features.append([scores_by_query[query][position]])
        labels.append(clicked)  # 1 if clicked, 0 if ignored

    if unknown_urls:
        warnings.warn(
            f"learn(): skipped {len(unknown_urls)} feedback row(s) for pages "
            f"that are not indexed: {sorted(set(map(str, unknown_urls)))}",
            stacklevel=2,
        )
    if not features:
        raise ValueError("learn() needs at least one feedback row for an indexed page")

    # Train the ranking model (shape: n_samples x 1 feature).
    model.fit(np.array(features), np.array(labels))

# Fit the module-level model on the simulated feedback; improved_search() calls model.predict, so this must run first.
learn(feedback_df)

In [5]:
def improved_search(query, filters=None):
    """
    Perform a search with optional filters and re-rank it with the learned model.

    Pages are scored with ``search``; the similarity of every page that passes
    the filter is then fed to ``model.predict`` in a single batched call, and
    the predictions are used as ranking scores.

    :param query: Search query string
    :param filters: List of filter tags (optional). A page is kept when it
        carries at least one of the tags. ``None`` or an empty list disables
        filtering. A single tag given as a string is treated as a one-item list.
    :return: List of ``(url, title, rank_score)`` tuples, highest score first
    """
    if isinstance(filters, str):
        # A bare string would otherwise be iterated character by character.
        filters = [filters]
    wanted_tags = list(filters) if filters else []

    similarities = search(query)
    positions = range(len(similarities))

    if wanted_tags:
        # Keep only pages that share at least one tag with the request.
        tags_column = df["filters"]
        positions = [
            pos
            for pos in positions
            if any(tag in tags_column.iloc[pos] for tag in wanted_tags)
        ]

    if len(positions) == 0:
        # Nothing indexed or everything filtered out; predict() rejects empty input.
        return []

    # One predict() call for all surviving pages instead of one call per page.
    features = np.asarray(
        [similarities[pos] for pos in positions], dtype=float
    ).reshape(-1, 1)
    rank_scores = model.predict(features)

    url_column = df["url"]
    title_column = df["title"]
    results = [
        (url_column.iloc[pos], title_column.iloc[pos], score)
        for pos, score in zip(positions, rank_scores)
    ]

    # Sort by predicted relevance; sorted() is stable, so ties keep page order.
    return sorted(results, key=lambda item: item[2], reverse=True)

# Try a search with and without the filters argument
print(improved_search("AI"))  # Without filters
print(improved_search("how to make AI", filters=[]))  # filters=[] is empty (falsy), so no filtering is applied here either

[('site.com/page2', 'Machine Learning', np.float64(0.9755555555555556)), ('site.com/page1', 'Cooking class', np.float64(0.703269841269841)), ('site.com/page3', 'Deep Learning', np.float64(0.703269841269841))]
[('site.com/page2', 'Machine Learning', np.float64(0.9755555555555556)), ('site.com/page1', 'Cooking class', np.float64(0.703269841269841)), ('site.com/page3', 'Deep Learning', np.float64(0.703269841269841))]
