In [5]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [6]:
def load_preprocessed_data(input_file):
    """
    Read a JSON file from disk and return its parsed contents.

    Args:
    - input_file (str): Path to the JSON file to read. The file is opened
      in text mode and decoded as UTF-8.

    Returns:
    - The object produced by parsing the file. Presumably a list of dicts
      (one per preprocessed document), but this function does not check
      the shape; it returns whatever the JSON contains.
    """
    with open(input_file, mode="r", encoding="utf-8") as source:
        documents = json.load(source)
    return documents

In [7]:
def preprocess_query(query):
    """
    Normalize a raw search query into lowercase text without punctuation or digits.

    Args:
    - query (str): The search query input by the user.

    Returns:
    - str: The query lowercased, with every character that is neither a
      word character nor whitespace removed, and then every run of digits
      removed. Whitespace is left untouched, so removed tokens can leave
      consecutive spaces behind; underscores count as word characters and
      are kept.
    """
    # NOTE(review): intended to mirror the document preprocessing, which is
    # not visible here - confirm both pipelines stay in sync.
    # Order matters: punctuation is stripped before digits.
    removal_patterns = (r'[^\w\s]', r'\d+')

    cleaned = query.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned