**DATABASE SETUP**

In [5]:
from pymongo import MongoClient

mongo_connection = 'mongodb://localhost:27017/'
mongo_database = 'deeplecture'
mongo_collection = 'corrected_all_text_data'

client = MongoClient(mongo_connection)
db = client[mongo_database]
collection_complete = db[mongo_collection]


**DATABASE FILTERS**

In [6]:
def filter_by_metadata(initial_query: dict, metadata: dict):
    """
    Filters documents by metadata on top of an existing query.
    
    :param initial_query: Initial query dictionary
    :param metadata: Dictionary containing metadata to filter on
    :return: List of documents that match the combined query
    """
    combined_query = initial_query.copy()
    for key, value in metadata.items():
        combined_query[f"metadata.{key}"] = value
    results = collection_complete.find(combined_query)
    return list(results)

def filter_by_terms(query: dict, terms: list):
    """
    Filters documents by searching for terms in the text on top of an existing query.
    
    :param query: Initial query dictionary
    :param terms: List of terms to search for in the text of documents
    :return: List of documents that contain any of the terms
    """
    terms_query = {'$or': [{'text': {'$regex': term, '$options': 'i'}} for term in terms]}
    combined_query = {**query, **{'$and': [terms_query]}}
    results = collection_complete.find(combined_query)
    return list(results)

def filter_by_terms_together(query: dict, terms: list):
    """
    Filters documents by searching for documents where all specified terms appear together in the text on top of an existing query.
    
    :param query: Initial query dictionary
    :param terms: List of terms that should appear together in the text of documents
    :return: List of documents that contain all of the terms together
    """
    if not terms or len(terms) < 2:
        raise ValueError("Please provide at least two terms to search for.")

    combined_regex = "(?=.*" + ")(?=.*".join(terms) + ")"
    combined_query = {**query, **{'text': {'$regex': combined_regex, '$options': 'i'}}}
    results = collection_complete.find(combined_query)
    return list(results)


**DATABASE FILTERS TEST**

In [None]:
initial_query = {}

# Filter by metadata
metadata_filter = {'autor': 'varios', 'anyo': '1778'}
documents_by_metadata = filter_by_metadata(initial_query, metadata_filter)
print(f"Number of documents filtered by metadata: {len(documents_by_metadata)}")

# Filter by terms
terms_filter = ['agricultura', 'economia']
documents_by_terms = filter_by_terms(initial_query, terms_filter)
print(f"Number of documents filtered by metadata and then by terms: {len(documents_by_terms)}")

# Filter by terms together
terms_together_filter = ['naturaleza', 'dios']
documents_by_terms_together = filter_by_terms_together({'_id': {'$in': [doc['_id'] for doc in documents_by_terms]}}, terms_together_filter)
print(f"Number of documents filtered by metadata, then by terms, and then terms together: {len(documents_by_terms_together)}")

Number of documents filtered by metadata: 3
Number of documents filtered by metadata and then by terms: 7709
