In [1]:
from whoosh.analysis import StandardAnalyzer
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import MultifieldParser, FuzzyTermPlugin
import os

# Liste de stop words en anglais
stop_words = set([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", 
    "aren't", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", 
    "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "does", "doesn't", 
    "don't", "down", "during", "each", "few", "for", "from", "had", "hadn't", "has", 
    "hasn't", "have", "haven't", "having", "here", "here's", "here's", "how", "how's", "how's", 
    "i", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "itself", "let", "more", 
    "most", "my", "myself", "of", "off", "on", "once", "only", "or", "other", "ought", "our", 
    "ours", "ourselves", "out", "over", "own", "same", "so", "than", "that", "that's", "that've", 
    "the", "theirs", "theirs", "them", "themselves", "then", "there", "there's", "there's", "therefore", 
    "these", "they", "they're", "they've", "this", "those", "through", "to", "under", "until", "up", 
    "very", "was", "wasn't", "were", "weren't", "what", "what's", "what's", "what's", "what's", "what's", 
    "when", "when's", "where", "where's", "which", "which's", "while", "who", "who's", "who's", "why", 
    "why's", "with", "won't", "would", "wouldn't"
])

In [None]:
schema = Schema(
    title=TEXT(stored=True),
    author=TEXT(stored=True),
    publication_date=ID(stored=True)
)

# Étape 2 : Créer un index
if not os.path.exists("book_index2"):
    os.mkdir("book_index2")
index = create_in("book_index2", schema)

# Ajouter des documents (livres) à l'index
writer = index.writer()
books2 = [
    {"title": "Pride and Prejudice", "author": "Jane Austen", "publication_date": "1813"},
    {"title": "Pride and Prejudice", "author": "Ma grand-mère", "publication_date": "1813 av J.-C."},
    {"title": "To Kill a Mockingbird", "author": "Harper Lee", "publication_date": "1960"},
    {"title": "1984", "author": "George Orwell", "publication_date": "1949"},
    {"title": "The Great Gatsby", "author": "F. Scott Fitzgerald", "publication_date": "1925"},
    {"title": "Moby Dick", "author": "Herman Melville", "publication_date": "1851"},
    {"title": "Les Fleurs du mal", "author": "Charles Baudelaire", "publication_date": "1857"}
]

for book in books2:
    writer.add_document(
        title=book["title"],
        author=book["author"],
        publication_date=book["publication_date"]
    )
writer.commit()

<class 'whoosh.index.FileIndex'>


In [3]:
def prepare_fuzzy_query(query_string):
    # Ajouter ~ à chaque mot de la requête
    terms = query_string.split()
    fuzzy_terms = [term + "~" if term.lower() not in stop_words else term for term in terms]
    return " ".join(fuzzy_terms)

# Étape 3 : Recherche dans l'index
def search_books(query_string):
    with index.searcher() as searcher:
        # Permet de chercher dans plusieurs champs
        parser = MultifieldParser(["title", "author"], schema=index.schema)
        parser.add_plugin(FuzzyTermPlugin())  # Ajouter la recherche floue

        fuzzy_query_string = prepare_fuzzy_query(query_string)
        query = parser.parse(fuzzy_query_string)
        
        results = searcher.search(query)
        print(f"Résultats trouvés : {len(results)}")
        print(f"Votre recherche : {query}")
        for result in results:
            #print(f"Titre : {result['title']}, Auteur : {result['author']}, Date : {result['publication_date']}")
            print(result)
            print(type(result))


In [None]:
# Étape 4 : Tester la recherche
if __name__ == "__main__":
    while True:
        query = input("Tapez votre recherche (titre, auteur ou date) : ")
        if query.lower() in ["exit", "quit"]:
            break
        search_books(query)

Résultats trouvés : 1
Votre recherche : ((title:les~ OR author:les~) AND (title:fleurs~ OR author:fleurs~))
<Hit {'author': 'Charles Baudelaire', 'publication_date': '1857', 'title': 'Les Fleurs du mal'}>
<class 'whoosh.searching.Hit'>
