In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import sklearn
import os
import PyPDF2
import pdfplumber
import fitz
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

Collecte manuelle et préparation des données

In [14]:
from concurrent.futures import ThreadPoolExecutor

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file using pdfplumber."""
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def preprocess_text(text):
    """Clean the extracted text for easier processing."""
    text = re.sub(r'\n+', '\n', text)  # Remplacer les sauts de ligne multiples par un seul
    text = re.sub(r'\s+', ' ', text)  # Remplacer les espaces multiples par un seul
    return text

def extract_information(text):
    """Attempt to extract title, author, date, and other info from the text."""
    # Rechercher les sections du texte où se trouvent les informations
    title = re.search(r"(Title|Titre):?\s*(.*)", text, re.IGNORECASE)
    if title:
        title = title.group(2)
    else:
        # Si la première méthode ne fonctionne pas, on essaie de prendre la première ligne
        title = text.split('\n')[0][:100]  # Prendre les premiers 100 caractères comme titre

    author = re.search(r"(Author|Auteur):?\s*(.*)", text, re.IGNORECASE)
    if author:
        author = author.group(2)
    else:
        author = "Auteur non trouvé"

    # Extraction de la date (en supposant un format standard comme YYYY-MM-DD, DD/MM/YYYY, etc.)
    date = re.search(r"(\b\d{4}[-/]\d{2}[-/]\d{2}\b|\b\d{2}[-/]\d{2}[-/]\d{4}\b)", text)
    if date:
        date = date.group(0)
    else:
        date = "Date non trouvée"

    # Extraction du contenu après les premières informations trouvées
    content_start = text.find(title) + len(title) if title else 0
    content = text[content_start:].strip()

    return {
        "title": title.strip(),
        "author": author.strip(),
        "date": date.strip(),
        "content": content
    }

def process_single_pdf(pdf_file, pdf_folder):
    pdf_path = os.path.join(pdf_folder, pdf_file)
    text = extract_text_from_pdf(pdf_path)
    cleaned_text = preprocess_text(text)
    info = extract_information(cleaned_text)
    info["file_name"] = pdf_file  # Ajouter le nom du fichier pour référence
    return info

def process_pdfs(pdf_folder):
    pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]
    results = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_single_pdf, pdf_file, pdf_folder) for pdf_file in pdf_files]
        for future in futures:
            results.append(future.result())

    # Créer un DataFrame à partir des résultats
    df = pd.DataFrame(results)
    return df

pdf_folder_path = "articles et bouquins"
pdf_data_df = process_pdfs(pdf_folder_path)


In [2]:
# Afficher les résultats
pdf_data_df = pd.read_csv("corpus.csv")
pdf_data_df.head()

Unnamed: 0,title,author,date,content,file_name
0,Une am´elioration de la convergence de la m´et...,Email addresses: fatimabouyghf3@gmail.com(F.BO...,Date non trouvée,"LabMIA-SI, University of Mohammed V Rabat, Mor...",Article_11.pdf
1,L’am´elioration de la convergence de la m´etho...,Email address: fatimabouyghf3@gmail.com(F.BOUY...,Date non trouvée,e d’un seul ou plusieurs second membres F. BOU...,Article_22.pdf
2,Une approche unifi´ee pour les m´ethodes de so...,Email addresses: fatimabouyghf3@gmail.com(F.BO...,Date non trouvée,"lin´eaires F. BOUYGHFa,b, A. MESSAOUDIa, H. SA...",Article_33.pdf
3,1 Appendix ThecoefficientmatrixAandtheright–ha...,Auteur non trouvé,Date non trouvée,onjugate GradientMethod” by Hao Ji and Yaohang...,BFBCGappendix.pdf
4,Electronic Transactions on Numerical Analysis....,Auteur non trouvé,Date non trouvée,"ity Copyright2009, Kent State University. htt...",bloc sadok messaoudi.pdf


In [16]:
# Exporter les données vers un fichier CSV si nécessaire
pdf_data_df.to_csv("corpus.csv", index=False)

In [3]:
# Télécharger les stop words de nltk si ce n'est pas déjà fait
nltk.download('stopwords')
nltk.download('punkt')

# Obtenir les stop words en français et en anglais
stop_words = set(stopwords.words('french') + stopwords.words('english'))

def clean_text(text):
    """Nettoyer et prétraiter le texte."""
    if not isinstance(text, str):
        return ""
    # 1. Suppression des balises HTML
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # 2. Mettre en minuscules
    text = text.lower()
    
    # 3. Suppression de la ponctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    
    # 4. Tokenisation
    tokens = word_tokenize(text)
    
    # 5. Suppression des stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    # 6. Rejoindre les tokens nettoyés
    cleaned_text = " ".join(tokens)
    return cleaned_text

# Appliquer le nettoyage sur les colonnes 'title' et 'content'
pdf_data_df['cleaned_title'] = pdf_data_df['title'].apply(clean_text)
pdf_data_df['cleaned_content'] = pdf_data_df['content'].apply(clean_text)

# Afficher les premières lignes pour vérifier le nettoyage
pdf_data_df[['cleaned_title', 'cleaned_content']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Maxim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,cleaned_title,cleaned_content
0,am´elioration convergence m´ethode idr f bouyg...,labmia si university mohammed v rabat morocco ...
1,’ am´elioration convergence m´ethode bicgstab ...,e ’ seul plusieurs second membres f bouyghfa b...
2,approche unifi´ee m´ethodes sous espace krylov...,lin´eaires f bouyghfa b messaoudia h sadokb la...
3,1 appendix thecoefficientmatrixaandtheright–ha...,onjugate gradientmethod ” hao ji yaohangli 1 a...
4,electronic transactions numerical analysis etn...,ity copyright2009 kent state university http ...


In [4]:
# Tokeniser le contenu de chaque document
corpus = pdf_data_df['cleaned_content'].tolist()
tokenized_corpus = [word_tokenize(doc) for doc in corpus]

Indexation et recherche de base

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Utiliser les stopwords en français et en anglais
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
tfidf_matrix = vectorizer.fit_transform(corpus)  # TF-IDF matrix

In [6]:
def boolean_search(query, inverted_index, operation='AND'):
    query_words = query.lower().split()
    result = set(inverted_index[query_words[0]])
    
    for word in query_words[1:]:
        if operation == 'AND':
            result = result.intersection(inverted_index[word])
        elif operation == 'OR':
            result = result.union(inverted_index[word])
        elif operation == 'NOT':
            result = result.difference(inverted_index[word])
    
    return result

In [7]:
from collections import defaultdict

# Créer un index inversé
inverted_index = defaultdict(set)

for doc_id, doc in enumerate(tokenized_corpus):
    for word in doc:
        inverted_index[word].add(doc_id)

# Tester la fonction boolean_search
queries = ["krylov", "algorithm", "linear AND systems", "nonexistent OR krylov", "linear NOT systems"]

for query in queries:
    result = boolean_search(query, inverted_index)
    print(f"Query: {query}")
    print(f"Result: {result}\n")

Query: krylov
Result: {0, 1, 2, 4, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 23, 24, 26, 32, 35, 39, 42, 43, 47}

Query: algorithm
Result: {0, 1, 2, 4, 5, 7, 8, 9, 10, 12, 14, 15, 17, 19, 20, 22, 24, 26, 32, 33, 34, 35, 38, 39, 42, 43, 46, 47}

Query: linear AND systems
Result: set()

Query: nonexistent OR krylov
Result: set()

Query: linear NOT systems
Result: set()



Amélioration de la pertinence

In [9]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Initialiser le modèle BM25
bm25 = BM25Okapi(tokenized_corpus)

# Fonction pour classer les documents en fonction d'une requête
def bm25_search(query, bm25, corpus_df, top_n=5):
    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)
    corpus_df['bm25_score'] = scores
    # Obtenir les top n résultats triés par score
    return corpus_df.sort_values(by='bm25_score', ascending=False).head(top_n)

query = "machine learning"
top_results_bm25 = bm25_search(query, bm25, pdf_data_df)
top_results_bm25[['title', 'file_name', 'bm25_score']]


Unnamed: 0,title,file_name,bm25_score
23,"Numerische Mathematik Bd. 1, S. 29-- 37 (I 959...",Householder-Bauer1959_Article_OnCertainMethods...,2.516734
43,Numerical Algorithms20 (1999) 303–321 303 CMRH...,Sadok1999_Article_CMRHANewMethodForSolvingNons...,1.694497
24,NUMERICAL EXPERIMENTS WITH A MULTIPLE GRID AND...,IDR-Sonneveld-1980.pdf,1.674265
6,SIAMJ.MATH.ANAL. (cid:2)c 2010Societ yforIndus...,bouhamidi meshless.pdf,1.539991
14,"See discussions, stats, and author profiles fo...",doctorat gutnkecht.pdf,1.091323


In [10]:
from sentence_transformers import SentenceTransformer, util
import torch

# Charger un modèle BERT pré-entraîné
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encoder tout le contenu des documents en embeddings
corpus_embeddings = model.encode(pdf_data_df['cleaned_content'].tolist(), convert_to_tensor=True)

# Fonction pour effectuer une recherche sémantique
def semantic_search(query, model, corpus_embeddings, corpus_df, top_n=5):
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Calculer les similarités cosinus entre la requête et les documents
    cosine_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    # Obtenir les top n résultats
    top_results = torch.topk(cosine_scores, k=top_n)
    result_indices = top_results.indices.tolist()
    scores = top_results.values.tolist()
    return corpus_df.iloc[result_indices].assign(semantic_score=scores)

query = "deep learning"
top_results_semantic = semantic_search(query, model, corpus_embeddings, pdf_data_df)
top_results_semantic[['title', 'file_name', 'semantic_score']]


Unnamed: 0,title,file_name,semantic_score
31,,paper chebytchev.pdf,0.270222
36,s des programmes. Afin d’´eviter au lecteur un...,"Quarteroni, Sacco, Saleri. Methodes numeriques...",0.270222
48,de DOCTEUR EN INFORMATIQUE par Caroline LE CAL...,these.pdf,0.270222
18,. III.Series. QA188.B462003 512.9′434—dc21 200...,Generalized Inverses Theory and Applications b...,0.270222
45,quiresafixedamountofcompu- tational work at ea...,silvester[2299].pdf,0.270222


In [11]:
# Définir la variable de requête
query = "deep learning"

from sklearn.preprocessing import MinMaxScaler

# Normaliser les scores TF-IDF, BM25 et embeddings
# Calculer les scores TF-IDF, BM25 et BERT
tfidf_scores = tfidf_matrix.sum(axis=1).A1  # Somme des scores TF-IDF pour chaque document
bm25_scores = bm25.get_scores(word_tokenize(query.lower()))  # Scores BM25 pour la requête
bert_scores = util.cos_sim(model.encode(query, convert_to_tensor=True), corpus_embeddings)[0].cpu().numpy()  # Scores BERT pour la requête
scaler = MinMaxScaler()

# Reshape pour scaler
tfidf_scores_reshaped = tfidf_scores.reshape(-1, 1)
bm25_scores_reshaped = bm25_scores.reshape(-1, 1)
bert_scores_reshaped = bert_scores.reshape(-1, 1)

# Fit et transform
tfidf_scores_normalized = scaler.fit_transform(tfidf_scores_reshaped).flatten()
bm25_scores_normalized = scaler.fit_transform(bm25_scores_reshaped).flatten()
bert_scores_normalized = scaler.fit_transform(bert_scores_reshaped).flatten()

# Combiner les scores normalisés
combined_scores = (tfidf_scores_normalized + bm25_scores_normalized + bert_scores_normalized) / 3

# Ajouter les scores combinés au DataFrame
pdf_data_df['combined_score'] = combined_scores

# Trier les documents par score combiné
ranked_docs_hybrid = pdf_data_df.sort_values(by='combined_score', ascending=False)

# Afficher les résultats
ranked_docs_hybrid[['title', 'file_name', 'combined_score']]

Unnamed: 0,title,file_name,combined_score
14,"See discussions, stats, and author profiles fo...",doctorat gutnkecht.pdf,0.667923
23,"Numerische Mathematik Bd. 1, S. 29-- 37 (I 959...",Householder-Bauer1959_Article_OnCertainMethods...,0.580267
42,The Lanczos Biorthogonalization Algorithm and ...,Saad Lanczos non-symétrique.pdf,0.557234
19,SIAMJ.ScI.STAT.COMPUT. 1986SocietyforIndustria...,GMRes SAAD AND MARTIN.pdf,0.482013
11,JournalofComputationalandAppliedMathematics213...,CMRH SADOK.pdf,0.475987
8,Other Manifestations of the Schur Complement C...,BREZINSKI Other manifestations of the shur com...,0.47329
7,Linear and Multilinear Algebra ISSN: 0308-1087...,bouquin shermann morrison.pdf,0.456191
25,"JOURNAL OF APPROXIhfAl’ION THEORY 5, 137-148 (...",IGCG.pdf,0.450211
32,"Appl. Comput. Math.,V.xx,N.xx,20xx,pp.xx-xx A ...",paper_ACM_survey_modred.pdf,0.417934
40,_ SIAMJ. MATRIXANAL. APPL. () 1992 SocietyforI...,riedel1992.pdf,0.411475


Fonctionnalités de base

In [12]:
from textblob import TextBlob
from spellchecker import SpellChecker

# Initialiser le correcteur orthographique
spell = SpellChecker(language='fr')

def correct_spelling(query):
    # Utiliser TextBlob pour corriger l'orthographe de la requête
    blob = TextBlob(query)
    corrected_query = str(blob.correct())
    
    # Utiliser pyspellchecker pour suggestions de mots
    words = query.split()
    suggestions = {word: spell.candidates(word) for word in words}
    
    return corrected_query, suggestions

# Exemple d'utilisation
query = "recherh sémantique avc BERT"
corrected_query, suggestions = correct_spelling(query)
print(f"Requête corrigée: {corrected_query}")
print(f"Suggestions: {suggestions}")


Requête corrigée: richer sémantique arc BERT
Suggestions: {'recherh': {'rucher', 'ruchers', 'rocher', 'rechercha', 'recherche', 'recherché', 'rochers'}, 'sémantique': {'sémantique'}, 'avc': {'arc', 'avec', 'av.'}, 'BERT': {'vert', 'sert'}}


In [22]:
def filter_by_facet(corpus_df, author=None, date_range=None, file_name=None):
    """
    Filtrer les articles selon les facettes.
    :param author: Filtrer par nom de l'auteur.
    :param date_range: Tuple (start_date, end_date) pour filtrer par date.
    :param file_name: Filtrer par nom de fichier.
    """
    filtered_df = corpus_df
    
    # Filtrer par auteur
    if author:
        filtered_df = filtered_df[filtered_df['author'].str.contains(author, case=False, na=False)]
    
    # Filtrer par plage de dates
    if date_range:
        start_date, end_date = date_range
        # Assurez-vous que la colonne 'date' est bien au format datetime
        filtered_df['date'] = pd.to_datetime(filtered_df['date'], errors='coerce')
        filtered_df = filtered_df[(filtered_df['date'] >= start_date) & (filtered_df['date'] <= end_date)]
    
    # Filtrer par nom de fichier
    if file_name:
        filtered_df = filtered_df[filtered_df['file_name'].str.contains(file_name, case=False, na=False)]
    
    return filtered_df

# Exemple d'utilisation
filtered_results = filter_by_facet(pdf_data_df, file_name='Article_33.pdf')
filtered_results[['title', 'author', 'date', 'file_name']]

Unnamed: 0,title,author,date,file_name
2,Une approche unifi´ee pour les m´ethodes de so...,Email addresses: fatimabouyghf3@gmail.com(F.BO...,NaT,Article_33.pdf


In [29]:
# Correction de la requête
query = "recherh sémantique avc BERT"
corrected_query, suggestions = correct_spelling(query)

# Filtrage par facettes
filtered_corpus = filter_by_facet(pdf_data_df, file_name='Article_33.pdf')

# Recalculer les embeddings pour le corpus filtré
filtered_corpus_embeddings = model.encode(filtered_corpus['cleaned_content'].tolist(), convert_to_tensor=True)

# Recherches sur le corpus filtré
# Appliquer bm25_search à l'ensemble de pdf_data_df pour éviter les erreurs d'index
top_results_bm25 = bm25_search(corrected_query, bm25, pdf_data_df)

# S'assurer que top_n ne dépasse pas le nombre de documents dans le corpus filtré
top_n = min(5, len(filtered_corpus))
top_results_semantic = semantic_search(corrected_query, model, filtered_corpus_embeddings, filtered_corpus, top_n=top_n)

print("Résultats BM25 :", top_results_bm25[['title', 'bm25_score']])
print("\nRésultats Recherche Sémantique :", top_results_semantic[['title', 'semantic_score']])


Résultats BM25 :                                                title  bm25_score
0  Une am´elioration de la convergence de la m´et...         0.0
1  L’am´elioration de la convergence de la m´etho...         0.0
2  Une approche unifi´ee pour les m´ethodes de so...         0.0
3  1 Appendix ThecoefficientmatrixAandtheright–ha...         0.0
4  Electronic Transactions on Numerical Analysis....         0.0

Résultats Recherche Sémantique :                                                title  semantic_score
2  Une approche unifi´ee pour les m´ethodes de so...        0.189472


Interface utilisateur et expérience de recherche

Pour tester l'interface utilisateur, lancez le code si dessous puis sur votre navigateur rendez vous sur : http://127.0.0.1:5000

In [46]:
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS

app = Flask(__name__)
CORS(app)  # Pour permettre les requêtes cross-origin, si nécessaire

# Charger ton modèle BM25 et Sentence Transformer ici
# bm25, model, corpus_embeddings, corpus_df...

@app.route('/')
def home():
    return render_template('index.html')  # Crée un fichier index.html pour l'interface

@app.route('/search', methods=['POST'])
def search():
    query = request.json.get('query', '')
    corrected_query, _ = correct_spelling(query)  # Corrige la requête

    # Filtrer les résultats (par exemple, par auteur et date) si des filtres sont fournis
    author = request.json.get('author', None)
    date_range = request.json.get('date_range', None)
    filtered_corpus = filter_by_facet(pdf_data_df, author=author, date_range=date_range)

    # Recherche BM25 et Sémantique
    top_results_bm25 = bm25_search(corrected_query, bm25, filtered_corpus)
    top_results_semantic = semantic_search(corrected_query, model, corpus_embeddings, filtered_corpus)

    # Combine results or select one
    results = top_results_bm25[['title', 'bm25_score']].to_dict(orient='records')
    
    return jsonify(results=results)

try:
    if __name__ == '__main__':
        app.run(debug=False)
except Exception as e:
    print(f"Erreur lors du lancement de l'application : {e}")


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [20/Oct/2024 19:46:49] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [20/Oct/2024 19:46:50] "POST /search HTTP/1.1" 200 -
127.0.0.1 - - [20/Oct/2024 19:46:51] "POST /search HTTP/1.1" 200 -
127.0.0.1 - - [20/Oct/2024 19:46:51] "POST /search HTTP/1.1" 200 -
127.0.0.1 - - [20/Oct/2024 19:46:51] "POST /search HTTP/1.1" 200 -
