In [6]:
!pip install pytesseract
!pip install pdf2image
!pip install transformers
!pip install spacy
!python -m spacy download fr_core_news_sm
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr-fra


Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following

In [10]:
import pytesseract
from pdf2image import convert_from_path
import spacy
from transformers import BertTokenizer, BertForMaskedLM
import torch
from collections import Counter
import re

# Charger le modèle spaCy pour la langue française
nlp = spacy.load('fr_core_news_sm')

# Charger le modèle BERT
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
model = BertForMaskedLM.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')

# Liste de mots-clés pour chaque type de document
document_keywords = {
    "Procédure, méthode, guide": ["procédure", "méthode", "guide", "instructions", "étapes", "processus", "fonctionnement"],
    "CV": ["compétences", "expérience", "formation", "objectif", "parcours", "CV", "candidat", "recrutement"],
    "Lettre de motivation": ["motivation", "candidature", "poste", "entreprise", "recrutement", "compétences", "intérêt","travailler"],
    "Contrat": ["accord", "engagement", "clause", "signature", "responsabilité", "conditions", "parties"],
    "Fiche de paie": ["salaire", "net", "brut", "cotisations", "charges sociales", "rémunération"],
    "Devis": ["estimation", "prix", "proposition", "offre", "coût", "service", "quantité"],
    "Facture": ["facture", "paiement", "montant", "produit", "service", "date", "référence"],
    "Bon de commandes": ["commande", "produit", "quantité", "prix", "bon", "livraison"],
    "Attestation": ["attestation", "certification", "déclaration", "preuve", "confirmation"]
}

# Fonction pour traiter l'image et extraire le texte
def ocr_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image, lang='fra')
    return text

# Fonction pour nettoyer et prétraiter le texte
def preprocess_text(text):
    # Enlever les chiffres, la ponctuation et convertir en minuscules
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Lemmatisation avec spaCy
    doc = nlp(text)
    lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_stop])

    return lemmatized_text

# Fonction pour extraire les mots les plus fréquents
def get_most_frequent_words(text, top_n=10):
    words = text.split()
    word_counts = Counter(words)
    return word_counts.most_common(top_n)

# Fonction pour contextualiser les mots fréquents avec BERT
def contextualize_words_with_bert(words):
    inputs = tokenizer(words, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs

# Fonction pour choisir le tag du document
def choose_document_tag(frequent_words):
    # Vérifier chaque document et trouver les correspondances les plus proches
    document_tags = []

    for tag, keywords in document_keywords.items():
        common_keywords = [word for word in frequent_words if word in keywords]
        if common_keywords:
            document_tags.append((tag, len(common_keywords)))

    if document_tags:
        # Trier par nombre de correspondances
        document_tags.sort(key=lambda x: x[1], reverse=True)
        return document_tags[0][0]
    else:
        return "Document inconnu"

# Fonction principale pour gérer le flux
def process_document(pdf_path):
    # OCR pour extraire le texte du PDF
    extracted_text = ocr_pdf(pdf_path)

    # Nettoyer et prétraiter le texte
    cleaned_text = preprocess_text(extracted_text)

    # Extraire les mots les plus fréquents
    frequent_words = get_most_frequent_words(cleaned_text)

    # Choisir un tag pour le document en fonction des mots fréquents
    document_tag = choose_document_tag([word[0] for word in frequent_words])

    return document_tag

# Exemple d'appel de la fonction
pdf_path = "/content/CERINA ALLEK.pdf"
document_tag = process_document(pdf_path)
print("Le document appartient à la catégorie:", document_tag)


Some weights of BertForMaskedLM were not initialized from the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Le document appartient à la catégorie: Lettre de motivation
