<a href="https://colab.research.google.com/github/Thiery45/-portfolio-/blob/main/Devoir.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installation des bibliothèques nécessaires**

In [None]:

# Notebook dependencies: PyPDF2 (PDF text extraction), scikit-learn (cosine
# similarity), pandas + openpyxl (Excel export of the results).
# NOTE(review): nltk is not used by any code visible in this notebook —
# confirm whether it can be dropped from this list.
!pip install PyPDF2 nltk scikit-learn pandas openpyxl

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
 # Installation des dépendances
!pip install spacy fr_core_news_sm PyPDF2 pandas scikit-learn
!python -m spacy download fr_core_news_sm

# Exécuter tout le code en une cellule

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
 import logging
import math
import os
import subprocess
import sys

import pandas as pd
import PyPDF2
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Logger configuration: timestamped messages, INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load spaCy's French model, downloading it first when it is not installed.
try:
    nlp = spacy.load("fr_core_news_sm")
except OSError:
    # Run the download with the interpreter that executes this notebook: a bare
    # "python" resolved through PATH may belong to another environment, in
    # which case the model would be installed where spaCy cannot find it.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "fr_core_news_sm"],
        check=True,
    )
    nlp = spacy.load("fr_core_news_sm")

# Raise spaCy's per-text length limit to 2.5 million characters.
nlp.max_length = 2_500_000


# Fonctions de base (adaptées au français)

def count(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens (the notebook passes a list of
            lemma strings).

    Returns:
        dict mapping every distinct token to its number of occurrences.
        An empty input yields an empty dict.
    """
    from collections import Counter  # local import: keeps the block self-contained

    # Single pass over the tokens, instead of one full `list.count` scan per
    # distinct token (which is quadratic on large documents).
    return dict(Counter(tokens))

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of the pages, joined with newlines. An empty string is
        returned when the path does not exist. When reading fails part-way,
        the text of the pages read so far is returned. Errors are logged,
        never raised.
    """
    if not os.path.exists(pdf_path):
        logging.error(f"Le fichier {pdf_path} n'existe pas.")
        return ""
    pages = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # Guard against a page yielding no text (None or empty).
                page_text = page.extract_text() or ""
                if page_text:
                    pages.append(page_text)
    except Exception as e:
        # Best effort: keep whatever was extracted before the failure.
        logging.error(f"Erreur lors de la lecture du PDF {pdf_path}: {e}")
    # Join with a newline so the last word of a page is not glued to the first
    # word of the next one, which would create a bogus token downstream.
    return "\n".join(pages)

def split_text(text, max_length=1_000_000):
    """Split a text into consecutive chunks of at most ``max_length`` characters.

    Chunks are cut at fixed offsets, so a word may be split across two chunks.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk; must be >= 1.

    Returns:
        List of chunks in order; every chunk except possibly the last has
        exactly ``max_length`` characters. An empty text yields ``[]``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    # A zero step already made range() raise; a negative one silently produced
    # an empty list, i.e. the whole text was dropped. Reject both explicitly.
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    return [
        text[start : start + max_length]
        for start in range(0, len(text), max_length)
    ]


# PDF processing

# NOTE(review): the first file name contains a space before "_2" — confirm it
# matches the name of the uploaded file.
pdf_paths = ["/content/pdf _2.pdf", "/content/pdf_1.pdf"]  # Updated paths

documents = []        # one list of lemmas per successfully processed PDF
processed_paths = []  # path of each entry of `documents`, in the same order

for path in pdf_paths:
    text = extract_text_from_pdf(path)
    if not text.strip():
        logging.warning(f"Le fichier {path} est vide ou illisible.")
        continue
    tokens = []
    # Process the text chunk by chunk (see split_text) rather than in a single
    # spaCy call.
    for part in split_text(text):
        doc = nlp(part)
        # Keep lower-cased lemmas; drop stop words, punctuation and
        # whitespace-only tokens.
        tokens.extend(
            token.lemma_.lower()
            for token in doc
            if not token.is_stop
            and not token.is_punct
            and token.text.strip()
        )
    if not tokens:
        # Previously such a document was dropped without any trace.
        logging.warning(f"Aucun token exploitable dans {path}.")
        continue
    documents.append(tokens)
    processed_paths.append(path)
    logging.info(f"PDF traité : {path} ({len(tokens)} tokens)")

# Make sure at least two documents are available
if len(documents) < 2:
    logging.error("Pas assez de documents valides pour la comparaison.")
    # `exit()` is the interactive-shell helper: in a Jupyter kernel it requests
    # a kernel shutdown instead of reliably stopping the cell, so the code
    # below could still run. Raising SystemExit stops execution here, both in
    # a notebook and in a plain script.
    raise SystemExit(1)

# Later code indexes `pdf_paths` and `documents` by the same position; keep
# only the paths that were actually processed so both stay aligned.
pdf_paths = processed_paths

# Calculs TF-IDF

def Formule_TF(tokens):
    """Term frequency of every word of a document.

    Args:
        tokens: List of the document's tokens.

    Returns:
        dict mapping each distinct token to its occurrence count divided by
        the total number of tokens; an empty dict for an empty document.
    """
    n_tokens = len(tokens)
    if not n_tokens > 0:
        return {}
    occurrences = count(tokens)
    return {term: hits / n_tokens for term, hits in occurrences.items()}

def Formule_IDF(docs):
    """Smoothed inverse document frequency of every word of the corpus.

    Uses ``idf(w) = ln((1 + N) / (1 + df(w))) + 1`` where ``N`` is the number
    of documents and ``df(w)`` the number of documents containing ``w``.

    The previous formula, ``ln(N / (1 + df) + 1e-6)``, produced a negative
    weight for words present in every document and a weight of about zero for
    words present in a single one of two documents, so document-specific words
    had no influence on the similarity. The smoothed form is always >= 1 and
    decreases as a word gets more common.

    Args:
        docs: List of documents, each a list of tokens.

    Returns:
        dict mapping each distinct word of the corpus to its IDF weight; an
        empty dict for an empty corpus.
    """
    n_docs = len(docs)
    # Count each word once per document; sets avoid a linear list scan per word.
    doc_freq = {}
    for vocabulary in (set(doc) for doc in docs):
        for word in vocabulary:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return {
        word: math.log((1 + n_docs) / (1 + freq)) + 1
        for word, freq in doc_freq.items()
    }

def Formule_TF_IDF(tf, idf):
    """Combine term frequencies with IDF weights.

    Args:
        tf: dict mapping each word of one document to its term frequency.
        idf: dict mapping words to their IDF weight; it must contain every
            word of ``tf`` (a missing word raises KeyError).

    Returns:
        dict mapping each word of ``tf`` to ``tf[word] * idf[word]``.
    """
    weights = {}
    for term, frequency in tf.items():
        weights[term] = frequency * idf[term]
    return weights

# Build the TF-IDF weights of every document
if not all(len(doc) > 0 for doc in documents):
    logging.error("Un des documents est vide après traitement.")
    # `exit()` is the interactive-shell helper: in a Jupyter kernel it requests
    # a kernel shutdown instead of reliably stopping the cell. Raising
    # SystemExit stops execution here, in a notebook and in a plain script.
    raise SystemExit(1)

tf_values = [Formule_TF(doc) for doc in documents]
idf_values = Formule_IDF(documents)
tfidf_values = [Formule_TF_IDF(tf, idf_values) for tf in tf_values]


# Cosine similarity (the PWI is computed further down)

# Shared vocabulary, sorted so that every vector uses the same component order;
# a word absent from a document gets weight 0.
unique_words = sorted(set(word for doc in documents for word in doc))
vectors = [[tfidf.get(word, 0) for word in unique_words] for tfidf in tfidf_values]

if len(vectors) < 2:
    logging.error("Impossible de calculer la similarité : pas assez de documents.")
    # `exit()` is the interactive-shell helper: in a Jupyter kernel it requests
    # a kernel shutdown instead of reliably stopping the cell, so the indexing
    # below could still run and fail. Raising SystemExit stops execution here.
    raise SystemExit(1)

# Only the first two documents are compared.
similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Calcul du Pairwise Index (PWI)
def calculate_pwi(doc1, doc2):
    """Share of distinct tokens that two documents have in common.

    Computed as the number of distinct tokens present in both documents
    divided by the number of distinct tokens present in at least one of them.

    Args:
        doc1: Iterable of tokens of the first document.
        doc2: Iterable of tokens of the second document.

    Returns:
        The ratio as a float, or 0 when both documents are empty.
    """
    vocab1, vocab2 = set(doc1), set(doc2)
    combined = vocab1 | vocab2
    if not combined:
        return 0
    return len(vocab1 & vocab2) / len(combined)

# PWI of the first two documents.
pwi_score = calculate_pwi(documents[0], documents[1])

# Export the results

# One-row table: the two file paths, both scores, and a "Relation" flag that is
# "Oui" when either score is strictly above 0.5 and "Non" otherwise.
result_df = pd.DataFrame({
    "Fichier 1": [pdf_paths[0]],
    "Fichier 2": [pdf_paths[1]],
    "Similarité Cosinus": [similarity],
    "Pairwise Index (PWI)": [pwi_score],
    "Relation": ["Oui" if similarity > 0.5 or pwi_score > 0.5 else "Non"]
})

# Written relative to the current working directory.
output_file = "comparaison_pdfs.xlsx"
try:
    result_df.to_excel(output_file, index=False)
    logging.info(f"Résultats exportés dans {output_file}")
except ModuleNotFoundError:
    # Only a missing module is handled here; any other export error propagates.
    logging.error("openpyxl non installé. Essayez : pip install openpyxl")

# Detailed report: the five highest-weighted terms of each document
detail_lines = ["\nDétail des calculs :"]
for i, (path, vec) in enumerate(zip(pdf_paths, vectors)):
    detail_lines.append(f"PDF {i+1} - {path}:")
    # Pair every vocabulary word with its weight and keep the 5 largest.
    top_terms = sorted(zip(unique_words, vec), key=lambda x: x[1], reverse=True)[:5]
    detail_lines.append("Top 5 termes significatifs :")
    for term, score in top_terms:
        detail_lines.append(f"{term}: {score:.4f}")

# Assemble the report once instead of growing a string inside the loops.
detail_log = "\n".join(detail_lines)
logging.info(detail_log)

# Download hint — only announced when the export actually produced the file,
# so a failed export is not followed by a misleading message.
if os.path.isfile(output_file):
    logging.info(f"Téléchargez le fichier ici : {output_file}")
else:
    logging.warning(f"Le fichier {output_file} n'a pas été créé.")


