# Code minimal nécessaire pour traiter un document

In [None]:
# Input/output locations used by the cells below. They are left empty here and
# have to be filled in before running the notebook (open("") fails).
DATA_FOLDER: str = ""  # folder handed to BS4Parser().process_folder
XML_INITIAL_FILE: str = ""  # file the corpus is written to as XML, then read back from
ANTI_DICT_FILE: str = ""  # tab-separated anti-dictionary file (written, then loaded with pandas)
STEMMED_REPLACEMENTS: str = ""  # tab-separated "word<TAB>stem" file (written, then loaded with pandas)

In [None]:
from index import BS4Parser, Corpus

# Parse the files found in DATA_FOLDER.
# NOTE(review): limit=20 presumably caps how many files are processed, so this
# is not necessarily the whole folder -- confirm against BS4Parser.
parser = BS4Parser()
documents = parser.process_folder(DATA_FOLDER, limit=20)

# Last expression of the cell: wrap the parsed documents in a Corpus so the
# notebook displays it.
Corpus(documents=documents)

In [None]:
# Tag mapping passed as `tags` to both the XML dump and the XML validation
# calls below, so the same names are used when writing and when reading.
STORAGE_TAGS = {
    "Corpus": "corpus",
    "documents": "bulletins",
    "Document": "bulletin",
    "Image": "image",
}

# Write the corpus to XML_INITIAL_FILE as pretty-printed XML.
with open(XML_INITIAL_FILE, "w", encoding="utf-8") as file:
    corpus_xml = Corpus(documents=documents).model_dump_xml_str_pretty(tags=STORAGE_TAGS)
    file.write(corpus_xml)

# Read the same file back and rebuild the corpus from its XML content.
with open(XML_INITIAL_FILE, "r", encoding="utf-8") as file:
    stored_xml = file.read()
    CORPUS = Corpus.model_validate_xml(stored_xml, tags=STORAGE_TAGS)

# Deep copy that the later cells filter, so CORPUS itself stays available for
# the before/after comparison.
FILTERED_CORPUS = CORPUS.model_copy(deep=True)

# Last expression of the cell: display the reloaded corpus.
CORPUS

## STANDARDISATION

In [None]:
from typing import Callable
import re

# Standardisation function.
#
# Patterns are compiled once at cell level instead of on every call.
# The apostrophe class covers the ASCII apostrophe AND the typographic one
# (U+2019): without the latter, "l’éléphant" would lose its apostrophe in the
# punctuation pass and collapse into "léléphant" instead of "l éléphant".
_APOSTROPHES = re.compile(r"['\u2019]")
_PUNCTUATION = re.compile(r"[^\w\s]")


def _standardize(x: str) -> str:
    """Return a normalised version of *x*.

    Steps, in order:
      1. strip leading/trailing whitespace and lower-case the text;
      2. replace every apostrophe (ASCII or typographic) with a space, so an
         elided article stays a separate word;
      3. delete every remaining character that is neither a word character
         nor whitespace.

    Whitespace left behind by steps 2 and 3 is not collapsed or trimmed.
    """
    lowered = x.strip().lower()
    without_apostrophes = _APOSTROPHES.sub(" ", lowered)
    return _PUNCTUATION.sub("", without_apostrophes)


# Public name used by the following cells (passed as `filter=` to apply_filter).
STANDARDIZE: Callable[[str], str] = _standardize

STANDARDIZE("L'éléphant est un animal majestueux.")  # "l éléphant est un animal majestueux"

In [None]:
# Run STANDARDIZE over the listed fields of the working copy.
# NOTE(review): apply_filter presumably updates FILTERED_CORPUS in place -- confirm.
standardized_fields = ["texte", "titre", "images.legende"]
FILTERED_CORPUS.apply_filter(standardized_fields, filter=STANDARDIZE)

## Filtrage par anti-dictionnaire

In [None]:
import pandas
# Generate the anti-dictionary file: one line per term reported by
# get_irrelevant_terms(), made of the term, a tab and a pair of double quotes.
# NOTE(review): the terms come from CORPUS, not from the standardised
# FILTERED_CORPUS -- confirm this is intended.
with open(ANTI_DICT_FILE, "w+", encoding="utf-8") as file:
    irrelevant_terms = CORPUS.token_index().get_irrelevant_terms()
    anti_dict_lines = [f"{token}\t\"\"\n" for token in irrelevant_terms]
    file.write("".join(anti_dict_lines))

# Load the file back as a header-less, tab-separated table; na_filter=False
# disables pandas' conversion of NA-looking values to NaN.
anti_dictionnaire = pandas.read_csv(
    ANTI_DICT_FILE,
    sep="\t",
    header=None,
    na_filter=False,
)

# Last expression of the cell: preview the first three rows.
anti_dictionnaire.head(3)

In [None]:
# Apply the anti-dictionary table to the listed fields of the working copy.
substituted_fields = ["texte", "titre", "images.legende"]
FILTERED_CORPUS.apply_substitutions(substituted_fields, anti_dictionnaire)

# Print the text of the first document before and after filtering, separated
# by a "---" line.
text_before = CORPUS.documents[0].texte
text_after = FILTERED_CORPUS.documents[0].texte
print(text_before, "---", text_after, sep="\n")

In [None]:
from index import spacy_lemmas, snowball_stems

# Keys of the filtered corpus' token mapping, as a list.
tokens = list(FILTERED_CORPUS.tokens().keys())

# NOTE: both names are deliberately rebound from the imported functions to
# their results; the next cell reads `spacy_lemmas` as the result. The import
# at the top of this cell restores the functions whenever the cell is re-run.
snowball_stems = snowball_stems(tokens)
spacy_lemmas = spacy_lemmas(tokens)

# Side-by-side comparison table: the word, its spaCy lemma, its Snowball stem.
comparison_columns = {
    "Token": list(snowball_stems["word"]),
    "Spacy": list(spacy_lemmas["stem"]),
    "Snowball": list(snowball_stems["stem"]),
}

# Last expression of the cell: show the last three rows of the comparison.
pandas.DataFrame(comparison_columns).tail(3)

In [None]:
# Use the spaCy result computed in the previous cell as the source of stems.
stems = spacy_lemmas

# Write one "word<TAB>stem" line per entry to the substitution file.
with open(STEMMED_REPLACEMENTS, "w+", encoding="utf-8") as file:
    word_stem_pairs = zip(stems["word"], stems["stem"])
    replacement_lines = [f"{word}\t{stem}\n" for word, stem in word_stem_pairs]
    file.write("".join(replacement_lines))

# Load the file back as a header-less, tab-separated table; na_filter=False
# disables pandas' conversion of NA-looking values to NaN.
substitutions = pandas.read_csv(
    STEMMED_REPLACEMENTS,
    sep="\t",
    header=None,
    na_filter=False,
)

# Last expression of the cell: preview of the lemma-based substitution file.
substitutions.head(3)