In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from termcolor import colored
import os
import numpy as np
from collections import defaultdict
import re

In [2]:
def dfs(noeud, graphe, visite, composante):
    """Iterative depth-first traversal starting at ``noeud``.

    Source: https://chat.mistral.ai/chat/d52a01f2-0186-4d4b-b46a-279ad43bdd3f

    Args:
        noeud: Node the traversal starts from.
        graphe: Mapping from a node to an iterable of its neighbours; it is
            read with ``graphe[node]``.
        visite: Set of already visited nodes. It is updated in place.
        composante: List receiving every newly visited node, in visiting
            order. It is updated in place.

    Returns:
        None; the results are delivered through ``visite`` and ``composante``.
    """
    a_explorer = [noeud]
    while a_explorer:
        courant = a_explorer.pop()
        # A node may have been pushed several times before being processed.
        if courant in visite:
            continue
        visite.add(courant)
        composante.append(courant)
        a_explorer.extend(v for v in graphe[courant] if v not in visite)

In [3]:
def trouver_composantes_connexes(paires):
    """Group the nodes linked by ``paires`` into connected components.

    Args:
        paires: Iterable of 2-element items ``(a, b)``, each one describing an
            undirected edge between ``a`` and ``b``.

    Returns:
        A list of components, each one being the sorted list of its nodes.
        Components appear in the order their first node was inserted in the
        graph.
    """
    # Undirected adjacency lists: every edge is recorded in both directions.
    adjacence = defaultdict(list)
    for gauche, droite in paires:
        adjacence[gauche].append(droite)
        adjacence[droite].append(gauche)

    deja_vus = set()
    resultat = []

    for sommet in adjacence:
        if sommet in deja_vus:
            continue
        # Unvisited node: it opens a new component, filled in by dfs.
        membres = []
        dfs(sommet, adjacence, deja_vus, membres)
        membres.sort()
        resultat.append(membres)

    return resultat

In [4]:
def clean_double(directory, threshold=0.95):
    """Delete near-duplicate text files found in ``directory``.

    Every entry of the directory is read as UTF-8 text and vectorised with
    TF-IDF. Two files whose pairwise similarity is at least ``threshold`` are
    considered duplicates. Duplicate pairs are merged transitively into groups
    (via ``trouver_composantes_connexes``) and, in each group, every file
    except the first one (in sorted file-name order) is removed from disk.

    Args:
        directory: Path of the folder to deduplicate.
        threshold: Minimum similarity for two files to count as duplicates.
            Defaults to 0.95, the value that was previously hard-coded.

    Returns:
        None. Read and delete failures (``OSError``) are reported on stdout in
        red and the file concerned is skipped.
    """
    # Names and contents are kept in two parallel lists. A file that cannot be
    # read is left out of BOTH, so that row i of the similarity matrix always
    # refers to file_names[i]; indexing the raw directory listing instead
    # would shift the indices and delete the wrong file.
    file_names = []
    articles = []
    # Sorted so that the copy that survives does not depend on the arbitrary
    # order returned by os.listdir.
    for txt in sorted(os.listdir(directory)):
        try:
            with open(os.path.join(directory, txt), 'r', encoding='utf-8') as f:
                content = f.read()
        except OSError:
            print(colored(f"Erreur lors de l'ouverture/lecture du fichier {txt}.", 'red'))
            continue
        file_names.append(txt)
        articles.append(content)

    # With fewer than two readable files there is nothing to compare, and the
    # vectoriser rejects an empty corpus.
    if len(articles) < 2:
        return

    # Pairwise similarity between all texts. TfidfVectorizer L2-normalises each
    # row by default, so the product below is the cosine similarity.
    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(articles)
    pairwise_similarity = tfidf * tfidf.T
    arr = pairwise_similarity.toarray()
    # A text is always identical to itself: ignore the diagonal.
    np.fill_diagonal(arr, 0)
    index_doublon = np.transpose((arr >= threshold).nonzero())
    # Merge duplicate pairs into groups by following transitive links.
    groupe_doublon = trouver_composantes_connexes(index_doublon)

    for groupe in groupe_doublon:
        # Each group is a sorted list of indices: keep the first file, delete
        # the others.
        for idx in groupe[1:]:
            name = file_names[idx]
            try:
                os.remove(os.path.join(directory, name))
            except OSError:
                print(colored(f"Erreur lors de la suppression du fichier {name}", 'red'))
            
   

In [5]:
# Deduplicate every monthly folder, named MM_YYYY, for the years 2019 to 2024.
for year in range(2019, 2025):
    path = 'Sources/Europresse/txt/'
    for month in range(1, 13):
        month = f"{month:02d}"  # zero-padded month number, e.g. '03'
        folder = f"{path}{month}_{year}"
        clean_double(folder)
        print(colored(f"Dossier {folder} trié", 'green'))

[32mDossier Sources/Europresse/txt/01_2019 trié[0m
[32mDossier Sources/Europresse/txt/02_2019 trié[0m
[32mDossier Sources/Europresse/txt/03_2019 trié[0m
[32mDossier Sources/Europresse/txt/04_2019 trié[0m
[32mDossier Sources/Europresse/txt/05_2019 trié[0m
[32mDossier Sources/Europresse/txt/06_2019 trié[0m
[32mDossier Sources/Europresse/txt/07_2019 trié[0m
[32mDossier Sources/Europresse/txt/08_2019 trié[0m
[32mDossier Sources/Europresse/txt/09_2019 trié[0m
[32mDossier Sources/Europresse/txt/10_2019 trié[0m
[32mDossier Sources/Europresse/txt/11_2019 trié[0m
[32mDossier Sources/Europresse/txt/12_2019 trié[0m
[32mDossier Sources/Europresse/txt/01_2020 trié[0m
[32mDossier Sources/Europresse/txt/02_2020 trié[0m
[32mDossier Sources/Europresse/txt/03_2020 trié[0m
[32mDossier Sources/Europresse/txt/04_2020 trié[0m
[32mDossier Sources/Europresse/txt/05_2020 trié[0m
[32mDossier Sources/Europresse/txt/06_2020 trié[0m
[32mDossier Sources/Europresse/txt/07_2020 tr

In [13]:
def clean_file(fpath):
    """Strip boilerplate fragments from a text file, rewriting it in place.

    The file is read as UTF-8, every pattern of ``motifs`` is removed in order,
    and the result is written back to the same path as UTF-8.

    Args:
        fpath: Path of the text file to clean.

    Returns:
        None. If the file does not exist, a message is printed and nothing is
        written.
    """
    try:
        # Explicit encoding: the platform default is locale-dependent and can
        # mangle accented characters.
        with open(fpath, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error 404 {fpath} not found :(")
        return

    # Parentheses that belong to the text are escaped; left bare they are
    # regex groups and the literal "(s)" / "(site web)" would never match.
    # The un-parenthesised spellings ("Illustrations", "site web", ...) that
    # the bare patterns used to match are still accepted.
    # NOTE(review): the markers are presumably the ones found in the
    # Europresse exports - confirm against a sample file.
    motifs = [r"Cet article est paru dans (.*?)(?:\.|$|\n)",
            r"Illustration(?:\(s\)|s)",
            r"\(site web\)|site web",
            r"Voir aussi :(.*?)(?:\.|$|\n)",
            r"Note(?:\(s\)|s) :(.*?)(?:\.|$|\n)",
            r"Encadré(?:\(s\)|s) :(.*?)(?:\.|$|\n)",
            # Phone numbers.
            # NOTE(review): only the four \d{1,4} parts are mandatory, so any
            # run of 4+ digits (a year such as 2019, for instance) is removed
            # as well - confirm this is intended.
            r"(\+?\d{1,3}[-.\s]?)?(\(?\d{1,4}\)?[-.\s]?)?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,4}",
            # Web sites.
            r"(https?://|www\.)[^\s/$.?#].[^\s]*",
            r"Certificat émis le (.*?)(?:\.|$|\n)",
            r"\bfr\b"]

    # Order matters: each substitution works on the output of the previous one.
    for motif in motifs:
        content = re.sub(motif, '', content)

    with open(fpath, "w", encoding="utf-8") as f:
        f.write(content)

In [11]:
def clean_directory(dpath):
    """Run ``clean_file`` on every entry listed in ``dpath``.

    Args:
        dpath: Path of the folder whose entries are cleaned.

    Returns:
        None.
    """
    for entree in os.listdir(dpath):
        clean_file(os.path.join(dpath, entree))

In [12]:
# Clean every monthly folder, named MM_YYYY, for the years 2019 to 2024.
for year in range(2019, 2025):
    path = 'Sources/Europresse/txt/'
    for month in range(1, 13):
        month = f"{month:02d}"  # zero-padded month number, e.g. '03'
        folder = f"{path}{month}_{year}"
        clean_directory(folder)
        print(colored(f"Dossier {folder} nettoyé", 'green'))

[32mDossier Sources/Europresse/txt/01_2019 trié[0m
[32mDossier Sources/Europresse/txt/02_2019 trié[0m
[32mDossier Sources/Europresse/txt/03_2019 trié[0m
[32mDossier Sources/Europresse/txt/04_2019 trié[0m
[32mDossier Sources/Europresse/txt/05_2019 trié[0m
[32mDossier Sources/Europresse/txt/06_2019 trié[0m
[32mDossier Sources/Europresse/txt/07_2019 trié[0m
[32mDossier Sources/Europresse/txt/08_2019 trié[0m
[32mDossier Sources/Europresse/txt/09_2019 trié[0m
[32mDossier Sources/Europresse/txt/10_2019 trié[0m
[32mDossier Sources/Europresse/txt/11_2019 trié[0m
[32mDossier Sources/Europresse/txt/12_2019 trié[0m
[32mDossier Sources/Europresse/txt/01_2020 trié[0m
[32mDossier Sources/Europresse/txt/02_2020 trié[0m
[32mDossier Sources/Europresse/txt/03_2020 trié[0m
[32mDossier Sources/Europresse/txt/04_2020 trié[0m
[32mDossier Sources/Europresse/txt/05_2020 trié[0m
[32mDossier Sources/Europresse/txt/06_2020 trié[0m
[32mDossier Sources/Europresse/txt/07_2020 tr