In [1]:
# IMPORTS
import pandas as pd
import importlib
from pathlib import Path

from utils import epg_cleaner, casen_config, spacy_config, stanza_config, ner_config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the raw spreadsheet, given relative to the working directory.
RAW_DATA_PATH = Path("Ressources/20231101_raw.xlsx")

# Read the workbook into a DataFrame; the cleaning cell below consumes `raw_df`.
raw_df = pd.read_excel(io=RAW_DATA_PATH)
# To preview the raw rows, run: raw_df.head()

In [3]:
# Reload the module so edits to epg_cleaner are picked up without a kernel restart.
importlib.reload(epg_cleaner)
from utils.epg_cleaner import EPGCleaner

# Note from the original author: empty descriptions are dropped either when
# explicitly requested (remove_empty_desc=True) or when duplicate descriptions
# are merged (merge_duplicated_desc=True), so that merged 'files_id' tuples
# only contain IDs that correspond to valid descriptions.
cleaner = EPGCleaner(
    verbose=True,
)

# Produce the cleaned frame that every NER tool below takes as input.
cleaned_df = cleaner.clean(
    raw_data=raw_df,
)
# To preview the cleaned rows, run: cleaned_df.head()

[Data Cleaning] Total rows: 9731
[Data Cleaning] Rows with missing descriptions: 3657 (37.58%)
[Data Cleaning] Removed rows with missing descriptions
[Data Cleaning] Remaining rows : 6074
[Data Cleaning] Aggregated duplicate descriptions
[Data Cleaning] Duplicates removed : 2626 (43.23%)
[Data Cleaning] Final dataset size: 3448 rows (64.57% reduction from original)
[Data Cleaning] Run in 0.10 seconds


In [4]:
# Reload the module so edits to casen_config are picked up without a kernel restart.
importlib.reload(casen_config)
from utils.casen_config import CasEN

# CasEN settings, gathered in one place so they are easy to tune.
casen_options = {
    "generate_new_corpus": False,      # assume the CasEN result already exists
    "corpus_mode": "single",           # author lists: "single", "mutiple", "collection" (TODO confirm exact spelling accepted)
    "lightmode": False,                # when on, compute only the necessary columns
    "include_tags_name": False,
    "define_label_with_grf": False,
    "grf_limit": None,                 # None keeps everything; an int sets a limit
    "remove_undefined_labels": False,
    "archiving_result": False,
    "auto_analyse": True,
    "order_dataframe": True,
    "timer": True,
    "logging": False,
    "verbose": True,
}

# Build the CasEN tool from the settings above.
c = CasEN(**casen_options)

# Run CasEN on the cleaned descriptions and preview the extracted entities.
c_df = c.run(cleaned_df)
# Optional export: c_df.to_excel("CasEN_collection.xlsx", index=False)
c_df.head()

[get_config] Config Loaded sucessfuly !
[get_files] Founds 1 .txt files.
[analyse_files] run in : 1.09s
[order] run in : 0.02s
#################################### CASEN DATAFRAME ####################################
DataFrame size : 1.38 Mo (11558225 bits), shape: (15754, 13)
Total NE founds : 15754
Unique NE founds (by NE + files_id): 14366
 ---- Labels founds ---- :
PER        : 6159 (39.09%)
Undefined  : 4816 (30.57%)
ORG        : 2669 (16.94%)
LOC        : 2110 (13.39%)
---- Graphs frequency ----:
grfpersGenerique     : 3525 (17.28%)
grfroleName          : 2419 (11.86%)
grftagNomFamille     : 2112 (10.36%)
grfpersPrenomNom     : 1789 (8.77%)
grftagPrenom         : 1727 (8.47%)
grfgeog              : 986 (4.83%)
grforgProximite      : 766 (3.76%)
grfamountPrepDuree   : 590 (2.89%)
grfplaceBatiment     : 543 (2.66%)
grftime              : 353 (1.73%)
grfgeogPhysique      : 349 (1.71%)
grfpersContextePersonne : 275 (1.35%)
grforgGenerique      : 260 (1.27%)
grftimeAdverbeTempsHeure :

Unnamed: 0,NE,label,files_id,pos,method,desc,grf_1,grf_2,grf_3,grf_4,grf_5,grf_6,grf_7
0,Nora,PER,"(0,)","(206, 210)",casEN,"leurs, elle est persuadée que Nora a divulgué ...",grfpersGenerique,,,,,,
1,Marcel,PER,"(0,)","(353, 359)",casEN,oue toujours pas où se trouve Marcel.,grfpersGenerique,,,,,,
2,enquêteurs,Undefined,"(1,)","(420, 430)",casEN,"tentant de le débusquer, les enquêteurs décou...",grfroleName,,,,,,
3,directrice,Undefined,"(1,)","(108, 118)",casEN,la galerie Delandin. Mais la directrice et fo...,grfroleName,,,,,,
4,Selma Berrayah,PER,"(1,)","(217, 231)",casEN,ire Magellan et le lieutenant Selma Berrayah s...,grfpersPrenomNom,grftagPrenom,grftagNomFamille,,,,


In [5]:
# Reload the module so edits to spacy_config are picked up without a kernel restart.
importlib.reload(spacy_config)
from utils.spacy_config import SpaCyConfig

# spaCy settings, gathered in one place so they are easy to tune.
spacy_options = {
    "production_mode": False,
    "explode_ids": False,
    "order_dataframe": True,
    "auto_analyse": True,
    "timer": True,
    "logging": False,
    "verbose": True,
}

# Build the spaCy wrapper from the settings above.
spc = SpaCyConfig(**spacy_options)

# Run spaCy NER on the cleaned descriptions and preview the result.
# Optional export: sp_df.to_excel("spacy.xlsx", index=False)
sp_df = spc.run(cleaned_df)
sp_df.head()

[load config] Config Loaded sucessfuly !
[spaCy] spaCy version: 3.8.7
[spaCy] spaCy model: core_news_sm
order in : 0.00s
#################################### CASEN DATAFRAME ####################################
DataFrame size : 0.91 Mo (7664541 bits), shape: (14147, 7)
Total NE founds : 14147
Unique NE founds (by NE + files_id): 12658
 ---- Labels founds ---- :
PER        : 6577 (46.49%)
LOC        : 4102 (29.00%)
MISC       : 2684 (18.97%)
ORG        : 784 (5.54%)
#########################################################################################
self_analyse in : 0.02s
SpaCy DataFrame shape: (14147, 7)
run in : 51.50s


Unnamed: 0,titles,NE,label,desc,method,files_id,pos
0,Faster than fear,Marcel,PER,oue toujours pas où se trouve Marcel.,spaCy,"(0,)","(353, 359)"
1,Faster than fear,Sunny,LOC,i-ci demande à ne parler qu'à Sunny. D'ailleur...,spaCy,"(0,)","(164, 169)"
2,Faster than fear,Haffner,PER,"us rien à voir avec l'affaire Haffner, mais ce...",spaCy,"(0,)","(116, 123)"
3,Faster than fear,Haffner,PER,"er à... elle. En garde à vue, Haffner n'avoue ...",spaCy,"(0,)","(311, 318)"
4,Faster than fear,Nora,LOC,"leurs, elle est persuadée que Nora a divulgué ...",spaCy,"(0,)","(206, 210)"


In [6]:
# Reload the module so edits to stanza_config are picked up without a kernel restart.
importlib.reload(stanza_config)
from utils.stanza_config import StanzaConfig

# Stanza settings, gathered in one place so they are easy to tune.
# NOTE(review): use_gpu=True is requested, yet the log captured for this cell
# reports "Using device: cpu" — confirm whether a GPU is actually available.
stanza_options = {
    "use_gpu": True,
    "production_mode": False,
    "order_dataframe": True,
    "timer": True,
    "logging": False,
    "verbose": True,
}

# Build the Stanza wrapper from the settings above.
stz = StanzaConfig(**stanza_options)

# Run Stanza NER on the cleaned descriptions and preview the result.
stz_df = stz.run(cleaned_df)
stz_df.head()

2025-12-03 16:56:02 INFO: Loading these models for language: fr (French):
| Processor | Package            |
----------------------------------
| tokenize  | combined           |
| mwt       | combined           |
| ner       | wikinergold_charlm |

2025-12-03 16:56:02 INFO: Using device: cpu
2025-12-03 16:56:02 INFO: Loading: tokenize


[load config] Config Loaded sucessfuly !


2025-12-03 16:56:11 INFO: Loading: mwt
2025-12-03 16:56:11 INFO: Loading: ner
2025-12-03 16:56:16 INFO: Done loading processors!


[stanza] Stanza version: 1.10.1
[stanza] Pipeline lang: fr
order in : 0.00s
Stanza DataFrame shape: (13709, 7)
run in : 347.00s


Unnamed: 0,titles,NE,label,desc,method,files_id,pos
0,Faster than fear,Marcel,PER,oue toujours pas où se trouve Marcel.,stanza,"(0,)","(353, 359)"
1,Faster than fear,Haffner,PER,"er à... elle. En garde à vue, Haffner n'avoue ...",stanza,"(0,)","(311, 318)"
2,Faster than fear,Nora,PER,"leurs, elle est persuadée que Nora a divulgué ...",stanza,"(0,)","(206, 210)"
3,Faster than fear,Sunny,MISC,a pu prouver son innocence et Sunny a été susp...,stanza,"(0,)","(35, 40)"
4,Faster than fear,Ralf,PER,Ralf a pu prouver son innocence et,stanza,"(0,)","(0, 4)"


In [None]:
# Reload the module so edits to ner_config are picked up without a kernel restart.
importlib.reload(ner_config)
from utils.ner_config import NerConfig

# Configure how the per-tool results are merged.
ner = NerConfig(
    process_priority_merge=True,
    labels_priority=["PER"],  # labels on which the priority rule is applied
    process_casen_opti=True,
    remove_duplicated_entity_per_desc=True,
    verbose=True,
)

# Merge the CasEN, spaCy and Stanza frames (in that order) and apply the
# corrections loaded from the given workbook.
ner_df = ner.run(
    cleaned_df,
    [c_df, sp_df, stz_df],
    correction="Ressources/20231101_correction.xlsx",
)

# Export first: a notebook cell only renders the value of its LAST expression,
# so the preview must come after the export or it is silently discarded.
ner_df.to_excel("CasEN_SpaCy_Stanza_Priority(PER).xlsx", index=False)
ner_df.head(10)

[load config] Config Loaded sucessfuly !
[_merge] Shape of every DataFrame : [(15754, 13), (14147, 7), (13709, 7)]
[keep precise graphs] CasENOpti : 19 lines updated
[apply_correction] 22006 corrections chargées.
[apply_correction] Shape final: (30529, 17)
