In [34]:
# IMPORTS
import pandas as pd
import importlib
from pathlib import Path

from utils import epg_cleaner, casen_config, spacy_config, stanza_config, ner_config2

## Step 1:  Data Retrieval and Preprocessing

The preprocessing stage serves three main purposes:

1. **Data Cleaning**  
   Entries that are not relevant to the task are systematically removed.  
   Specifically, Electronic Program Guides (EPGs) without textual descriptions are discarded.

2. **Data Consolidation**  
   EPGs sharing identical descriptions are merged into a single record.  
   This step minimizes redundancy and reduces the overall computational workload in subsequent stages.

3. **Provenance Tracking**  
   To preserve traceability, each merged record retains a `files_id` field.  
   This field stores a tuple of the original file identifiers from which the description was derived.


In [35]:
# --- File path
# Raw EPG spreadsheet; the path is relative, so it is resolved against the
# notebook's current working directory.
RAW_DATA_PATH = Path("Ressources/20231101_raw.xlsx")

# --- Load raw data
# Read the workbook into a DataFrame and preview its first rows.
raw_df = pd.read_excel(RAW_DATA_PATH)
raw_df.head()

Unnamed: 0,titles,sub_title,days,channel,category,desc,length,start_hour,start_mins,stop_hour,stop_mins,clean_titles
0,Faster than fear,Série TV\nSérie policière\nRéalisateur :\nFlor...,20231101,13eme RUE,Série TV,Ralf a pu prouver son innocence et Sunny a été...,50,1,30,2,20,Faster than fear
1,Commissaire Magellan (S1-E30),Série TV\nSérie policière\nDurée : 1h40min\nRé...,20231101,13eme RUE,Série TV,L'oeuvre du talentueux photographe Tristan Gar...,105,2,20,4,5,Commissaire Magellan
2,Einstein : équations criminelles (S3-E1),Série TV\nSérie policière\nDurée : 42min\nRéal...,20231101,13eme RUE,Série TV,Un châtelain féru de chasse et avec la gâchett...,45,4,5,4,50,Einstein : équations criminelles
3,La mort du Père Noël,Cinéma\nCourt métrage\nDurée : 15min\nRéalisat...,20231101,13eme RUE,Cinéma,Le Père Noël est mort. Qui l'a tué ?,10,4,50,5,0,La mort du Père Noël
4,La belle affaire,Cinéma\nCourt métrage\nDurée : 25min\nRéalisat...,20231101,13eme RUE,Cinéma,"A la frontière suisse, une détective est charg...",25,5,0,5,25,La belle affaire


In [None]:
# Reload the module so edits to utils.epg_cleaner are picked up without restarting
# the kernel, then re-import the class from the freshly reloaded module.
importlib.reload(epg_cleaner)
from utils.epg_cleaner import EPGCleaner

# Initialize the cleaner
# Note: Empty descriptions are removed either when explicitly requested
#       (remove_empty_desc=True) or when duplicate descriptions are merged
#       (merge_duplicated_desc=True), so that merged 'files_id' tuples only
#       contain IDs corresponding to valid descriptions.
#       Neither option is passed below, so EPGCleaner's defaults apply.
#       NOTE(review): presumably both default to True -- confirm in utils.epg_cleaner.

cleaner = EPGCleaner(verbose=True)

# Run the cleaning on raw_df and preview the result.
cleaned_df = cleaner.clean(raw_data=raw_df)
cleaned_df.head()

[Data Cleaning] Total rows: 9731
[Data Cleaning] Rows with missing descriptions: 3657 (37.58%)
[Data Cleaning] Removed rows with missing descriptions
[Data Cleaning] Remaining rows : 6074
[Data Cleaning] Aggregated duplicate descriptions
[Data Cleaning] Duplicates removed : 2626 (43.23%)
[Data Cleaning] Final dataset size: 3448 rows (64.57% reduction from original)
[Data Cleaning] Run in 0.05 seconds


Unnamed: 0,desc,files_id,titles,sub_title,days,channel,category,length,start_hour,start_mins,stop_hour,stop_mins,clean_titles
0,Mon ex choisit mon next redonne une chance...,"(699,)",Mon Ex choisit mon Next,Autre\nTéléréalité\nSortie : 2022\nPays : Etat...,20231101,BET,Autre,70,22,55,0,5,Mon Ex choisit mon Next
1,"Tombe la neige , Vous permettez monsieur ...","(5841,)","Stoemp, pèkèt... et des rawettes !",Culture Infos\nMagazine des loisirs,20231101,La Une,Culture Infos,30,10,30,11,0,"Stoemp, pèkèt... et des rawettes !"
2,Bienvenue chez vous . Cricket Green et sa fam...,"(2598,)",Les Green à Big City (S1-E7),Série TV\nSérie d'animation\nDurée : 20min\nRé...,20231101,Disney Channel Wallonia,Série TV,22,7,56,8,18,Les Green à Big City
3,Billy et Bam Bam est une série dynamique et ...,"(787, 813, 857, 900, 950)",Billy Bam Bam,Autre\nEmission jeunesse,20231101,Baby TV,Autre,6,6,48,6,54,Billy Bam Bam
4,"Bleu, Blanc, Bouge met en lumière une innova...","(4966,)","Bleu, Blanc, Bouge",Culture Infos\nMagazine de services\nSortie : ...,20231101,France 5,Culture Infos,5,21,0,21,5,"Bleu, Blanc, Bouge"


## Step 2:  Analysis of Named Entity Recognition (CasEN, spaCy, Stanza)

- The goal of this step is to compare and analyze the outputs of different NER systems on the same dataset.
- Each NER system produces its own annotations:
    - **CasEN**: rule-based French NER system, generally precise for domain-specific entities.
    - **spaCy**: statistical model, fast and flexible; may detect entities missed by rule-based systems.
    - **Stanza**: neural network model; often captures context-sensitive entities, including ambiguous cases.
- The outputs of this step will serve as the **foundation for the consensus process**, which will determine the final labels assigned to each entity.

In [37]:
# Pick up any edits made to utils.casen_config since the kernel started, then
# re-import the class from the freshly reloaded module.
importlib.reload(casen_config)
from utils.casen_config import CasEN

# Configure the CasEN runner.
c = CasEN(
    # False: assume the CasEN results already exist.
    generate_new_corpus=False,
    # One of "single", "mutiple", "collection"
    # (NOTE(review): "mutiple" is spelled as the author listed it -- verify against CasEN).
    corpus_mode="single",
    # When True, compute only the necessary columns.
    lightmode=False,
    include_tags_name=False,
    define_label_with_grf=False,
    # None keeps everything; pass an int to set a limit.
    grf_limit=None,
    remove_undefined_labels=False,
    archiving_result=False,
    auto_analyse=True,
    order_dataframe=True,
    timer=True,
    logging=False,
    verbose=True,
)

# Run on the cleaned frame and preview the resulting DataFrame.
c_df = c.run(cleaned_df)
# Optional export: c_df.to_excel("CasEN_collection.xlsx", index=False)
c_df.head()

[get_config] Config Loaded sucessfuly !
[get_files] Founds 1 .txt files.
[analyse_files] run in : 0.86s
[order] run in : 0.00s
#################################### CASEN DATAFRAME ####################################
DataFrame size : 1.38 Mo (11558225 bits), shape: (15754, 13)
Total NE founds : 15754
Unique NE founds (by NE + files_id): 14366
 ---- Labels founds ---- :
PER        : 6159 (39.09%)
Undefined  : 4816 (30.57%)
ORG        : 2669 (16.94%)
LOC        : 2110 (13.39%)
---- Graphs frequency ----:
grfpersGenerique     : 3525 (17.28%)
grfroleName          : 2419 (11.86%)
grftagNomFamille     : 2112 (10.36%)
grfpersPrenomNom     : 1789 (8.77%)
grftagPrenom         : 1727 (8.47%)
grfgeog              : 986 (4.83%)
grforgProximite      : 766 (3.76%)
grfamountPrepDuree   : 590 (2.89%)
grfplaceBatiment     : 543 (2.66%)
grftime              : 353 (1.73%)
grfgeogPhysique      : 349 (1.71%)
grfpersContextePersonne : 275 (1.35%)
grforgGenerique      : 260 (1.27%)
grftimeAdverbeTempsHeure :

Unnamed: 0,NE,label,files_id,pos,method,desc,grf_1,grf_2,grf_3,grf_4,grf_5,grf_6,grf_7
0,Nora,PER,"(0,)","(206, 210)",casEN,"leurs, elle est persuadée que Nora a divulgué ...",grfpersGenerique,,,,,,
1,Marcel,PER,"(0,)","(353, 359)",casEN,oue toujours pas où se trouve Marcel.,grfpersGenerique,,,,,,
2,enquêteurs,Undefined,"(1,)","(420, 430)",casEN,"tentant de le débusquer, les enquêteurs décou...",grfroleName,,,,,,
3,directrice,Undefined,"(1,)","(108, 118)",casEN,la galerie Delandin. Mais la directrice et fo...,grfroleName,,,,,,
4,Selma Berrayah,PER,"(1,)","(217, 231)",casEN,ire Magellan et le lieutenant Selma Berrayah s...,grfpersPrenomNom,grftagPrenom,grftagNomFamille,,,,


In [38]:
# Pick up any edits made to utils.spacy_config since the kernel started, then
# re-import the class from the freshly reloaded module.
importlib.reload(spacy_config)
from utils.spacy_config import SpaCyConfig

# Configure the spaCy runner.
spc = SpaCyConfig(
    production_mode=False,
    explode_ids=False,
    order_dataframe=True,
    auto_analyse=True,
    timer=True,
    logging=False,
    verbose=True,
)

# Run on the cleaned frame and preview the resulting DataFrame.
sp_df = spc.run(cleaned_df)
# Optional export: sp_df.to_excel("spacy.xlsx", index=False)
sp_df.head()

[load config] Config Loaded sucessfuly !
[spaCy] spaCy version: 3.8.7
[spaCy] spaCy model: core_news_sm
order in : 0.01s
#################################### CASEN DATAFRAME ####################################
DataFrame size : 0.91 Mo (7664541 bits), shape: (14147, 7)
Total NE founds : 14147
Unique NE founds (by NE + files_id): 12658
 ---- Labels founds ---- :
PER        : 6577 (46.49%)
LOC        : 4102 (29.00%)
MISC       : 2684 (18.97%)
ORG        : 784 (5.54%)
#########################################################################################
self_analyse in : 0.02s
SpaCy DataFrame shape: (14147, 7)
run in : 49.15s


Unnamed: 0,titles,NE,label,desc,method,files_id,pos
0,Faster than fear,Marcel,PER,oue toujours pas où se trouve Marcel.,spaCy,"(0,)","(353, 359)"
1,Faster than fear,Sunny,LOC,i-ci demande à ne parler qu'à Sunny. D'ailleur...,spaCy,"(0,)","(164, 169)"
2,Faster than fear,Haffner,PER,"us rien à voir avec l'affaire Haffner, mais ce...",spaCy,"(0,)","(116, 123)"
3,Faster than fear,Haffner,PER,"er à... elle. En garde à vue, Haffner n'avoue ...",spaCy,"(0,)","(311, 318)"
4,Faster than fear,Nora,LOC,"leurs, elle est persuadée que Nora a divulgué ...",spaCy,"(0,)","(206, 210)"


In [39]:
# Pick up any edits made to utils.stanza_config since the kernel started, then
# re-import the class from the freshly reloaded module.
importlib.reload(stanza_config)
from utils.stanza_config import StanzaConfig

# Configure the Stanza runner.
stz = StanzaConfig(
    # Request GPU execution.
    # NOTE(review): behaviour without a CUDA device depends on StanzaConfig -- confirm.
    use_gpu=True,
    production_mode=False,
    order_dataframe=True,
    timer=True,
    logging=False,
    verbose=True,
)

# Run on the cleaned frame and preview the resulting DataFrame.
stz_df = stz.run(cleaned_df)
stz_df.head()

2025-11-27 16:45:42 INFO: Loading these models for language: fr (French):
| Processor | Package            |
----------------------------------
| tokenize  | combined           |
| mwt       | combined           |
| ner       | wikinergold_charlm |

2025-11-27 16:45:42 INFO: Using device: cuda
2025-11-27 16:45:42 INFO: Loading: tokenize
2025-11-27 16:45:42 INFO: Loading: mwt
2025-11-27 16:45:42 INFO: Loading: ner


[load config] Config Loaded sucessfuly !


2025-11-27 16:45:46 INFO: Done loading processors!


[stanza] Stanza version: 1.10.1
[stanza] Pipeline lang: fr
order in : 0.01s
Stanza DataFrame shape: (13709, 7)
run in : 70.07s


Unnamed: 0,titles,NE,label,desc,method,files_id,pos
0,Faster than fear,Marcel,PER,oue toujours pas où se trouve Marcel.,stanza,"(0,)","(353, 359)"
1,Faster than fear,Haffner,PER,"er à... elle. En garde à vue, Haffner n'avoue ...",stanza,"(0,)","(311, 318)"
2,Faster than fear,Nora,PER,"leurs, elle est persuadée que Nora a divulgué ...",stanza,"(0,)","(206, 210)"
3,Faster than fear,Sunny,MISC,a pu prouver son innocence et Sunny a été susp...,stanza,"(0,)","(35, 40)"
4,Faster than fear,Ralf,PER,Ralf a pu prouver son innocence et,stanza,"(0,)","(0, 4)"


In [49]:
# Pick up any edits made to utils.ner_config2 since the kernel started, then
# re-import the class from the freshly reloaded module.
importlib.reload(ner_config2)
from utils.ner_config2 import NerConfig

# Configure the step that merges the three systems' outputs.
ner = NerConfig(
    process_priority_merge=True,
    process_casen_opti=True,
    remove_duplicated_entity_per_desc=True,
    verbose=True,
)

# Inputs are passed in the order CasEN, spaCy, Stanza.
ner_df = ner.run(cleaned_df, [c_df, sp_df, stz_df])
# Optional export: ner_df.to_excel("priority_all_casen_opti.xlsx", index=False)
ner_df.head(10)

[load config] Config Loaded sucessfuly !
[_merge] Shape of every DataFrame : [(15754, 13), (14147, 7), (13709, 7)]
[keep precise graphs] CasENOpti : 21 lines updated


Unnamed: 0,NE,label,files_id,pos,grf_1,grf_2,grf_3,grf_4,grf_5,grf_6,grf_7,method,titles,desc
0,Nora,PER,"(0,)","(206, 210)",grfpersGenerique,,,,,,,casEN_stanza_priority,Faster than fear,"leurs, elle est persuadée que Nora a divulgué ..."
1,affaire Haffner,MISC,"(0,)","(108, 123)",,,,,,,,stanza,Faster than fear,"e n'a plus rien à voir avec l'affaire Haffner,..."
2,Haffner,PER,"(0,)","(116, 123)",,,,,,,,spaCy,Faster than fear,"us rien à voir avec l'affaire Haffner, mais ce..."
3,Marcel,PER,"(0,)","(353, 359)",grfpersGenerique,,,,,,,casEN_spaCy_stanza,Faster than fear,oue toujours pas où se trouve Marcel.
4,Ralf,PER,"(0,)","(0, 4)",,,,,,,,stanza,Faster than fear,Ralf a pu prouver son innocence et
5,Haffner,PER,"(0,)","(311, 318)",,,,,,,,spaCy_stanza,Faster than fear,"er à... elle. En garde à vue, Haffner n'avoue ..."
6,Sunny,PER,"(0,)","(35, 40)",,,,,,,,spaCy,Faster than fear,a pu prouver son innocence et Sunny a été susp...
7,Sunny,MISC,"(0,)","(35, 40)",,,,,,,,stanza,Faster than fear,a pu prouver son innocence et Sunny a été susp...
8,Sunny,LOC,"(0,)","(164, 169)",,,,,,,,spaCy_stanza,Faster than fear,i-ci demande à ne parler qu'à Sunny. D'ailleur...
9,Nora,LOC,"(0,)","(206, 210)",,,,,,,,spaCy,Faster than fear,"leurs, elle est persuadée que Nora a divulgué ..."
