In [16]:
import importlib
import pandas as pd

In [17]:
# Load the raw input data (Excel file) into a DataFrame.
# The path is relative rather than an absolute path inside one user's home
# directory, so the notebook can be run on another machine.
# NOTE(review): assumes the kernel's working directory is the project's `src`
# folder (the one containing `Ressources/`) — confirm, or edit RAW_DATA_PATH.
RAW_DATA_PATH = "Ressources/20231101_raw.xlsx"
DATAS = pd.read_excel(RAW_DATA_PATH)

# CasEN Configuration

| Parameter         | Type             | Description                                                                                                 |
|------------------|-------------------|-------------------------------------------------------------------------------------------------------------|
| `run_casen`           | `bool`| If `True`, executes CasEN. If `False`, assumes data already exists in `corpus_folder` and `result_folder`.  |        
| `single_corpus`  | `bool`            | If `True`, produces a single corpus file; otherwise, one per description in the `data`.                           |
| `production_mode`  | `bool`            | If `True`, keep only the needed columns (use less memory).                           |                                                             
| `remove_misc`    | `bool`            | If `True`, removes all MISC tags from the output.                                                           |
| `logging`        | `bool`            | Enables logging of key function execution times to a log file.                                              |
| `timer`          | `bool`            | Displays execution time in the console during runtime.                                                      |
| `archiving_result`          | `bool`            | Store the current files in the CasEN result folder to the Archiving folder before running CasEN.                                                      |
| `verbose`        | `bool`            | Enables detailed debug output in the console.                                                               |

In [27]:
# Reload the module so that edits to tools/casen_config.py are picked up
# without restarting the kernel, then bind the class from the reloaded module.
from tools import casen_config
casen_config = importlib.reload(casen_config)
CasenConfig = casen_config.CasenConfig

# ========================= CASEN EXAMPLE =====================
# Author's note on production_mode: True -> 5432714 bytes, False -> 8422013 bytes.
c = CasenConfig(
    run_casen=True,
    single_corpus=True,
    production_mode=True,
    remove_misc=True,
    logging=False,
    timer=False,
    archiving_result=False,
    verbose=False,
)

# Run the configuration on the loaded data, save the result to Excel,
# and preview the first rows.
c_df = c.run(DATAS)
c_df.to_excel("casen_generique_at_end.xlsx", index=False)
c_df.head()

{'time', 'date', 'name', 'demonym', 'vieuxSigle', 'geogName', 'productName', 'placeName', 'nationality', 'timePeriod', 'adress', 'orgName', 'ref', 'geogFeat', 'roleName', 'org', 'extent', 'product', 'persName', 'datePeriod', 'place', 'measure', 'event', 'gYear'}
corpus.txt
C:\Users\valen\Documents\Informatique-L3\Stage_NER\NER\src\Results\Corpus		 -> results in : C:\Users\valen\Documents\Informatique-L3\Stage_NER\NER\src\Results\CasEN\Res_CasEN
1 files to process with CasEN in  C:\Users\valen\Documents\Informatique-L3\Stage_NER\NER\src\Results\Corpus



Unnamed: 0,NER,NER_label,method,main_graph,second_graph,third_graph,file_id,entity_start,entity_end
0,Nora,PER,casEN,grfpersGenerique,,,0,206,210
1,Marcel,PER,casEN,grfpersGenerique,,,0,353,359
2,Tristan Garil,PER,casEN,grfpersPrenomNom,grftagPrenom,grftagNomFamille,1,35,48
3,galerie,ORG,casEN,grforgProximite,,,1,82,89
4,Delandin,PER,casEN,grfpersGenerique,,,1,90,98


# SpaCy Configuration

| Parameter         | Type             | Description                                                                                                 |
|------------------|-------------------|-------------------------------------------------------------------------------------------------------------|
| `model`        | `str`            | Name of the spaCy pipeline to load, e.g. `fr_core_news_sm`, `fr_core_news_md` or `fr_core_news_lg` (the model must be downloaded beforehand).                                              |
| `production_mode`  | `bool`            | If `True`, keep only the needed columns (use less memory).                           |      
| `logging`        | `bool`            | Enables logging of key function execution times to a log file.                                              |
| `timer`          | `bool`            | Displays execution time in the console during runtime.                                                      |
| `verbose`        | `bool`            | Enables detailed debug output in the console.                                                               |

In [20]:
# Reload the module so that edits to tools/spacy_wrapper.py are picked up
# without restarting the kernel, then bind the class from the reloaded module.
from tools import spacy_wrapper
spacy_wrapper = importlib.reload(spacy_wrapper)
SpaCyConfig = spacy_wrapper.SpaCyConfig

sp = SpaCyConfig(
    model="fr_core_news_sm",
    production_mode=True,
    timer=False,
    logging=False,
    verbose=False,
)

# The run itself is left disabled; uncomment to process DATAS with spaCy.
# sp_df = sp.run(DATAS)
# sp_df.head()

# Stanza Configuration

| Parameter         | Type             | Description                                                                                                 |
|------------------|-------------------|-------------------------------------------------------------------------------------------------------------|
| `use_gpu`        | `bool`            | Run Stanza on the GPU to make it faster (requires installing additional dependencies beforehand).                                             |
| `production_mode`  | `bool`            | If `True`, keep only the needed columns (use less memory).                           |      
| `logging`        | `bool`            | Enables logging of key function execution times to a log file.                                              |
| `timer`          | `bool`            | Displays execution time in the console during runtime.                                                      |
| `verbose`        | `bool`            | Enables detailed debug output in the console.                                                               |

In [None]:
# Reload the module so that edits to tools/stanza_wrapper.py are picked up
# without restarting the kernel, then bind the class from the reloaded module.
from tools import stanza_wrapper
stanza_wrapper = importlib.reload(stanza_wrapper)
StanzaConfig = stanza_wrapper.StanzaConfig

st = StanzaConfig(
    use_gpu=True,
    production_mode=True,
    timer=False,
    logging=False,
    verbose=False,
)

# The run itself is left disabled; uncomment to process DATAS with Stanza.
# st_df = st.run(DATAS)
# st_df.head()

In [None]:
# We can also load our DataFrames from previously saved Excel files
# (presumably the CasEN, spaCy and Stanza results, judging by the file names).
c_df, sp_df, st_df = (
    pd.read_excel(saved_result)
    for saved_result in (
        "Results/short_casen.xlsx",
        "Results/short_spacy.xlsx",
        "Results/short_stanza.xlsx",
    )
)

# NER Configuration

| Parameter         | Type             | Description                                                                                                 |
|------------------|-------------------|-------------------------------------------------------------------------------------------------------------|
| `process_priority_merge`  | `bool` | If systems agree on the entity but not on the label, we prioritize the most frequent or reliable label among agreeing systems                                         |
| `process_casen_opti`  | `bool`            | Keep entities found only by CasEN when they come from graphs judged to be precise.                           |   
| `remove_duplicated_entity_per_desc`  | `bool`            | Remove duplicated entities within the same description.                     |
| `keep_only_trustable_methods`  | `bool`            | Keep only the entities found by trusted methods (removes all potentially wrong entities).                           |
| `save_to_file`  | `bool`            | Save the result to  a `xlsx` or `csv` file                           |
| `production_mode`  | `bool`            | If `True`, keep only the needed columns (use less memory).                           |   
| `logging`        | `bool`            | Enables logging of key function execution times to a log file.                                              |
| `timer`          | `bool`            | Displays execution time in the console during runtime.                                                      |
| `verbose`        | `bool`            | Enables detailed debug output in the console.                                                               |

In [30]:
# Reload the module so that edits to tools/ner_config.py are picked up
# without restarting the kernel, then bind the class from the reloaded module.
from tools import ner_config
ner_config = importlib.reload(ner_config)
NerConfig = ner_config.NerConfig

ner = NerConfig(
    process_priority_merge=True,
    process_casen_opti=True,
    remove_duplicated_entity_per_desc=True,
    keep_only_trustable_methods=True,
    save_to_file=True,
    production_mode=True,
    logging=False,
    timer=False,
    verbose=False,
)

# Run the NER configuration on the raw data together with the three
# per-system result DataFrames and the correction file, then preview the result.
ner_df = ner.run(
    data=DATAS,
    dfs=[c_df, sp_df, st_df],
    correction="./Results/NER_casEN_spaCy_stanza_PER_entity_track.xlsx",
)
ner_df.head()

File saved at : Results\20231101_priority_CasenOpti_TrustMethods_prod(2).xlsx


Unnamed: 0,manual cat,correct,extent,NER_category,titles,sub_title,days,channel,category,NER,NER_label,clean_titles,method,file_id
0,PER,1.0,1.0,1,Faster than fear,S√©rie TV\nS√©rie polici√®re\nR√©alisateur :\nFlor...,20231101,13eme RUE,S√©rie TV,Nora,PER,Faster than fear,casEN_stanza_priority,0.0
1,PER,1.0,1.0,1,Faster than fear,S√©rie TV\nS√©rie polici√®re\nR√©alisateur :\nFlor...,20231101,13eme RUE,S√©rie TV,Marcel,PER,Faster than fear,casEN_spaCy_stanza,0.0
2,PER,1.0,1.0,1,Commissaire Magellan (S1-E30),S√©rie TV\nS√©rie polici√®re\nDur√©e : 1h40min\nR√©...,20231101,13eme RUE,S√©rie TV,Tristan Garil,PER,Commissaire Magellan,casEN_stanza,1.0
3,PER,1.0,1.0,1,Commissaire Magellan (S1-E30),S√©rie TV\nS√©rie polici√®re\nDur√©e : 1h40min\nR√©...,20231101,13eme RUE,S√©rie TV,Delandin,PER,Commissaire Magellan,casEN_spaCy_stanza,1.0
4,PER,1.0,1.0,1,Commissaire Magellan (S1-E30),S√©rie TV\nS√©rie polici√®re\nDur√©e : 1h40min\nR√©...,20231101,13eme RUE,S√©rie TV,Estelle Delandin,PER,Commissaire Magellan,casEN_spaCy_stanza,1.0


### 🧪 Example: Using `NER_Consensus`

---

#### 📦 Import

```python
from tools.ner_consensus import NER_Consensus
ner_df = NER_Consensus(your_dataframe)
ner_df.head() # Show the output DataFrame
```

#### 🔧 Internal Processing
- Merges results from all NER systems.
- Applies priority rules between detected entities.
- Uses casEN_opti configuration for optimized merging.
- Removes duplicated entities per description.

In [23]:
# Reload the module so that edits to tools/ner_consensus.py are picked up
# without restarting the kernel, then bind NER_Consensus from the reloaded module.
from tools import ner_consensus
ner_consensus = importlib.reload(ner_consensus)
NER_Consensus = ner_consensus.NER_Consensus

# Run the consensus entry point on the raw data, save the result to Excel,
# and preview the first rows.
result_df = NER_Consensus(DATAS)
result_df.to_excel("NER_Consensus.xlsx", index=False)
result_df.head()

2025-06-20 09:57:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 26.0MB/s]                    
2025-06-20 09:57:24 INFO: Downloaded file to C:\Users\valen\stanza_resources\resources.json
2025-06-20 09:57:25 INFO: Loading these models for language: fr (French):
| Processor | Package            |
----------------------------------
| tokenize  | combined           |
| mwt       | combined           |
| ner       | wikinergold_charlm |

2025-06-20 09:57:25 INFO: Using device: cpu
2025-06-20 09:57:25 INFO: Loading: tokenize
2025-06-20 09:57:25 INFO: Loading: mwt
2025-06-20 09:57:25 INFO: Loading: ner
2025-06-20 09:57:28 INFO: Done loading processors!


{'time', 'date', 'name', 'demonym', 'vieuxSigle', 'geogName', 'productName', 'placeName', 'nationality', 'timePeriod', 'adress', 'orgName', 'ref', 'geogFeat', 'roleName', 'org', 'extent', 'product', 'persName', 'datePeriod', 'place', 'measure', 'event', 'gYear'}
corpus.txt
C:\Users\valen\Documents\Informatique-L3\Stage_NER\NER\src\Results\Corpus		 -> results in : C:\Users\valen\Documents\Informatique-L3\Stage_NER\NER\src\Results\CasEN\Res_CasEN
1 files to process with CasEN in  C:\Users\valen\Documents\Informatique-L3\Stage_NER\NER\src\Results\Corpus



Unnamed: 0,titles,sub_title,days,channel,category,NER,NER_label,clean_titles,method,file_id
7,Faster than fear,S√©rie TV\nS√©rie polici√®re\nR√©alisateur :\nFlor...,20231101,13eme RUE,S√©rie TV,Nora,PER,Faster than fear,casEN_stanza_priority,0.0
9,Faster than fear,S√©rie TV\nS√©rie polici√®re\nR√©alisateur :\nFlor...,20231101,13eme RUE,S√©rie TV,Marcel,PER,Faster than fear,casEN_spaCy_stanza,0.0
11,Commissaire Magellan (S1-E30),S√©rie TV\nS√©rie polici√®re\nDur√©e : 1h40min\nR√©...,20231101,13eme RUE,S√©rie TV,Tristan Garil,PER,Commissaire Magellan,casEN_stanza,1.0
14,Commissaire Magellan (S1-E30),S√©rie TV\nS√©rie polici√®re\nDur√©e : 1h40min\nR√©...,20231101,13eme RUE,S√©rie TV,Delandin,PER,Commissaire Magellan,spaCy_stanza_priority,1.0
15,Commissaire Magellan (S1-E30),S√©rie TV\nS√©rie polici√®re\nDur√©e : 1h40min\nR√©...,20231101,13eme RUE,S√©rie TV,Estelle Delandin,PER,Commissaire Magellan,casEN_spaCy_stanza,1.0


# 🧠 Named Entity Recognition (NER)

This project focuses on optimizing a natural language processing (NLP) pipeline to detect and classify named entities in **French texts**, across the following categories:

* `PER` – Person
* `LOC` – Location
* `ORG` – Organization
* `MISC` – Miscellaneous

We leverage **multiple NER tools** to maximize accuracy:

* **CasEN**: A linguistic rule-based system based on **Unitex**, developed by linguists.
* **spaCy**: A fast and efficient NLP library.
* **Stanza**: A deep learning-based NLP library from Stanford, well-suited for morphologically rich languages.

---

### 📁 Single vs. Multiple Corpus Processing

We implemented an option that lets you choose whether to generate **one file per description** or a **single file for all descriptions combined**.

To preserve the traceability of each description's origin, we wrap them with custom tags in the merged file:

```xml
<doc id="X">
    [description content]
</doc>
```

This allows the system to:

- ✅ Significantly reduce execution time (more than 2× faster in our tests)

- ✅ Better exploit generic graph-based rules, which can tag all similar entities once one is found

📊 Entity Detection Results

| Mode                     | Total Entities Found | Gain    |
| ------------------------ | -------------------- | ------- |
| One file per description | 9,446                | —       |
| One file for all         | 13,233               | +40.09% |


---

## 🚀 CasEN Optimization (method : casENOpti)

We then evaluated the **precision** and **entity yield** of each graph individually.

This analysis helped us identify certain graphs—or combinations of graphs—that provided the most benefit. We leveraged this insight to **prioritize and retain their extracted entities**, even if they were not detected by other systems.

### 🔍 Example of a Graph Sequence

| Step            | Graph Name               |
|------------------|--------------------------|
| main_graph      | `grfpersCivilitePersonne` |
| second_graph  | `grftagCiviliteS`         |
| third_graph   | `grftagNomFamille`        |

These optimized sequences allow us to improve both recall and consistency across descriptions by capturing entities that would otherwise be missed.


---
## 🔄 Multi-Model Entity Detection & Cross-Validation

Each text description is first processed individually by all three systems (**CasEN**, **spaCy**, and **Stanza**).
Then, we apply a **cross-validation strategy** during result fusion:

### Cross-System Agreement

* If multiple systems detect the **same entity**, we merge their outputs and label them accordingly.
* Example: If both **CasEN** and **Stanza** detect "Nora" as a `PER`, the merged method becomes `casEN_stanza`.

###  Conflict Resolution with Priority Rules

When an entity is detected by **multiple systems with different labels**, we apply **priority rules**:

* Entities found by **more systems** are considered more reliable.
* If systems agree on the **entity** but not on the **label**, we prioritize the **most frequent or reliable label** among agreeing systems.

⚠️ **Important:** Currently, this system works only for **PER** entities.  
After a brief analysis, this configuration appears to yield the highest number of entities with minimal loss in precision.


#### Example

![Excel Result Preview](src/images/image.png)

As shown above:

* Both **CasEN** and **Stanza** classify **“Nora”** as a **Person (`PER`)**.
* **spaCy**, however, classifies it as a **Location (`LOC`)**.

As a result, the merged method becomes: `casEN_stanza_priority`


This indicates that CasEN and Stanza agreed on both the entity and the label, and their interpretation takes precedence over spaCy’s.

---
## 📊 Named Entity Recognition (NER) – Evaluation Results

This section presents the evolution of NER performance across different configurations using **CasEN**, **SpaCy**, **Stanza**, and optimized graph sequences.



###  Initial Evaluation (CasEN ∩ SpaCy)

Entities detected using the intersection of CasEN and SpaCy systems at the beginning of the pipeline.

| Category | Total Entities | Accuracy |
|----------|----------------|----------|
| NE       | 4,085          | 97.67%   |
| PER      | 2,744          | 98.69%   |
| LOC      | 1,212          | 98.68%   |
| ORG      | 129            | 66.67%   |
| MISC     | 0              | 0.00%    |



### 📁 CasEN on Single Corpus File (CasEN ∩ SpaCy)

Performance after switching to a **single concatenated file** approach for CasEN.

| Category | Total Entities | Accuracy | Entity Gain | Accuracy Loss |
|----------|----------------|----------|--------------|----------------|
| NE       | 5,327          | ✅ 97.61%   | 🔼 +30.40%     | 🔽 -0.06%         |
| PER      | 4,236          | ✅ 98.31%   | 🔼 +54.37%     | 🔽 -0.37%         |
| LOC      | 952            | ✅ 98.83%   | 🔽 -21.45%     | 🔼 +0.15%         |
| ORG      | 139            | ⚠️ 66.92%   | 🔼 +7.75%      | 🔼 +0.26%         |
| MISC     | 0              | ❌ 0.00%    | ➖ 0.00%       | ➖ 0.00%          |



### 🚀 CasEN + Optimized Graphs

Results using **CasEN with graph optimization** strategies.

| Category | Total Entities | Accuracy | Entity Gain | Accuracy Loss |
|----------|----------------|----------|--------------|----------------|
| NE       | 6,010          | ✅ 97.14%   | 🔼 +12.82%     | 🔽 -0.47%         |
| PER      | 4,491          | ✅ 98.00%   | 🔼 +6.02%      | 🔽 -0.31%         |
| LOC      | 1,294          | ✅ 97.78%   | 🔼 +35.92%     | 🔽 -1.05%         |
| ORG      | 225            | ⚠️ 75.12%   | 🔼 +61.87%     | 🔼 +8.20%         |
| MISC     | 0              | ❌ 0.00%    | ➖ 0.00%       | ➖ 0.00%          |


### Full System: CasEN + SpaCy + Stanza + Optimization & Priority Rules

Final performance combining **all systems** with **graph priority strategies** and **CasEN optimizations**.

| Category | Total Entities | Accuracy | Entity Gain | Accuracy Loss |
|----------|----------------|----------|--------------|----------------|
| NE       | 7,086          | ✅ 97.08%   | 🔼 +17.90%     | 🔽 -0.06%         |
| PER      | 5,592          | ✅ 97.37%   | 🔼 +24.52%     | 🔽 -0.63%         |
| LOC      | 1,267          | ✅ 98.30%   | 🔽 -2.09%      | 🔼 +0.52%         |
| ORG      | 227            | ⚠️ 82.84%   | 🔼 +0.89%      | 🔼 +7.72%         |
| MISC     | 0              | ❌ 0.00%    | ➖ 0.00%       | ➖ 0.00%          |



#### ✅ Summary


| Category | Total Entities | Accuracy | Entity Gain | Accuracy Loss |
|----------|----------------|----------|--------------|----------------|
| NE       | 7,086          | ✅ 97.08%   | 🔼 +73.46%     | 🔽 -0.60%         |
| PER      | 5,592          | ✅ 97.37%   | 🔼 +103.79%     | 🔽 -1.31%        |
| LOC      | 1,267          | ✅ 98.30%   | 🔼 +4.54%      | 🔽 -0.38%         |
| ORG      | 227            | ⚠️ 82.84%   | 🔼 +75.97%      | 🔼 +16.18%         |
| MISC     | 0              | ❌ 0.00%    | ➖ 0.00%       | ➖ 0.00%          |

---
## 🔄 Suggestions for Further Work / Improvements

- ✅ After two months, several updates have been made to CasEN. It would be beneficial to reanalyze the graphs (as some have changed!) in order to update the `CasENOpti` configuration.

- ✅ Additionally, further analysis could be performed by modifying the order in which the graphs are applied, particularly for the `Generique` graphs.

- ✅ It could also be very interesting to replace the single text file generated for CasEN with several ‘collection’ type files, grouping EPGs from the same collection together. This would probably give more coherent results when generic graphs are used.

- The `priority` system could also be further improved and extended.  
  Currently, it identifies all composite methods (e.g., `CasEN_Stanza`) and atomic methods (e.g., `CasEN`, `Stanza`) separately.  
  When both a composite and an atomic method detect the same entity but assign different categories, the system applies a priority rule in favor of the composite method.  
  (It might also be worth exploring comparisons between atomic methods themselves to refine the decision-making process.)

⚠️ **Important:** All tests and analyses were carried out on a single day's data set. On much larger data sets, certain functions may no longer work and certain optimisations may no longer hold.


## 📅 Installation

### 1. Clone the repository

```bash
git clone https://github.com/Valentin-Gauthier/NER.git
cd NER
```

### 2. Install dependencies

```bash
pip install -r requirements.txt
```

---

## ✍️ Author

Valentin — Bachelor’s degree, 3rd year, Computer Science
Internship at LIFAT - 2025
